3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 using System.Collections;
12 namespace System.Text.RegularExpressions {
// Instruction opcodes for the compiled regex program. The underlying
// type is ushort; False is pinned to 0 and the remaining members take
// consecutive values in declaration order, so reordering members changes
// their numeric values.
// NOTE(review): OpFlags members start at 0x100, so an opcode presumably
// shares one ushort word with its flags (opcode in the low byte) --
// confirm against the compiler and interpreter.
14 enum OpCode : ushort {
15 False = 0, // always fails
16 True, // always succeeds
20 Position, // zero-width position assertion
21 String, // match string literal
22 Reference, // back reference
26 Character, // match character exactly
27 Category, // match character from category
28 Range, // match character from range
29 Set, // match character from set
30 In, // match character from group of tests
36 Balance, // balance groups
37 BalanceStart, // track balance group length
41 IfDefined, // conditional on capture
42 Sub, // non-backtracking subexpression
43 Test, // non-backtracking lookahead/behind
44 Branch, // alternative expression
45 Jump, // unconditional goto
46 Repeat, // new repeat context
47 Until, // repeat subexpression within context
48 FastRepeat, // repeat simple subexpression
49 Anchor, // anchoring expression
53 Info // pattern information
// Per-instruction modifier bits. Each member is a distinct single bit in
// the range 0x100-0x800, so several flags can be OR-ed together.
// NOTE(review): presumably OR-ed with an OpCode into the same ushort
// instruction word -- confirm against the compiler and interpreter.
57 enum OpFlags : ushort {
59 Negate = 0x100, // succeed on mismatch
60 IgnoreCase = 0x200, // case insensitive matching
61 RightToLeft = 0x400, // right-to-left matching
62 Lazy = 0x800 // minimizing repeat
// Kinds of zero-width position assertion, used as the POS operand of the
// Position instruction. Each member's comment gives the regex syntax it
// corresponds to.
// NOTE(review): Start and StartOfString carry the same description (\A);
// confirm whether they are meant to differ.
65 enum Position : ushort {
67 Start, // start of string \A
68 StartOfString, // start of string \A
69 StartOfLine, // start of line ^
70 StartOfScan, // start of scan \G
71 End, // end or before newline at end \Z
72 EndOfString, // end of string \z
73 EndOfLine, // end of line $
74 Boundary, // word boundary \b
75 NonBoundary // not word boundary \B
78 // see category.cs for Category enum
// Scans `text` on behalf of `regex` and returns the resulting Match.
// NOTE(review): `start` and `end` presumably delimit the portion of
// `text` to scan -- confirm against the implementing classes.
81 Match Scan (Regex regex, string text, int start, int end);
// Factory that produces IMachine instances and exposes the data
// associated with them: a Mapping dictionary and the group count.
84 interface IMachineFactory {
85 IMachine NewInstance (); // returns an IMachine
86 IDictionary Mapping { get; set; } // NOTE(review): presumably the group name/number mapping -- confirm
87 int GroupCount { get; } // read-only
92 // Flags: [RightToLeft] ??
93 // SKIP: relative address of tail expression
94 // OFFSET: offset of anchor from start of pattern
105 // In practice, the anchoring expression is only going to be
106 // Position (StartOfString, StartOfLine, StartOfScan) or String.
107 // This is because the optimizer looks for position anchors at the
108 // start of the expression, and if that fails it looks for the
109 // longest substring. If an expression has neither a position
110 // anchor nor a longest substring anchor, then the anchoring expression
111 // is left empty. Since an empty expression will anchor at any
112 // position in any string, the entire input string will be scanned.
116 // Flags: [RightToLeft, IgnoreCase]
117 // LEN: length of string
118 // STR: string characters
122 // SKIP: relative address of next branch
136 // Repeat SKIP MIN MAX
139 // SKIP: relative address of Until instruction
140 // MIN: minimum iterations
141 // MAX: maximum iterations (0xffff is infinity)
148 // FastRepeat SKIP MIN MAX
151 // SKIP: relative address of tail expression
152 // MIN: minimum iterations
153 // MAX: maximum iterations (0xffff is infinity)
155 // FastRepeat :1 MIN MAX
162 // The subexpression of a FastRepeat construct must not contain any
163 // complex operators. These include: Open, Close, Balance, Repeat,
164 // FastRepeat, Sub, Test. In addition, the subexpression must have
165 // been determined to have a fixed width.
169 // SKIP: relative address of tail expression
177 // The Sub operator invokes an independent subexpression. This means
178 // that the subexpression will match only once and so will not
179 // participate in any backtracking.
183 // TSKIP: relative address of true expression
184 // FSKIP: relative address of false expression
186 // Usage: (?(?=test)true|false)
195 // Usage: (?(?=test)true)
213 // For negative lookaheads, just swap the values of TSKIP and
214 // FSKIP. For lookbehinds, the test expression must be compiled
215 // in reverse. The test expression is always executed as an
216 // independent subexpression, so its behaviour is non-backtracking
217 // (like a Sub clause.)
219 // IfDefined SKIP GID
221 // SKIP: relative address of else expression
222 // GID: number of group to check
224 // Usage: (?(gid)true)
230 // Usage: (?(gid)true|false)
240 // SKIP: relative address of target expression
248 // Flags: [Negate, IgnoreCase, RightToLeft]
249 // CHAR: exact character to match
253 // Flags: [Negate, RightToLeft]
254 // CAT: category to match (see Category enum)
258 // Flags: [Negate, IgnoreCase, RightToLeft]
259 // LO: lowest character in range
260 // HI: highest character in range
264 // Flags: [Negate, IgnoreCase, RightToLeft]
265 // LO: lowest character in set
266 // LEN: number of words in set
267 // SET: bit array representing characters in set
271 // Each word in the set represents 16 characters, so the first word
272 // defines membership for characters LO to LO + 15, the second for
273 // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
274 // up to the compiler to provide a compact representation for sparse
275 // Unicode sets. The simple way is to use Set 0 4096. Other methods
276 // involve partitioning the set and placing the components into an
281 // SKIP: relative address of tail expression
299 // The In instruction consumes a single character, using the flags
300 // of the first instruction in the subexpression to determine its
301 // IgnoreCase and RightToLeft properties. The subexpression is then
302 // applied to the single character as a disjunction. If any instruction
303 // in the subexpression succeeds, the entire In construct succeeds
304 // and matching continues with the tail.
308 // POS: position to match (see Position enum)
312 // GID: number of group to open
316 // GID: number of group to close
320 // GID: number of capturing group (0 if none)
321 // BAL: number of group to undefine
323 // Info GROUPS MIN MAX
325 // GROUPS: number of capturing groups
326 // MIN: minimum width of pattern
327 // MAX: maximum width of pattern (0xffff means undefined)
335 // Flags: [IgnoreCase, RightToLeft]
336 // GID: number of group to reference