3 // namespace: System.Text.RegularExpressions
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Collections;
\r
12 namespace System.Text.RegularExpressions {
\r
14 enum OpCode : ushort {
\r
15 False = 0, // always fails
\r
16 True, // always succeeds
\r
20 Position, // zero-width position assertion
\r
21 String, // match string literal
\r
22 Reference, // back reference
\r
24 // character matching
\r
26 Character, // match character exactly
\r
27 Category, // match character from category
\r
28 Range, // match character from range
\r
29 Set, // match character from set
\r
30 In, // match character from group of tests
\r
35 Close, // close group
\r
36 Balance, // balance groups
\r
40 IfDefined, // conditional on capture
\r
41 Sub, // non-backtracking subexpression
\r
42 Test, // non-backtracking lookahead/behind
\r
43 Branch, // alternative expression
\r
44 Jump, // unconditional goto
\r
45 Repeat, // new repeat context
\r
46 Until, // repeat subexpression within context
\r
47 FastRepeat, // repeat simple subexpression
\r
48 Anchor, // anchoring expression
\r
52 Info // pattern information
\r
56 enum OpFlags : ushort {
\r
58 Negate = 0x100, // succeed on mismatch
\r
59 IgnoreCase = 0x200, // case insensitive matching
\r
60 RightToLeft = 0x400, // right-to-left matching
\r
61 Lazy = 0x800 // minimizing repeat
\r
64 enum Position : ushort {
\r
66 Start, // start of string \A
\r
67 StartOfString, // start of string \A
\r
68 StartOfLine, // start of line ^
\r
69 StartOfScan, // start of scan \G
\r
70 End, // end or before newline at end \Z
\r
71 EndOfString, // end of string \z
\r
72 EndOfLine, // end of line $
\r
73 Boundary, // word boundary \b
\r
74 NonBoundary // not word boundary \B
\r
77 // see category.cs for Category enum
\r
79 interface IMachine {
\r
80 Match Scan (Regex regex, string text, int start, int end);
\r
83 interface IMachineFactory {
\r
84 IMachine NewInstance ();
\r
85 IDictionary Mapping { get; set; }
\r
86 int GroupCount { get; }
\r
89 // Anchor SKIP OFFSET
\r
91 // Flags: [RightToLeft] ??
\r
92 // SKIP: relative address of tail expression
\r
93 // OFFSET: offset of anchor from start of pattern
\r
104 // In practice, the anchoring expression is only going to be
\r
105 // Position (StartOfString, StartOfLine, StartOfScan) or String.
\r
106 // This is because the optimizer looks for position anchors at the
\r
107 // start of the expression, and if that fails it looks for the
\r
108 // longest substring. If an expression has neither a position
\r
109 // anchor nor a longest substring anchor, then the anchoring expression
\r
110 // is left empty. Since an empty expression will anchor at any
\r
111 // position in any string, the entire input string will be scanned.
\r
113 // String LEN STR...
\r
115 // Flags: [RightToLeft, IgnoreCase]
\r
116 // LEN: length of string
\r
117 // STR: string characters
\r
121 // SKIP: relative address of next branch
\r
135 // Repeat SKIP MIN MAX
\r
138 // SKIP: relative address of Until instruction
\r
139 // MIN: minimum iterations
\r
140 // MAX: maximum iterations (0xffff is infinity)
\r
142 // Repeat :1 MIN MAX
\r
147 // FastRepeat SKIP MIN MAX
\r
150 // SKIP: relative address of tail expression
\r
151 // MIN: minimum iterations
\r
152 // MAX: maximum iterations (0xffff is infinity)
\r
154 // FastRepeat :1 MIN MAX
\r
161 // The subexpression of a FastRepeat construct must not contain any
\r
162 // complex operators. These include: Open, Close, Balance, Repeat,
\r
163 // FastRepeat, Sub, Test. In addition, the subexpression must have
\r
164 // been determined to have a fixed width.
\r
168 // SKIP: relative address of tail expression
\r
176 // The Sub operator invokes an independent subexpression. This means
\r
177 // that the subexpression will match only once and so will not
\r
178 // participate in any backtracking.
\r
180 // Test TSKIP FSKIP
\r
182 // TSKIP: relative address of true expression
\r
183 // FSKIP: relative address of false expression
\r
185 // Usage: (?(?=test)true|false)
\r
194 // Usage: (?(?=test)true)
\r
212 // For negative lookaheads, just swap the values of TSKIP and
\r
213 // FSKIP. For lookbehinds, the test expression must be compiled
\r
214 // in reverse. The test expression is always executed as an
\r
215 // independent subexpression, so its behaviour is non-backtracking
\r
216 // (like a Sub clause.)
\r
218 // IfDefined SKIP GID
\r
220 // SKIP: relative address of else expression
\r
221 // GID: number of group to check
\r
223 // Usage: (?(gid)true)
\r
229 // Usage: (?(gid)true|false)
\r
239 // SKIP: relative address of target expression
\r
243 // :1 <target expr>
\r
247 // Flags: [Negate, IgnoreCase, RightToLeft]
\r
248 // CHAR: exact character to match
\r
252 // Flags: [Negate, RightToLeft]
\r
253 // CAT: category to match (see Category enum)
\r
257 // Flags: [Negate, IgnoreCase, RightToLeft]
\r
258 // LO: lowest character in range
\r
259 // HI: highest character in range
\r
261 // Set LO LEN SET...
\r
263 // Flags: [Negate, IgnoreCase, RightToLeft]
\r
264 // LO: lowest character in set
\r
265 // LEN: number of words in set
\r
266 // SET: bit array representing characters in set
\r
270 // Each word in the set represents 16 characters, so the first word
\r
271 // defines membership for characters LO to LO + 15, the second for
\r
272 // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
\r
273 // up to the compiler to provide a compact representation for sparse
\r
274 // unicode sets. The simple way is to use Set 0 4096. Other methods
\r
275 // involve partitioning the set and placing the components into an
\r
280 // SKIP: relative address of tail expression
\r
298 // The In instruction consumes a single character, using the flags
\r
299 // of the first instruction in the subexpression to determine its
\r
300 // IgnoreCase and RightToLeft properties. The subexpression is then
\r
301 // applied to the single character as a disjunction. If any instruction
\r
302 // in the subexpression succeeds, the entire In construct succeeds
\r
303 // and matching continues with the tail.
\r
307 // POS: position to match (see Position enum)
\r
311 // GID: number of group to open
\r
315 // GID: number of group to close
\r
319 // GID: number of capturing group (0 if none)
\r
320 // BAL: number of group to undefine
\r
322 // Info GROUPS MIN MAX
\r
324 // GROUPS: number of capturing groups
\r
325 // MIN: minimum width of pattern
\r
326 // MAX: maximum width of pattern (0xffff means undefined)
\r
334 // Flags: [IgnoreCase, RightToLeft]
\r
335 // GID: number of group to reference
\r