3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions {
// Opcodes of the compiled regular-expression program. The underlying type is
// ushort. The operands each instruction takes (SKIP, MIN, MAX, GID, ...) are
// described in the per-instruction comments further down in this file.
// NOTE(review): OpFlags values start at 0x100, so presumably an opcode and its
// flags are combined in one instruction word -- confirm against the
// compiler/interpreter.
35 enum OpCode : ushort {
36 False = 0, // always fails
37 True, // always succeeds
41 Position, // zero-width position assertion
42 String, // match string literal
43 Reference, // back reference
47 Character, // match character exactly
48 Category, // match character from category
49 Range, // match character from range
50 Set, // match character from set
51 In, // match character from group of tests
57 Balance, // balance groups
58 BalanceStart, // track balance group length
62 IfDefined, // conditional on capture
63 Sub, // non-backtracking subexpression
64 Test, // non-backtracking lookahead/behind
65 Branch, // alternative expression
66 Jump, // unconditional goto
67 Repeat, // new repeat context
68 Until, // repeat subexpression within context
69 FastRepeat, // repeat simple subexpression
70 Anchor, // anchoring expression
74 Info // pattern information
// Modifier flags for an instruction. Every value visible here is a distinct
// single bit at or above 0x100. The "Flags:" notes in the per-instruction
// comments later in this file list the flags documented for each instruction.
78 enum OpFlags : ushort {
80 Negate = 0x100, // succeed on mismatch
81 IgnoreCase = 0x200, // case insensitive matching
82 RightToLeft = 0x400, // right-to-left matching
83 Lazy = 0x800 // minimizing repeat
// Kinds of zero-width position assertion. The trailing token in each comment
// is the regex syntax that corresponds to the member.
// NOTE(review): Start and StartOfString carry the same description (\A);
// confirm whether the two are intended to behave differently.
86 enum Position : ushort {
88 Start, // start of string \A
89 StartOfString, // start of string \A
90 StartOfLine, // start of line ^
91 StartOfScan, // start of scan \G
92 End, // end or before newline at end \Z
93 EndOfString, // end of string \z
94 EndOfLine, // end of line $
95 Boundary, // word boundary \b
96 NonBoundary // not word boundary \B
99 // see category.cs for Category enum
// Scan entry point: takes the Regex, the input text and two int positions,
// and returns a Match.
// NOTE(review): start/end presumably delimit the region of text to scan --
// confirm against the implementing machine.
102 Match Scan (Regex regex, string text, int start, int end);
// Factory interface that hands out IMachine instances and exposes a Mapping
// dictionary and a group count alongside them.
105 interface IMachineFactory {
// Returns an IMachine.
// NOTE(review): the name suggests a fresh instance per call -- confirm.
106 IMachine NewInstance ();
// Read/write dictionary.
// NOTE(review): presumably maps group names to group numbers -- confirm against callers.
107 IDictionary Mapping { get; set; }
// Read-only count.
// NOTE(review): presumably the number of capturing groups (cf. the GROUPS operand of Info) -- confirm.
108 int GroupCount { get; }
111 // Anchor SKIP OFFSET
113 // Flags: [RightToLeft] ??
114 // SKIP: relative address of tail expression
115 // OFFSET: offset of anchor from start of pattern
126 // In practice, the anchoring expression is only going to be
127 // Position (StartOfString, StartOfLine, StartOfScan) or String.
128 // This is because the optimizer looks for position anchors at the
129 // start of the expression, and if that fails it looks for the
130 // longest substring. If an expression has neither a position
131 // anchor nor a longest substring anchor, then the anchoring expression
132 // is left empty. Since an empty expression will anchor at any
133 // position in any string, the entire input string will be scanned.
137 // Flags: [RightToLeft, IgnoreCase]
138 // LEN: length of string
139 // STR: string characters
143 // SKIP: relative address of next branch
157 // Repeat SKIP MIN MAX
160 // SKIP: relative address of Until instruction
161 // MIN: minimum iterations
162 // MAX: maximum iterations (0xffff is infinity)
169 // FastRepeat SKIP MIN MAX
172 // SKIP: relative address of tail expression
173 // MIN: minimum iterations
174 // MAX: maximum iterations (0xffff is infinity)
176 // FastRepeat :1 MIN MAX
183 // The subexpression of a FastRepeat construct must not contain any
184 // complex operators. These include: Open, Close, Balance, Repeat,
185 // FastRepeat, Sub, Test. In addition, the subexpression must have
186 // been determined to have a fixed width.
190 // SKIP: relative address of tail expression
198 // The Sub operator invokes an independent subexpression. This means
199 // that the subexpression will match only once and so will not
200 // participate in any backtracking.
204 // TSKIP: relative address of true expression
205 // FSKIP: relative address of false expression
207 // Usage: (?(?=test)true|false)
216 // Usage: (?(?=test)true)
234 // For negative lookaheads, just swap the values of TSKIP and
235 // FSKIP. For lookbehinds, the test expression must be compiled
236 // in reverse. The test expression is always executed as an
237 // independent subexpression, so its behaviour is non-backtracking
238 // (like a Sub clause.)
240 // IfDefined SKIP GID
242 // SKIP: relative address of else expression
243 // GID: number of group to check
245 // Usage: (?(gid)true)
251 // Usage: (?(gid)true|false)
261 // SKIP: relative address of target expression
269 // Flags: [Negate, IgnoreCase, RightToLeft]
270 // CHAR: exact character to match
274 // Flags: [Negate, RightToLeft]
275 // CAT: category to match (see Category enum)
279 // Flags: [Negate, IgnoreCase, RightToLeft]
280 // LO: lowest character in range
281 // HI: highest character in range
285 // Flags: [Negate, IgnoreCase, RightToLeft]
286 // LO: lowest character in set
287 // LEN: number of words in set
288 // SET: bit array representing characters in set
292 // Each word in the set represents 16 characters, so the first word
293 // defines membership for characters LO to LO + 15, the second for
294 // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
295 // up to the compiler to provide a compact representation for sparse
296 // unicode sets. The simple way is to use Set 0 4096. Other methods
297 // involve partitioning the set and placing the components into an
302 // SKIP: relative address of tail expression
320 // The In instruction consumes a single character, using the flags
321 // of the first instruction in the subexpression to determine its
322 // IgnoreCase and RightToLeft properties. The subexpression is then
323 // applied to the single character as a disjunction. If any instruction
324 // in the subexpression succeeds, the entire In construct succeeds
325 // and matching continues with the tail.
329 // POS: position to match (see Position enum)
333 // GID: number of group to open
337 // GID: number of group to close
341 // GID: number of capturing group (0 if none)
342 // BAL: number of group to undefine
344 // Info GROUPS MIN MAX
346 // GROUPS: number of capturing groups
347 // MIN: minimum width of pattern
348 // MAX: maximum width of pattern (0xffff means undefined)
356 // Flags: [IgnoreCase, RightToLeft]
357 // GID: number of group to reference