3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions {
35 enum OpCode : ushort {
36 False = 0, // always fails
37 True, // always succeeds
41 Position, // zero-width position assertion
42 String, // match string literal
43 Reference, // back reference
47 Character, // match character exactly
48 Category, // match character from category
49 NotCategory, // match character _not_ from category
50 Range, // match character from range
51 Set, // match character from set
52 In, // match character from group of tests
58 Balance, // balance groups
59 BalanceStart, //track balance group length
63 IfDefined, // conditional on capture
64 Sub, // non-backtracking subexpression
65 Test, // non-backtracking lookahead/behind
66 Branch, // alternative expression
67 Jump, // unconditional goto
68 Repeat, // new repeat context
69 Until, // repeat subexpression within context
70 FastRepeat, // repeat simple subexpression
71 Anchor, // anchoring expression
75 Info // pattern information
79 enum OpFlags : ushort {
81 Negate = 0x100, // succeed on mismatch
82 IgnoreCase = 0x200, // case insensitive matching
83 RightToLeft = 0x400, // right-to-left matching
84 Lazy = 0x800 // minimizing repeat
87 enum Position : ushort {
89 Start, // start of string \A
90 StartOfString, // start of string \A
91 StartOfLine, // start of line ^
92 StartOfScan, // start of scan \G
93 End, // end or before newline at end \Z
94 EndOfString, // end of string \z
95 EndOfLine, // end of line $
96 Boundary, // word boundary \b
97 NonBoundary // not word boundary \B
100 // see category.cs for Category enum
103 Match Scan (Regex regex, string text, int start, int end);
106 interface IMachineFactory {
107 IMachine NewInstance ();
108 IDictionary Mapping { get; set; }
109 int GroupCount { get; }
112 // Anchor SKIP OFFSET
114 // Flags: [RightToLeft] ??
115 // SKIP: relative address of tail expression
116 // OFFSET: offset of anchor from start of pattern
127 // In practice, the anchoring expression is only going to be
128 // Position (StartOfString, StartOfLine, StartOfScan) or String.
129 // This is because the optimizer looks for position anchors at the
130 // start of the expression, and if that fails it looks for the
131 // longest substring. If an expression has neither a position
132 // anchor or a longest substring anchor, then the anchoring expression
133 // is left empty. Since an empty expression will anchor at any
134 // position in any string, the entire input string will be scanned.
138 // Flags: [RightToLeft, IgnoreCase]
139 // LEN: length of string
140 // STR: string characters
144 // SKIP: relative address of next branch
158 // Repeat SKIP MIN MAX
161 // SKIP: relative address of Until instruction
162 // MIN: minimum iterations (2 slots)
163 // MAX: maximum iterations (2 slots, 0x7fffffff is infinity)
170 // FastRepeat SKIP MIN MAX
173 // SKIP: relative address of tail expression
174 // MIN: minimum iterations (2 slots)
175 // MAX: maximum iterations (2 slots, 0x7fffffff is infinity)
177 // FastRepeat :1 MIN MAX
184 // The subexpression of a FastRepeat construct must not contain any
185 // complex operators. These include: Open, Close, Balance, Repeat,
186 // FastRepeat, Sub, Test. In addition, the subexpression must have
187 // been determined to have a fixed width.
191 // SKIP: relative address of tail expression
199 // The Sub operator invokes an independent subexpression. This means
200 // that the subexpression will match only once and so will not
201 // participate in any backtracking.
205 // TSKIP: relative address of true expression
206 // FSKIP: relative address of false expression
208 // Usage: (?(?=test)true|false)
217 // Usage: (?(?=test)true)
235 // For negative lookaheads, just swap the values of TSKIP and
236 // FSKIP. For lookbehinds, the test expression must be compiled
237 // in reverse. The test expression is always executed as an
238 // independent subexpression, so its behaviour is non-backtracking
239 // (like a Sub clause.)
241 // IfDefined SKIP GID
243 // SKIP: relative address of else expression
244 // GID: number of group to check
246 // Usage: (?(gid)true)
252 // Usage: (?(gid)true|false)
262 // SKIP: relative address of target expression
270 // Flags: [Negate, IgnoreCase, RightToLeft]
271 // CHAR: exact character to match
275 // Flags: [Negate, RightToLeft]
276 // CAT: category to match (see Category enum)
280 // Flags: [Negate, IgnoreCase, RightToLeft]
281 // LO: lowest character in range
282 // HI: higest character in range
286 // Flags: [Negate, IgnoreCase, RightToLeft]
287 // LO: lowest character in set
288 // LEN: number of words in set
289 // SET: bit array representing characters in set
293 // Each word in the set represents 16 characters, so the first word
294 // defines membership for characters LO to LO + 15, the second for
295 // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
296 // up to the compiler to provide a compact representation for sparse
297 // unicode sets. The simple way is to use Set 0 4096. Other methods
298 // involve paritioning the set and placing the components into an
303 // SKIP: relative address of tail expression
321 // The In instruction consumes a single character, using the flags
322 // of the first instruction in the subexpression to determine its
323 // IgnoreCase and RightToLeft properties. The subexpression is then
324 // applied to the single character as a disjunction. If any instruction
325 // in the subexpression succeeds, the entire In construct succeeds
326 // and matching continues with the tail.
330 // POS: position to match (see Position enum)
334 // GID: number of group to open
338 // GID: number of group to close
342 // GID: number of capturing group (0 if none)
343 // BAL: number of group to undefine
345 // Info GROUPS MIN MAX
347 // GROUPS: number of capturing groups (2 slots)
348 // MIN: minimum width of pattern (2 slots)
349 // MAX: maximum width of pattern (2 slots, 0x7fffffff means undefined)
357 // Flags: [IgnoreCase, RightToLeft]
358 // GID: number of group to reference