3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions {
// Instruction opcodes of the regular-expression program, stored as 16-bit
// (ushort) values. The operand layout of individual instructions (SKIP, MIN,
// MAX, GID, ...) is described in the comment block further down in this file.
35 enum OpCode : ushort {
36 False = 0, // always fails
37 True, // always succeeds
41 Position, // zero-width position assertion
42 String, // match string literal
43 Reference, // back reference
47 Character, // match character exactly
48 Category, // match character from category
49 NotCategory, // match character _not_ from category
50 Range, // match character from range
51 Set, // match character from set
52 In, // match character from group of tests
58 Balance, // balance groups
59 BalanceStart, // track balance group length
63 IfDefined, // conditional on capture
64 Sub, // non-backtracking subexpression
65 Test, // non-backtracking lookahead/behind
66 Branch, // alternative expression
67 Jump, // unconditional goto
68 Repeat, // new repeat context
69 Until, // repeat subexpression within context
70 FastRepeat, // repeat simple subexpression
71 Anchor, // anchoring expression
75 Info // pattern information
// Modifier bits that qualify an instruction (see the "Flags:" entries in the
// instruction descriptions later in this file).
// NOTE(review): every value is >= 0x100, which leaves the low byte free --
// presumably so the flags can be OR-ed with an OpCode in a single ushort;
// confirm against the compiler/interpreter.
79 enum OpFlags : ushort {
81 Negate = 0x100, // succeed on mismatch
82 IgnoreCase = 0x200, // case insensitive matching
83 RightToLeft = 0x400, // right-to-left matching
84 Lazy = 0x800 // minimizing repeat
// Kinds of zero-width position assertion; used as the POS operand of the
// Position instruction. The trailing token in each comment is the regex
// syntax that corresponds to the assertion.
87 enum Position : ushort {
89 Start, // start of string \A (NOTE(review): same description as StartOfString -- confirm how the two differ)
90 StartOfString, // start of string \A
91 StartOfLine, // start of line ^
92 StartOfScan, // start of scan \G
93 End, // end or before newline at end \Z
94 EndOfString, // end of string \z
95 EndOfLine, // end of line $
96 Boundary, // word boundary \b
97 NonBoundary // not word boundary \B
100 // see category.cs for Category enum
// Runs `regex` against `text` and returns the resulting Match.
// NOTE(review): `start` and `end` presumably delimit the region of `text`
// to scan -- confirm against the implementing class.
103 Match Scan (Regex regex, string text, int start, int end);
// Splits `input` using `regex` and returns the pieces as a string array.
// NOTE(review): `count` and `startat` presumably carry the same meaning as
// in Regex.Split (input, count, startat) -- confirm against the implementation.
104 string [] Split (Regex regex, string input, int count, int startat);
// Returns `input` with matches of `regex` substituted by `replacement`.
// NOTE(review): `count` and `startat` presumably carry the same meaning as
// in Regex.Replace (input, replacement, count, startat) -- confirm.
105 string Replace (Regex regex, string input, string replacement, int count, int startat);
// Produces a string from the `replacement` pattern and the given `match`.
// NOTE(review): presumably the back end of Match.Result (replacement) -- confirm.
106 string Result (string replacement, Match match);
// Factory for IMachine instances; also carries the group-related metadata
// (Mapping, GroupCount, Gap, NamesMapping) that accompanies them.
109 interface IMachineFactory {
// Creates a new IMachine.
110 IMachine NewInstance ();
// NOTE(review): presumably maps group names to group numbers -- confirm against callers.
111 IDictionary Mapping { get; set; }
// Number of groups (read-only).
112 int GroupCount { get; }
113 int Gap { get; set; } // Index of first group whose number differs from its index, or 1+GroupCount
// NOTE(review): presumably the group names indexed by group position -- confirm against callers.
114 string [] NamesMapping { get; set; }
117 // Anchor SKIP OFFSET
119 // Flags: [RightToLeft] ??
120 // SKIP: relative address of tail expression
121 // OFFSET: offset of anchor from start of pattern
132 // In practice, the anchoring expression is only going to be
133 // Position (StartOfString, StartOfLine, StartOfScan) or String.
134 // This is because the optimizer looks for position anchors at the
135 // start of the expression, and if that fails it looks for the
136 // longest substring. If an expression has neither a position
137 // anchor nor a longest substring anchor, then the anchoring expression
138 // is left empty. Since an empty expression will anchor at any
139 // position in any string, the entire input string will be scanned.
143 // Flags: [RightToLeft, IgnoreCase]
144 // LEN: length of string
145 // STR: string characters
149 // SKIP: relative address of next branch
163 // Repeat SKIP MIN MAX
166 // SKIP: relative address of Until instruction
167 // MIN: minimum iterations (2 slots)
168 // MAX: maximum iterations (2 slots, 0x7fffffff is infinity)
175 // FastRepeat SKIP MIN MAX
178 // SKIP: relative address of tail expression
179 // MIN: minimum iterations (2 slots)
180 // MAX: maximum iterations (2 slots, 0x7fffffff is infinity)
182 // FastRepeat :1 MIN MAX
189 // The subexpression of a FastRepeat construct must not contain any
190 // complex operators. These include: Open, Close, Balance, Repeat,
191 // FastRepeat, Sub, Test. In addition, the subexpression must have
192 // been determined to have a fixed width.
196 // SKIP: relative address of tail expression
204 // The Sub operator invokes an independent subexpression. This means
205 // that the subexpression will match only once and so will not
206 // participate in any backtracking.
210 // TSKIP: relative address of true expression
211 // FSKIP: relative address of false expression
213 // Usage: (?(?=test)true|false)
222 // Usage: (?(?=test)true)
240 // For negative lookaheads, just swap the values of TSKIP and
241 // FSKIP. For lookbehinds, the test expression must be compiled
242 // in reverse. The test expression is always executed as an
243 // independent subexpression, so its behaviour is non-backtracking
244 // (like a Sub clause.)
246 // IfDefined SKIP GID
248 // SKIP: relative address of else expression
249 // GID: number of group to check
251 // Usage: (?(gid)true)
257 // Usage: (?(gid)true|false)
267 // SKIP: relative address of target expression
275 // Flags: [Negate, IgnoreCase, RightToLeft]
276 // CHAR: exact character to match
280 // Flags: [Negate, RightToLeft]
281 // CAT: category to match (see Category enum)
285 // Flags: [Negate, IgnoreCase, RightToLeft]
286 // LO: lowest character in range
287 // HI: highest character in range
291 // Flags: [Negate, IgnoreCase, RightToLeft]
292 // LO: lowest character in set
293 // LEN: number of words in set
294 // SET: bit array representing characters in set
298 // Each word in the set represents 16 characters, so the first word
299 // defines membership for characters LO to LO + 15, the second for
300 // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
301 // up to the compiler to provide a compact representation for sparse
302 // Unicode sets. The simple way is to use Set 0 4096. Other methods
303 // involve partitioning the set and placing the components into an
308 // SKIP: relative address of tail expression
326 // The In instruction consumes a single character, using the flags
327 // of the first instruction in the subexpression to determine its
328 // IgnoreCase and RightToLeft properties. The subexpression is then
329 // applied to the single character as a disjunction. If any instruction
330 // in the subexpression succeeds, the entire In construct succeeds
331 // and matching continues with the tail.
335 // POS: position to match (see Position enum)
339 // GID: number of group to open
343 // GID: number of group to close
347 // GID: number of capturing group (0 if none)
348 // BAL: number of group to undefine
350 // Info GROUPS MIN MAX
352 // GROUPS: number of capturing groups (2 slots)
353 // MIN: minimum width of pattern (2 slots)
354 // MAX: maximum width of pattern (2 slots, 0x7fffffff means undefined)
362 // Flags: [IgnoreCase, RightToLeft]
363 // GID: number of group to reference