3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions {
// Opcodes of the compiled pattern program. The underlying type is ushort,
// so every opcode is a 16-bit value; False is pinned to 0 and the members
// that follow are numbered implicitly in declaration order.
// The operands and flags of the individual instructions are described in
// the comment reference further down in this file.
35 enum OpCode : ushort {
36 False = 0, // always fails
37 True, // always succeeds
41 Position, // zero-width position assertion
42 String, // match string literal
43 Reference, // back reference
47 Character, // match character exactly
48 Category, // match character from category
49 NotCategory, // match character _not_ from category
50 Range, // match character from range
51 Set, // match character from set
52 In, // match character from group of tests
58 Balance, // balance groups
59 BalanceStart, // track balance group length
63 IfDefined, // conditional on capture
64 Sub, // non-backtracking subexpression
65 Test, // non-backtracking lookahead/behind
66 Branch, // alternative expression
67 Jump, // unconditional goto
68 Repeat, // new repeat context
69 Until, // repeat subexpression within context
70 FastRepeat, // repeat simple subexpression
71 Anchor, // anchoring expression
75 Info, // pattern information
77 JumpTest // Jump if we didn't already go
78 // through this path with an alternative
79 // option (an "or" path).. i.e. so we
80 // don't do short circuit or
// Modifier bits that qualify an instruction. Each member is a single,
// distinct bit at or above 0x100, so several flags can be OR'ed together
// (the instruction reference below lists which flags each instruction accepts).
// NOTE(review): presumably these bits share one ushort with an OpCode held
// in the low byte - confirm against the compiler/interpreter.
84 enum OpFlags : ushort {
86 Negate = 0x100, // succeed on mismatch
87 IgnoreCase = 0x200, // case insensitive matching
88 RightToLeft = 0x400, // right-to-left matching
89 Lazy = 0x800 // minimizing repeat
// Kinds of zero-width position assertion; this is the POS operand of the
// Position instruction. Where a member comment ends with an escape or a
// symbol, that is the pattern syntax the member corresponds to.
// NOTE(review): Start and StartOfString are described identically (\A) -
// confirm whether the two are meant to behave differently.
92 enum Position : ushort {
94 Start, // start of string \A
95 StartOfString, // start of string \A
96 StartOfLine, // start of line ^
97 StartOfScan, // start of scan \G
98 End, // end or before newline at end \Z
99 EndOfString, // end of string \z
100 EndOfLine, // end of line $
101 Boundary, // word boundary \b
102 NonBoundary // not word boundary \B
105 // see category.cs for Category enum
// NOTE(review): the declaration enclosing the four members below is not
// visible in this part of the file; presumably it is the IMachine interface
// that IMachineFactory.NewInstance returns - confirm.
// Scan: takes a Regex, the input text and a start/end pair; yields a Match.
108 Match Scan (Regex regex, string text, int start, int end);
// Split: takes a Regex, the input, a count and a starting index; yields an array of strings.
109 string [] Split (Regex regex, string input, int count, int startat);
// Replace: takes a Regex, the input, a replacement string, a count and a starting index; yields a string.
110 string Replace (Regex regex, string input, string replacement, int count, int startat);
// Result: takes a replacement string and a Match (no Regex argument); yields a string.
111 string Result (string replacement, Match match);
// Hands out IMachine instances through NewInstance and carries the group
// metadata that goes with a pattern (mapping, group count, gap, names).
114 interface IMachineFactory {
115 IMachine NewInstance (); // returns an IMachine
// NOTE(review): key/value meaning of Mapping is not shown here - presumably
// it relates group names and group numbers; confirm against the callers.
116 IDictionary Mapping { get; set; }
117 int GroupCount { get; } // read-only
118 int Gap { get; set; } // Index of first group whose number differs from its index, or 1+GroupCount
// NOTE(review): presumably group names indexed by group position - confirm.
119 string [] NamesMapping { get; set; }
122 // Anchor SKIP OFFSET
124 // Flags: [RightToLeft] ??
125 // SKIP: relative address of tail expression
126 // OFFSET: offset of anchor from start of pattern
137 // In practice, the anchoring expression is only going to be
138 // Position (StartOfString, StartOfLine, StartOfScan) or String.
139 // This is because the optimizer looks for position anchors at the
140 // start of the expression, and if that fails it looks for the
141 // longest substring. If an expression has neither a position
142 // anchor nor a longest substring anchor, then the anchoring expression
143 // is left empty. Since an empty expression will anchor at any
144 // position in any string, the entire input string will be scanned.
148 // Flags: [RightToLeft, IgnoreCase]
149 // LEN: length of string
150 // STR: string characters
154 // SKIP: relative address of next branch
168 // Repeat SKIP MIN MAX
171 // SKIP: relative address of Until instruction
172 // MIN: minimum iterations (2 slots)
173 // MAX: maximum iterations (2 slots, 0x7fffffff is infinity)
180 // FastRepeat SKIP MIN MAX
183 // SKIP: relative address of tail expression
184 // MIN: minimum iterations (2 slots)
185 // MAX: maximum iterations (2 slots, 0x7fffffff is infinity)
187 // FastRepeat :1 MIN MAX
194 // The subexpression of a FastRepeat construct must not contain any
195 // complex operators. These include: Open, Close, Balance, Repeat,
196 // FastRepeat, Sub, Test. In addition, the subexpression must have
197 // been determined to have a fixed width.
201 // SKIP: relative address of tail expression
209 // The Sub operator invokes an independent subexpression. This means
210 // that the subexpression will match only once and so will not
211 // participate in any backtracking.
215 // TSKIP: relative address of true expression
216 // FSKIP: relative address of false expression
218 // Usage: (?(?=test)true|false)
227 // Usage: (?(?=test)true)
245 // For negative lookaheads, just swap the values of TSKIP and
246 // FSKIP. For lookbehinds, the test expression must be compiled
247 // in reverse. The test expression is always executed as an
248 // independent subexpression, so its behaviour is non-backtracking
249 // (like a Sub clause.)
251 // IfDefined SKIP GID
253 // SKIP: relative address of else expression
254 // GID: number of group to check
256 // Usage: (?(gid)true)
262 // Usage: (?(gid)true|false)
272 // SKIP: relative address of target expression
280 // Flags: [Negate, IgnoreCase, RightToLeft]
281 // CHAR: exact character to match
285 // Flags: [Negate, RightToLeft]
286 // CAT: category to match (see Category enum)
290 // Flags: [Negate, IgnoreCase, RightToLeft]
291 // LO: lowest character in range
292 // HI: highest character in range
296 // Flags: [Negate, IgnoreCase, RightToLeft]
297 // LO: lowest character in set
298 // LEN: number of words in set
299 // SET: bit array representing characters in set
303 // Each word in the set represents 16 characters, so the first word
304 // defines membership for characters LO to LO + 15, the second for
305 // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
306 // up to the compiler to provide a compact representation for sparse
307 // Unicode sets. The simple way is to use Set 0 4096. Other methods
308 // involve partitioning the set and placing the components into an
313 // SKIP: relative address of tail expression
331 // The In instruction consumes a single character, using the flags
332 // of the first instruction in the subexpression to determine its
333 // IgnoreCase and RightToLeft properties. The subexpression is then
334 // applied to the single character as a disjunction. If any instruction
335 // in the subexpression succeeds, the entire In construct succeeds
336 // and matching continues with the tail.
340 // POS: position to match (see Position enum)
344 // GID: number of group to open
348 // GID: number of group to close
352 // GID: number of capturing group (0 if none)
353 // BAL: number of group to undefine
355 // Info GROUPS MIN MAX
357 // GROUPS: number of capturing groups (2 slots)
358 // MIN: minimum width of pattern (2 slots)
359 // MAX: maximum width of pattern (2 slots, 0x7fffffff means undefined)
367 // Flags: [IgnoreCase, RightToLeft]
368 // GID: number of group to reference