// // assembly: System // namespace: System.Text.RegularExpressions // file: arch.cs // // author: Dan Lewis (dlewis@gmx.co.uk) // (c) 2002 // // Permission is hereby granted, free of charge, to any person obtaining // a copy of this software and associated documentation files (the // "Software"), to deal in the Software without restriction, including // without limitation the rights to use, copy, modify, merge, publish, // distribute, sublicense, and/or sell copies of the Software, and to // permit persons to whom the Software is furnished to do so, subject to // the following conditions: // // The above copyright notice and this permission notice shall be // included in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
//

using System;
using System.Collections;

namespace System.Text.RegularExpressions {

	// Instruction codes of the regular-expression machine.
	//
	// The numeric values are written out explicitly so that adding or moving
	// a member can never silently renumber the others. The operand layout of
	// each instruction is described in the reference comments further down.
	enum OpCode : ushort {
		False = 0,		// always fails
		True = 1,		// always succeeds

		// matching

		Position = 2,		// zero-width position assertion
		String = 3,		// match string literal
		Reference = 4,		// back reference

		// character matching

		Character = 5,		// match character exactly
		Category = 6,		// match character from category
		NotCategory = 7,	// match character _not_ from category
		Range = 8,		// match character from range
		Set = 9,		// match character from set
		In = 10,		// match character from group of tests

		// capturing

		Open = 11,		// open group
		Close = 12,		// close group
		Balance = 13,		// balance groups
		BalanceStart = 14,	// track balance group length

		// control flow

		IfDefined = 15,		// conditional on capture
		Sub = 16,		// non-backtracking subexpression
		Test = 17,		// non-backtracking lookahead/behind
		Branch = 18,		// alternative expression
		Jump = 19,		// unconditional goto
		Repeat = 20,		// new repeat context
		Until = 21,		// repeat subexpression within context
		FastRepeat = 22,	// repeat simple subexpression
		Anchor = 23,		// anchoring expression

		// miscellaneous

		Info = 24		// pattern information
	}

	// Modifier bits for instructions. Which flags apply to which instruction
	// is listed under "Flags:" in the reference comments below.
	//
	// NOTE(review): the values start at 0x100, leaving the low eight bits
	// clear - presumably so a flag set can be OR-ed into the same word as an
	// OpCode. Confirm against the compiler/interpreter.
	[Flags]
	enum OpFlags : ushort {
		None = 0x000,
		Negate = 0x100,		// succeed on mismatch
		IgnoreCase = 0x200,	// case insensitive matching
		RightToLeft = 0x400,	// right-to-left matching
		Lazy = 0x800		// minimizing repeat
	}

	// Operand of the Position instruction: the zero-width assertion to test.
	//
	// NOTE(review): Start and StartOfString carry the same description (\A);
	// the difference between them is not visible in this file.
	enum Position : ushort {
		Any = 0,		// anywhere
		Start = 1,		// start of string			\A
		StartOfString = 2,	// start of string			\A
		StartOfLine = 3,	// start of line			^
		StartOfScan = 4,	// start of scan			\G
		End = 5,		// end or before newline at end		\Z
		EndOfString = 6,	// end of string			\z
		EndOfLine = 7,		// end of line				$
		Boundary = 8,		// word boundary			\b
		NonBoundary = 9		// not word boundary			\B
	}

	// see category.cs for Category enum

	// A regular-expression machine: given the owning Regex, the input text
	// and two integer positions, Scan returns a Match.
	//
	// NOTE(review): the exact meaning of start/end (inclusive or exclusive,
	// and how they interact with RightToLeft) is defined by the implementing
	// classes, not here.
	interface IMachine {
		Match Scan (Regex regex, string text, int start, int end);
	}

	// Creates IMachine instances and exposes the group information that
	// goes with them.
	interface IMachineFactory {
		// Returns a machine instance.
		IMachine NewInstance ();

		// NOTE(review): presumably the mapping between group names and
		// group numbers - confirm against the implementations.
		IDictionary Mapping { get; set; }

		// Group count reported by the factory.
		int GroupCount { get; }
	}

	// ------------------------------------------------------------------
	// Instruction reference
	//
	// Each entry starts with the instruction name followed by its operands.
	// In the usage listings, "N:" defines a label, ":N" refers to one, and
	// {name} stands for a compiled sub-expression.
	//
	// NOTE(review): the {name} placeholders and the normalised label
	// notation were reconstructed from the operand descriptions; verify
	// them against the compiler before relying on exact layouts.
	// NotCategory, BalanceStart and Until have no operand description here.
	// ------------------------------------------------------------------

	// Anchor SKIP OFFSET
	//
	// Flags:	[RightToLeft] ??
	// SKIP:	relative address of tail expression
	// OFFSET:	offset of anchor from start of pattern
	//
	// Usage:
	//
	//	Anchor :1 OFFSET
	//		{expr}
	//		True
	// 1:	{tail}
	//
	// Notes:
	//
	// In practice, the anchoring expression is only going to be
	// Position (StartOfString, StartOfLine, StartOfScan) or String.
	// This is because the optimizer looks for position anchors at the
	// start of the expression, and if that fails it looks for the
	// longest substring. If an expression has neither a position
	// anchor nor a longest substring anchor, then the anchoring expression
	// is left empty. Since an empty expression will anchor at any
	// position in any string, the entire input string will be scanned.

	// String LEN STR...
	//
	// Flags:	[RightToLeft, IgnoreCase]
	// LEN:		length of string
	// STR:		string characters

	// Branch SKIP
	//
	// SKIP:	relative address of next branch
	//
	//	Branch :1
	//		{alt expr 1}
	//		Jump :4
	// 1:	Branch :2
	//		{alt expr 2}
	//		Jump :4
	// 2:	Branch :3
	//		{alt expr 3}
	//		Jump :4
	// 3:	False
	// 4:	{tail}

	// Repeat SKIP MIN MAX
	//
	// Flags:	[Lazy]
	// SKIP:	relative address of Until instruction
	// MIN:		minimum iterations (2 slots)
	// MAX:		maximum iterations (2 slots, 0x7fffffff is infinity)
	//
	//	Repeat :1 MIN MAX
	//		{expr}
	//		Until
	// 1:	{tail}

	// FastRepeat SKIP MIN MAX
	//
	// Flags:	[Lazy]
	// SKIP:	relative address of tail expression
	// MIN:		minimum iterations (2 slots)
	// MAX:		maximum iterations (2 slots, 0x7fffffff is infinity)
	//
	//	FastRepeat :1 MIN MAX
	//		{expr}
	//		True
	// 1:	{tail}
	//
	// Notes:
	//
	// The subexpression of a FastRepeat construct must not contain any
	// complex operators. These include: Open, Close, Balance, Repeat,
	// FastRepeat, Sub, Test. In addition, the subexpression must have
	// been determined to have a fixed width.

	// Sub SKIP
	//
	// SKIP:	relative address of tail expression
	//
	//	Sub :1
	//		{expr}
	// 1:	{tail}
	//
	// Notes:
	//
	// The Sub operator invokes an independent subexpression. This means
	// that the subexpression will match only once and so will not
	// participate in any backtracking.

	// Test TSKIP FSKIP
	//
	// TSKIP:	relative address of true expression
	// FSKIP:	relative address of false expression
	//
	// Usage:	(?(?=test)true|false)
	//
	//	Test :1 :2
	//		{test expr}
	// 1:	{true expr}
	//		Jump :3
	// 2:	{false expr}
	// 3:	{tail}
	//
	// Usage:	(?(?=test)true)
	//
	//	Test :1 :2
	//		{test expr}
	// 1:	{true expr}
	// 2:	{tail}
	//
	// Usage:	(?=test)
	//
	//	Test :1 :2
	//		{test expr}
	// 1:	Jump :3
	// 2:	False
	// 3:	{tail}
	//
	// Notes:
	//
	// For negative lookaheads, just swap the values of TSKIP and
	// FSKIP. For lookbehinds, the test expression must be compiled
	// in reverse. The test expression is always executed as an
	// independent subexpression, so its behaviour is non-backtracking
	// (like a Sub clause.)

	// IfDefined SKIP GID
	//
	// SKIP:	relative address of else expression
	// GID:		number of group to check
	//
	// Usage:	(?(gid)true)
	//
	//	IfDefined :1 GID
	//		{true expr}
	// 1:	{tail}
	//
	// Usage:	(?(gid)true|false)
	//
	//	IfDefined :1 GID
	//		{true expr}
	//		Jump :2
	// 1:	{false expr}
	// 2:	{tail}

	// Jump SKIP
	//
	// SKIP:	relative address of target expression
	//
	//	Jump :1
	//	...
	// 1:	{target expr}

	// Character CHAR
	//
	// Flags:	[Negate, IgnoreCase, RightToLeft]
	// CHAR:	exact character to match

	// Category CAT
	//
	// Flags:	[Negate, RightToLeft]
	// CAT:		category to match (see Category enum)

	// Range LO HI
	//
	// Flags:	[Negate, IgnoreCase, RightToLeft]
	// LO:		lowest character in range
	// HI:		highest character in range

	// Set LO LEN SET...
	//
	// Flags:	[Negate, IgnoreCase, RightToLeft]
	// LO:		lowest character in set
	// LEN:		number of words in set
	// SET:		bit array representing characters in set
	//
	// Notes:
	//
	// Each word in the set represents 16 characters, so the first word
	// defines membership for characters LO to LO + 15, the second for
	// LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
	// up to the compiler to provide a compact representation for sparse
	// unicode sets. The simple way is to use Set 0 4096. Other methods
	// involve partitioning the set and placing the components into an
	// In block.

	// In SKIP
	//
	// SKIP:	relative address of tail expression
	//
	// Usage:	[expr]
	//
	//	In :1
	//		{expr}
	//		True
	// 1:	{tail}
	//
	// Usage:	[^expr]
	//
	//	In :1
	//		{expr}
	//		False
	// 1:	{tail}
	//
	// Notes:
	//
	// The In instruction consumes a single character, using the flags
	// of the first instruction in the subexpression to determine its
	// IgnoreCase and RightToLeft properties. The subexpression is then
	// applied to the single character as a disjunction. If any instruction
	// in the subexpression succeeds, the entire In construct succeeds
	// and matching continues with the tail.

	// Position POS
	//
	// POS:		position to match (see Position enum)

	// Open GID
	//
	// GID:		number of group to open

	// Close GID
	//
	// GID:		number of group to close

	// Balance GID BAL
	//
	// GID:		number of capturing group (0 if none)
	// BAL:		number of group to undefine

	// Info GROUPS MIN MAX
	//
	// GROUPS:	number of capturing groups (2 slots)
	// MIN:		minimum width of pattern (2 slots)
	// MAX:		maximum width of pattern (2 slots, 0x7fffffff means undefined)

	// False

	// True

	// Reference GID
	//
	// Flags:	[IgnoreCase, RightToLeft]
	// GID:		number of group to reference
}