// // assembly: System // namespace: System.Text.RegularExpressions // file: interpreter.cs // // author: Dan Lewis (dlewis@gmx.co.uk) // (c) 2002 // // Permission is hereby granted, free of charge, to any person obtaining // a copy of this software and associated documentation files (the // "Software"), to deal in the Software without restriction, including // without limitation the rights to use, copy, modify, merge, publish, // distribute, sublicense, and/or sell copies of the Software, and to // permit persons to whom the Software is furnished to do so, subject to // the following conditions: // // The above copyright notice and this permission notice shall be // included in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // using System; using System.Collections; using System.Diagnostics; using System.Globalization; namespace System.Text.RegularExpressions { class Interpreter : IMachine { private int ReadProgramCount (int ptr) { int ret = program [ptr + 1]; ret <<= 16; ret += program [ptr]; return ret; } public Interpreter (ushort[] program) { this.program = program; this.qs = null; // process info block Debug.Assert ((OpCode)program[0] == OpCode.Info, "Regex", "Cant' find info block"); this.group_count = ReadProgramCount (1) + 1; this.match_min = ReadProgramCount (3); //this.match_max = ReadProgramCount (5); // setup this.program_start = 7; this.groups = new int [group_count]; } // IMachine implementation public Match Scan (Regex regex, string text, int start, int end) { this.text = text; this.text_end = end; this.scan_ptr = start; if (Eval (Mode.Match, ref scan_ptr, program_start)) return GenerateMatch (regex); return Match.Empty; } // private methods private void Reset () { ResetGroups (); fast = repeat = null; } private bool Eval (Mode mode, ref int ref_ptr, int pc) { int ptr = ref_ptr; Begin: for (;;) { ushort word = program[pc]; OpCode op = (OpCode)(word & 0x00ff); OpFlags flags = (OpFlags)(word & 0xff00); switch (op) { case OpCode.Anchor: { int skip = program[pc + 1]; int anch_offset = program[pc + 2]; bool anch_reverse = (flags & OpFlags.RightToLeft) != 0; int anch_ptr = anch_reverse ? ptr - anch_offset : ptr + anch_offset; int anch_end = text_end - match_min + anch_offset; // maximum anchor position int anch_begin = 0; // the general case for an anchoring expression is at the bottom, however we // do some checks for the common cases before to save processing time. the current // optimizer only outputs three types of anchoring expressions: fixed position, // fixed substring, and no anchor. OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff); if (anch_op == OpCode.Position && skip == 6) { // position anchor // Anchor // Position // True switch ((Position)program[pc + 4]) { case Position.StartOfString: if (anch_reverse || anch_offset == 0) { if (anch_reverse) ptr = anch_offset; if (TryMatch (ref ptr, pc + skip)) goto Pass; } break; case Position.StartOfLine: if (anch_ptr == 0) { ptr = 0; if (TryMatch (ref ptr, pc + skip)) goto Pass; ++ anch_ptr; } while ((anch_reverse && anch_ptr >= 0) || (!anch_reverse && anch_ptr <= anch_end)) { if (anch_ptr == 0 || text[anch_ptr - 1] == '\n') { if (anch_reverse) ptr = anch_ptr == anch_end ? anch_ptr : anch_ptr + anch_offset; else ptr = anch_ptr == 0 ? anch_ptr : anch_ptr - anch_offset; if (TryMatch (ref ptr, pc + skip)) goto Pass; } if (anch_reverse) -- anch_ptr; else ++ anch_ptr; } break; case Position.StartOfScan: if (anch_ptr == scan_ptr) { ptr = anch_reverse ? scan_ptr + anch_offset : scan_ptr - anch_offset; if (TryMatch (ref ptr, pc + skip)) goto Pass; } break; default: // FIXME break; } } else if (qs != null || (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor // Anchor // String // True bool reverse = ((OpFlags)program[pc + 3] & OpFlags.RightToLeft) != 0; string substring = GetString (pc + 3); if (qs == null) { bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0; qs = new QuickSearch (substring, ignore, reverse); } while ((anch_reverse && anch_ptr >= anch_begin) || (!anch_reverse && anch_ptr <= anch_end)) { if (reverse) { anch_ptr = qs.Search (text, anch_ptr, anch_begin); if (anch_ptr != -1) anch_ptr += substring.Length ; } else anch_ptr = qs.Search (text, anch_ptr, anch_end); if (anch_ptr < 0) break; ptr = reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset; if (TryMatch (ref ptr, pc + skip)) goto Pass; if (reverse) anch_ptr -= 2; else ++ anch_ptr; } } else if (anch_op == OpCode.True) { // no anchor // Anchor // True while ((anch_reverse && anch_ptr >= anch_begin) || (!anch_reverse && anch_ptr <= anch_end)) { ptr = anch_ptr; if (TryMatch (ref ptr, pc + skip)) goto Pass; if (anch_reverse) -- anch_ptr; else ++ anch_ptr; } } else { // general case // Anchor // // True while ((anch_reverse && anch_ptr >= anch_begin) || (!anch_reverse && anch_ptr <= anch_end)) { ptr = anch_ptr; if (Eval (Mode.Match, ref ptr, pc + 3)) { // anchor expression passed: try real expression at the correct offset ptr = anch_reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset; if (TryMatch (ref ptr, pc + skip)) goto Pass; } if (anch_reverse) -- anch_ptr; else ++ anch_ptr; } } goto Fail; } case OpCode.False: { goto Fail; } case OpCode.True: { goto Pass; } case OpCode.Position: { if (!IsPosition ((Position)program[pc + 1], ptr)) goto Fail; pc += 2; break; } case OpCode.String: { bool reverse = (flags & OpFlags.RightToLeft) != 0; bool ignore = (flags & OpFlags.IgnoreCase) != 0; int len = program[pc + 1]; if (reverse) { ptr -= len; if (ptr < 0) goto Fail; } else if (ptr + len > text_end) goto Fail; pc += 2; for (int i = 0; i < len; ++ i) { char c = text[ptr + i]; if (ignore) c = Char.ToLower (c); if (c != (char)program[pc ++]) goto Fail; } if (!reverse) ptr += len; break; } case OpCode.Reference: { bool reverse = (flags & OpFlags.RightToLeft) != 0; bool ignore = (flags & OpFlags.IgnoreCase) != 0; int m = GetLastDefined (program [pc + 1]); if (m < 0) goto Fail; int str = marks [m].Index; int len = marks [m].Length; if (reverse) { ptr -= len; if (ptr < 0) goto Fail; } else if (ptr + len > text_end) goto Fail; pc += 2; for (int i = 0; i < len; ++ i) { if (ignore) { if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i])) goto Fail; } else { if (text[ptr + i] != text[str + i]) goto Fail; } } if (!reverse) ptr += len; break; } case OpCode.Character: case OpCode.Category: case OpCode.NotCategory: case OpCode.Range: case OpCode.Set: { if (!EvalChar (mode, ref ptr, ref pc, false)) goto Fail; break; } case OpCode.In: { int target = pc + program[pc + 1]; pc += 2; if (!EvalChar (mode, ref ptr, ref pc, true)) goto Fail; pc = target; break; } case OpCode.Open: { Open (program[pc + 1], ptr); pc += 2; break; } case OpCode.Close: { Close (program[pc + 1], ptr); pc += 2; break; } case OpCode.BalanceStart: { int start = ptr; //point before the balancing group if (!Eval (Mode.Match, ref ptr, pc + 5)) goto Fail; if(!Balance (program[pc + 1], program[pc + 2], (program[pc + 3] == 1 ? true : false) , start)) { goto Fail; } pc += program[pc + 4]; break; } case OpCode.Balance: { goto Pass; } case OpCode.IfDefined: { int m = GetLastDefined (program [pc + 2]); if (m < 0) pc += program[pc + 1]; else pc += 3; break; } case OpCode.Sub: { if (!Eval (Mode.Match, ref ptr, pc + 2)) goto Fail; pc += program[pc + 1]; break; } case OpCode.Test: { int cp = Checkpoint (); int test_ptr = ptr; if (Eval (Mode.Match, ref test_ptr, pc + 3)) pc += program[pc + 1]; else { Backtrack (cp); pc += program[pc + 2]; } break; } case OpCode.Branch: { OpCode branch_op; do { int cp = Checkpoint (); if (Eval (Mode.Match, ref ptr, pc + 2)) goto Pass; Backtrack (cp); pc += program[pc + 1]; branch_op = (OpCode)(program[pc] & 0xff); } while (branch_op != OpCode.False); goto Fail; } case OpCode.Jump: { pc += program[pc + 1]; break; } case OpCode.Repeat: { this.repeat = new RepeatContext ( this.repeat, // previous context ReadProgramCount (pc + 2), // minimum ReadProgramCount (pc + 4), // maximum (flags & OpFlags.Lazy) != 0, // lazy pc + 6 // subexpression ); if (Eval (Mode.Match, ref ptr, pc + program[pc + 1])) goto Pass; else { this.repeat = this.repeat.Previous; goto Fail; } } case OpCode.Until: { RepeatContext current = this.repeat; // // Can we avoid recursion? // // Backtracking can be forced in nested quantifiers from the tail of this quantifier. // Thus, we cannot, in general, use a simple loop on repeat.Expression to handle // quantifiers. // // If 'deep' was unmolested, that implies that there was no nested quantifiers. // Thus, we can safely avoid recursion. // if (deep == current) goto Pass; int start = current.Start; int start_count = current.Count; while (!current.IsMinimum) { ++ current.Count; current.Start = ptr; deep = current; if (!Eval (Mode.Match, ref ptr, current.Expression)) { current.Start = start; current.Count = start_count; goto Fail; } if (deep != current) // recursive mode goto Pass; } if (ptr == current.Start) { // degenerate match ... match tail or fail this.repeat = current.Previous; deep = null; if (Eval (Mode.Match, ref ptr, pc + 1)) goto Pass; this.repeat = current; goto Fail; } if (current.IsLazy) { for (;;) { // match tail first ... this.repeat = current.Previous; deep = null; int cp = Checkpoint (); if (Eval (Mode.Match, ref ptr, pc + 1)) goto Pass; Backtrack (cp); // ... then match more this.repeat = current; if (current.IsMaximum) goto Fail; ++ current.Count; current.Start = ptr; deep = current; if (!Eval (Mode.Match, ref ptr, current.Expression)) { current.Start = start; current.Count = start_count; goto Fail; } if (deep != current) // recursive mode goto Pass; // Degenerate match: ptr has not moved since the last (failed) tail match. // So, next and subsequent tail matches will fail. if (ptr == current.Start) goto Fail; } } else { int stack_size = stack.Count; // match greedily as much as possible while (!current.IsMaximum) { int cp = Checkpoint (); int old_ptr = ptr; int old_start = current.Start; ++ current.Count; current.Start = ptr; deep = current; if (!Eval (Mode.Match, ref ptr, current.Expression)) { -- current.Count; current.Start = old_start; Backtrack (cp); break; } if (deep != current) { // recursive mode: no more backtracking, truncate the stack stack.Count = stack_size; goto Pass; } stack.Push (cp); stack.Push (old_ptr); // Degenerate match: no point going on if (ptr == current.Start) break; } // then, match the tail, backtracking as necessary. this.repeat = current.Previous; for (;;) { deep = null; if (Eval (Mode.Match, ref ptr, pc + 1)) { stack.Count = stack_size; goto Pass; } if (stack.Count == stack_size) { this.repeat = current; goto Fail; } --current.Count; ptr = stack.Pop (); Backtrack (stack.Pop ()); } } } case OpCode.FastRepeat: { this.fast = new RepeatContext ( fast, ReadProgramCount (pc + 2), // minimum ReadProgramCount (pc + 4), // maximum (flags & OpFlags.Lazy) != 0, // lazy pc + 6 // subexpression ); fast.Start = ptr; int cp = Checkpoint (); pc += program[pc + 1]; // tail expression ushort tail_word = program[pc]; int c1 = -1; // first character of tail operator int c2 = -1; // ... and the same character, in upper case if ignoring case int coff = 0; // 0 or -1 depending on direction OpCode tail_op = (OpCode)(tail_word & 0xff); if (tail_op == OpCode.Character || tail_op == OpCode.String) { OpFlags tail_flags = (OpFlags)(tail_word & 0xff00); if ((tail_flags & OpFlags.Negate) != 0) goto skip; if (tail_op == OpCode.String) { int offset = 0; if ((tail_flags & OpFlags.RightToLeft) != 0) { offset = program[pc + 1] - 1 ; } c1 = program[pc + 2 + offset]; // first char of string } else c1 = program[pc + 1]; // character if ((tail_flags & OpFlags.IgnoreCase) != 0) c2 = Char.ToUpper ((char)c1); // ignore case else c2 = c1; if ((tail_flags & OpFlags.RightToLeft) != 0) coff = -1; // reverse else coff = 0; } skip: if (fast.IsLazy) { if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) { //Console.WriteLine ("lazy fast: failed mininum."); fast = fast.Previous; goto Fail; } while (true) { int p = ptr + coff; if (c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) { deep = null; if (Eval (Mode.Match, ref ptr, pc)) break; } if (fast.IsMaximum) { //Console.WriteLine ("lazy fast: failed with maximum."); fast = fast.Previous; goto Fail; } Backtrack (cp); if (!Eval (Mode.Count, ref ptr, fast.Expression)) { //Console.WriteLine ("lazy fast: no more."); fast = fast.Previous; goto Fail; } } fast = fast.Previous; goto Pass; } else { if (!Eval (Mode.Count, ref ptr, fast.Expression)) { fast = fast.Previous; goto Fail; } int width; if (fast.Count > 0) width = (ptr - fast.Start) / fast.Count; else width = 0; while (true) { int p = ptr + coff; if (c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) { deep = null; if (Eval (Mode.Match, ref ptr, pc)) break; } -- fast.Count; if (!fast.IsMinimum) { fast = fast.Previous; goto Fail; } ptr -= width; Backtrack (cp); } fast = fast.Previous; goto Pass; } } case OpCode.Info: { Debug.Assert (false, "Regex", "Info block found in pattern"); goto Fail; } } } Pass: ref_ptr = ptr; switch (mode) { case Mode.Match: return true; case Mode.Count: { ++ fast.Count; if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum)) return true; pc = fast.Expression; goto Begin; } } Fail: switch (mode) { case Mode.Match: return false; case Mode.Count: { if (!fast.IsLazy && fast.IsMinimum) return true; ref_ptr = fast.Start; return false; } } return false; } private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) { bool consumed = false; char c = '\0'; bool negate; bool ignore; do { ushort word = program[pc]; OpCode op = (OpCode)(word & 0x00ff); OpFlags flags = (OpFlags)(word & 0xff00); ++ pc; ignore = (flags & OpFlags.IgnoreCase) != 0; // consume character: the direction of an In construct is // determined by the direction of its first op if (!consumed) { if ((flags & OpFlags.RightToLeft) != 0) { if (ptr <= 0) return false; c = text[-- ptr]; } else { if (ptr >= text_end) return false; c = text[ptr ++]; } if (ignore) c = Char.ToLower (c); consumed = true; } // negate flag negate = (flags & OpFlags.Negate) != 0; // execute op switch (op) { case OpCode.True: return true; case OpCode.False: return false; case OpCode.Character: { if (c == (char)program[pc ++]) return !negate; break; } case OpCode.Category: { if (CategoryUtils.IsCategory ((Category)program[pc ++], c)) return !negate; break; } case OpCode.NotCategory: { if (!CategoryUtils.IsCategory ((Category)program[pc ++], c)) return !negate; break; } case OpCode.Range: { int lo = (char)program[pc ++]; int hi = (char)program[pc ++]; if (lo <= c && c <= hi) return !negate; break; } case OpCode.Set: { int lo = (char)program[pc ++]; int len = (char)program[pc ++]; int bits = pc; pc += len; int i = (int)c - lo; if (i < 0 || i >= len << 4) break; if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0) return !negate; break; } } } while (multi); return negate; } private bool TryMatch (ref int ref_ptr, int pc) { Reset (); int ptr = ref_ptr; marks [groups [0]].Start = ptr; if (Eval (Mode.Match, ref ptr, pc)) { marks [groups [0]].End = ptr; ref_ptr = ptr; return true; } return false; } private bool IsPosition (Position pos, int ptr) { switch (pos) { case Position.Start: case Position.StartOfString: return ptr == 0; case Position.StartOfLine: return ptr == 0 || text[ptr - 1] == '\n'; case Position.StartOfScan: return ptr == scan_ptr; case Position.End: return ptr == text_end || (ptr == text_end - 1 && text[ptr] == '\n'); case Position.EndOfLine: return ptr == text_end || text[ptr] == '\n'; case Position.EndOfString: return ptr == text_end; case Position.Boundary: if (text_end == 0) return false; if (ptr == 0) return IsWordChar (text[ptr]); else if (ptr == text_end) return IsWordChar (text[ptr - 1]); else return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]); case Position.NonBoundary: if (text_end == 0) return false; if (ptr == 0) return !IsWordChar (text[ptr]); else if (ptr == text_end) return !IsWordChar (text[ptr - 1]); else return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]); default: return false; } } private bool IsWordChar (char c) { return CategoryUtils.IsCategory (Category.Word, c); } private string GetString (int pc) { int len = program[pc + 1]; int str = pc + 2; char[] cs = new char[len]; for (int i = 0; i < len; ++ i) cs[i] = (char)program[str ++]; return new string (cs); } // capture management private void Open (int gid, int ptr) { int m = groups [gid]; if (m < mark_start || marks [m].IsDefined) { m = CreateMark (m); groups [gid] = m; } marks [m].Start = ptr; } private void Close (int gid, int ptr) { marks [groups [gid]].End = ptr; } private bool Balance (int gid, int balance_gid, bool capture, int ptr) { int b = groups [balance_gid]; if(b == -1 || marks[b].Index < 0) { //Group not previously matched return false; } Debug.Assert (marks [b].IsDefined, "Regex", "Balancng group not closed"); if (gid > 0 && capture){ Open (gid, marks [b].Index + marks [b].Length); Close (gid, ptr); } groups [balance_gid] = marks[b].Previous; return true; } private int Checkpoint () { mark_start = mark_end; return mark_start; } private void Backtrack (int cp) { Debug.Assert (cp > mark_start, "Regex", "Attempt to backtrack forwards"); for (int i = 0; i < groups.Length; ++ i) { int m = groups [i]; while (cp <= m) m = marks [m].Previous; groups [i] = m; } } private void ResetGroups () { int n = groups.Length; if (marks == null) marks = new Mark [n * 10]; for (int i = 0; i < n; ++ i) { groups [i] = i; marks [i].Start = -1; marks [i].End = -1; marks [i].Previous = -1; } mark_start = 0; mark_end = n; } private int GetLastDefined (int gid) { int m = groups [gid]; while (m >= 0 && !marks [m].IsDefined) m = marks [m].Previous; return m; } private int CreateMark (int previous) { if (mark_end == marks.Length) { Mark [] dest = new Mark [marks.Length * 2]; marks.CopyTo (dest, 0); marks = dest; } int m = mark_end ++; marks [m].Start = marks [m].End = -1; marks [m].Previous = previous; return m; } private void GetGroupInfo (int gid, out int first_mark_index, out int n_caps) { first_mark_index = -1; n_caps = 0; for (int m = groups [gid]; m >= 0; m = marks [m].Previous) { if (!marks [m].IsDefined) continue; if (first_mark_index < 0) first_mark_index = m; ++n_caps; } } private void PopulateGroup (Group g, int first_mark_index, int n_caps) { int i = 1; for (int m = marks [first_mark_index].Previous; m >= 0; m = marks [m].Previous) { if (!marks [m].IsDefined) continue; Capture cap = new Capture (text, marks [m].Index, marks [m].Length); g.Captures.SetValue (cap, n_caps - 1 - i); ++i; } } private Match GenerateMatch (Regex regex) { int n_caps, first_mark_index; Group g; GetGroupInfo (0, out first_mark_index, out n_caps); Match retval = new Match (regex, this, text, text_end, groups.Length, marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps); PopulateGroup (retval, first_mark_index, n_caps); for (int gid = 1; gid < groups.Length; ++ gid) { GetGroupInfo (gid, out first_mark_index, out n_caps); if (first_mark_index < 0) { g = Group.Fail; } else { g = new Group (text, marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps); PopulateGroup (g, first_mark_index, n_caps); } retval.Groups.SetValue (g, gid); } return retval; } // interpreter attributes private ushort[] program; // regex program private int program_start; // first instruction after info block private string text; // input text private int text_end; // end of input text (last character + 1) private int group_count; // number of capturing groups private int match_min;//, match_max; // match width information private QuickSearch qs; // fast substring matcher // match state private int scan_ptr; // start of scan private RepeatContext repeat; // current repeat context private RepeatContext fast; // fast repeat context // Repeat/Until handling private IntStack stack = new IntStack (); // utility stack private RepeatContext deep; // points to the most-nested repeat context private Mark[] marks = null; // mark stack private int mark_start; // start of current checkpoint private int mark_end; // end of checkpoint/next free mark private int[] groups; // current group definitions // private classes private struct IntStack { int [] values; int count; public int Pop () { return values [--count]; } public void Push (int value) { if (values == null) { values = new int [8]; } else if (count == values.Length) { int new_size = values.Length; new_size += new_size >> 1; int [] new_values = new int [new_size]; for (int i = 0; i < count; ++i) new_values [i] = values [i]; values = new_values; } values [count++] = value; } public int Top { get { return values [count - 1]; } } public int Count { get { return count; } set { if (value > count) throw new SystemException ("can only truncate the stack"); count = value; } } } private class RepeatContext { public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) { this.previous = previous; this.min = min; this.max = max; this.lazy = lazy; this.expr_pc = expr_pc; this.start = -1; this.count = 0; } public int Count { get { return count; } set { count = value; } } public int Start { get { return start; } set { start = value; } } public bool IsMinimum { get { return min <= count; } } public bool IsMaximum { get { return max <= count; } } public bool IsLazy { get { return lazy; } } public int Expression { get { return expr_pc; } } public RepeatContext Previous { get { return previous; } } private int start; private int min, max; private bool lazy; private int expr_pc; private RepeatContext previous; private int count; } private enum Mode { Search, Match, Count } } }