3 // namespace: System.Text.RegularExpressions
\r
4 // file: interpreter.cs
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Collections;
\r
11 using System.Diagnostics;
\r
12 using System.Globalization;
\r
14 namespace System.Text.RegularExpressions {
\r
16 class Interpreter : IMachine {
\r
17 public Interpreter (ushort[] program) {
\r
18 this.program = program;
\r
21 // process info block
\r
23 Debug.Assert ((OpCode)program[0] == OpCode.Info, "Regex", "Cant' find info block");
\r
25 this.group_count = program[1] + 1;
\r
26 this.match_min = program[2];
\r
27 this.match_max = program[3];
\r
31 this.program_start = 4;
\r
32 this.groups = new int [group_count];
\r
35 // IMachine implementation
\r
37 public Match Scan (Regex regex, string text, int start, int end) {
\r
39 this.text_end = end;
\r
40 this.scan_ptr = start;
\r
42 if (Eval (Mode.Match, ref scan_ptr, program_start))
\r
43 return GenerateMatch (regex);
\r
50 private void Reset () {
\r
52 fast = repeat = null;
\r
55 private bool Eval (Mode mode, ref int ref_ptr, int pc) {
\r
59 ushort word = program[pc];
\r
60 OpCode op = (OpCode)(word & 0x00ff);
\r
61 OpFlags flags = (OpFlags)(word & 0xff00);
\r
64 case OpCode.Anchor: {
\r
65 int skip = program[pc + 1];
\r
67 int anch_offset = program[pc + 2];
\r
68 int anch_ptr = ptr + anch_offset;
\r
69 int anch_end = text_end - match_min + anch_offset; // maximum anchor position
\r
71 // the general case for an anchoring expression is at the bottom, however we
\r
72 // do some checks for the common cases before to save processing time. the current
\r
73 // optimizer only outputs three types of anchoring expressions: fixed position,
\r
74 // fixed substring, and no anchor.
\r
76 OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff);
\r
77 if (anch_op == OpCode.Position && skip == 6) { // position anchor
\r
82 switch ((Position)program[pc + 4]) {
\r
83 case Position.StartOfString:
\r
84 if (anch_ptr == 0) {
\r
86 if (TryMatch (ref ptr, pc + skip))
\r
91 case Position.StartOfLine:
\r
92 if (anch_ptr == 0) {
\r
94 if (TryMatch (ref ptr, pc + skip))
\r
100 while (anch_ptr <= anch_end) {
\r
101 if (text[anch_ptr - 1] == '\n') {
\r
102 ptr = anch_ptr - anch_offset;
\r
103 if (TryMatch (ref ptr, pc + skip))
\r
111 case Position.StartOfScan:
\r
112 if (anch_ptr == scan_ptr) {
\r
113 ptr = scan_ptr - anch_offset;
\r
114 if (TryMatch (ref ptr, pc + skip))
\r
124 else if (qs != null ||
\r
125 (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor
\r
131 bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0;
\r
132 string substring = GetString (pc + 3);
\r
134 qs = new QuickSearch (substring, ignore);
\r
137 while (anch_ptr <= anch_end) {
\r
138 anch_ptr = qs.Search (text, anch_ptr, anch_end);
\r
142 ptr = anch_ptr - anch_offset;
\r
143 if (TryMatch (ref ptr, pc + skip))
\r
149 else if (anch_op == OpCode.True) { // no anchor
\r
153 while (anch_ptr <= anch_end) {
\r
155 if (TryMatch (ref ptr, pc + skip))
\r
161 else { // general case
\r
166 while (anch_ptr <= anch_end) {
\r
168 if (Eval (Mode.Match, ref ptr, pc + 3)) {
\r
169 // anchor expression passed: try real expression at the correct offset
\r
171 ptr = anch_ptr - anch_offset;
\r
172 if (TryMatch (ref ptr, pc + skip))
\r
183 case OpCode.False: {
\r
187 case OpCode.True: {
\r
191 case OpCode.Position: {
\r
192 if (!IsPosition ((Position)program[pc + 1], ptr))
\r
198 case OpCode.String: {
\r
199 bool reverse = (flags & OpFlags.RightToLeft) != 0;
\r
200 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
201 int len = program[pc + 1];
\r
208 else if (ptr + len > text_end)
\r
212 for (int i = 0; i < len; ++ i) {
\r
213 char c = text[ptr + i];
\r
215 c = Char.ToLower (c);
\r
217 if (c != (char)program[pc ++])
\r
226 case OpCode.Reference: {
\r
227 bool reverse = (flags & OpFlags.RightToLeft) != 0;
\r
228 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
229 int m = GetLastDefined (program [pc + 1]);
\r
233 int str = marks [m].Index;
\r
234 int len = marks [m].Length;
\r
241 else if (ptr + len > text_end)
\r
245 for (int i = 0; i < len; ++ i) {
\r
247 if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i]))
\r
251 if (text[ptr + i] != text[str + i])
\r
261 case OpCode.Character: case OpCode.Category:
\r
262 case OpCode.Range: case OpCode.Set: {
\r
263 if (!EvalChar (mode, ref ptr, ref pc, false))
\r
269 int target = pc + program[pc + 1];
\r
271 if (!EvalChar (mode, ref ptr, ref pc, true))
\r
278 case OpCode.Open: {
\r
279 Open (program[pc + 1], ptr);
\r
284 case OpCode.Close: {
\r
285 Close (program[pc + 1], ptr);
\r
290 case OpCode.Balance: {
\r
291 Balance (program[pc + 1], program[pc + 2], ptr);
\r
295 case OpCode.IfDefined: {
\r
296 int m = GetLastDefined (program [pc + 2]);
\r
298 pc += program[pc + 1];
\r
305 if (!Eval (Mode.Match, ref ptr, pc + 2))
\r
308 pc += program[pc + 1];
\r
312 case OpCode.Test: {
\r
313 int cp = Checkpoint ();
\r
314 int test_ptr = ptr;
\r
315 if (Eval (Mode.Match, ref test_ptr, pc + 3))
\r
316 pc += program[pc + 1];
\r
319 pc += program[pc + 2];
\r
324 case OpCode.Branch: {
\r
327 int cp = Checkpoint ();
\r
328 if (Eval (Mode.Match, ref ptr, pc + 2))
\r
333 pc += program[pc + 1];
\r
334 branch_op = (OpCode)(program[pc] & 0xff);
\r
335 } while (branch_op != OpCode.False);
\r
340 case OpCode.Jump: {
\r
341 pc += program[pc + 1];
\r
345 case OpCode.Repeat: {
\r
346 this.repeat = new RepeatContext (
\r
347 this.repeat, // previous context
\r
348 program[pc + 2], // minimum
\r
349 program[pc + 3], // maximum
\r
350 (flags & OpFlags.Lazy) != 0, // lazy
\r
351 pc + 4 // subexpression
\r
354 if (Eval (Mode.Match, ref ptr, pc + program[pc + 1]))
\r
357 this.repeat = this.repeat.Previous;
\r
362 case OpCode.Until: {
\r
363 RepeatContext current = this.repeat;
\r
364 int start = current.Start;
\r
366 if (!current.IsMinimum) {
\r
368 current.Start = ptr;
\r
369 if (Eval (Mode.Match, ref ptr, repeat.Expression))
\r
372 current.Start = start;
\r
377 if (ptr == current.Start) {
\r
378 // degenerate match ... match tail or fail
\r
380 this.repeat = current.Previous;
\r
381 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
384 this.repeat = current;
\r
388 if (current.IsLazy) {
\r
389 // match tail first ...
\r
391 this.repeat = current.Previous;
\r
392 int cp = Checkpoint ();
\r
393 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
398 // ... then match more
\r
400 this.repeat = current;
\r
401 if (!current.IsMaximum) {
\r
403 current.Start = ptr;
\r
404 if (Eval (Mode.Match, ref ptr, current.Expression))
\r
407 current.Start = start;
\r
415 // match more first ...
\r
417 if (!current.IsMaximum) {
\r
418 int cp = Checkpoint ();
\r
420 current.Start = ptr;
\r
421 if (Eval (Mode.Match, ref ptr, current.Expression))
\r
424 current.Start = start;
\r
429 // ... then match tail
\r
431 this.repeat = current.Previous;
\r
432 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
435 this.repeat = current;
\r
440 case OpCode.FastRepeat: {
\r
441 this.fast = new RepeatContext (
\r
443 program[pc + 2], // minimum
\r
444 program[pc + 3], // maximum
\r
445 (flags & OpFlags.Lazy) != 0, // lazy
\r
446 pc + 4 // subexpression
\r
450 int cp = Checkpoint ();
\r
452 pc += program[pc + 1]; // tail expression
\r
453 ushort tail_word = program[pc];
\r
455 int c1, c2; // first character of tail operator
\r
456 int coff; // 0 or -1 depending on direction
\r
458 OpCode tail_op = (OpCode)(tail_word & 0xff);
\r
459 if (tail_op == OpCode.Character || tail_op == OpCode.String) {
\r
460 OpFlags tail_flags = (OpFlags)(tail_word & 0xff00);
\r
462 if (tail_op == OpCode.String)
\r
463 c1 = program[pc + 2]; // first char of string
\r
465 c1 = program[pc + 1]; // character
\r
467 if ((tail_flags & OpFlags.IgnoreCase) != 0)
\r
468 c2 = Char.ToUpper ((char)c1); // ignore case
\r
472 if ((tail_flags & OpFlags.RightToLeft) != 0)
\r
473 coff = -1; // reverse
\r
483 if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
484 //Console.WriteLine ("lazy fast: failed mininum.");
\r
485 fast = fast.Previous;
\r
490 int p = ptr + coff;
\r
491 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
\r
492 Eval (Mode.Match, ref ptr, pc))
\r
495 if (fast.IsMaximum) {
\r
496 //Console.WriteLine ("lazy fast: failed with maximum.");
\r
497 fast = fast.Previous;
\r
502 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
503 //Console.WriteLine ("lazy fast: no more.");
\r
504 fast = fast.Previous;
\r
508 fast = fast.Previous;
\r
512 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
513 fast = fast.Previous;
\r
518 if (fast.Count > 0)
\r
519 width = (ptr - fast.Start) / fast.Count;
\r
524 int p = ptr + coff;
\r
525 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
\r
526 Eval (Mode.Match, ref ptr, pc))
\r
530 if (!fast.IsMinimum) {
\r
531 fast = fast.Previous;
\r
538 fast = fast.Previous;
\r
543 case OpCode.Info: {
\r
544 Debug.Assert (false, "Regex", "Info block found in pattern");
\r
558 if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum))
\r
561 pc = fast.Expression;
\r
572 if (!fast.IsLazy && fast.IsMinimum)
\r
575 ref_ptr = fast.Start;
\r
583 private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) {
\r
584 bool consumed = false;
\r
589 ushort word = program[pc];
\r
590 OpCode op = (OpCode)(word & 0x00ff);
\r
591 OpFlags flags = (OpFlags)(word & 0xff00);
\r
595 ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
597 // consume character: the direction of an In construct is
\r
598 // determined by the direction of its first op
\r
601 if ((flags & OpFlags.RightToLeft) != 0) {
\r
608 if (ptr >= text_end)
\r
615 c = Char.ToLower (c);
\r
622 negate = (flags & OpFlags.Negate) != 0;
\r
633 case OpCode.Character: {
\r
634 if (c == (char)program[pc ++])
\r
639 case OpCode.Category: {
\r
640 if (CategoryUtils.IsCategory ((Category)program[pc ++], c))
\r
646 case OpCode.Range: {
\r
647 int lo = (char)program[pc ++];
\r
648 int hi = (char)program[pc ++];
\r
649 if (lo <= c && c <= hi)
\r
655 int lo = (char)program[pc ++];
\r
656 int len = (char)program[pc ++];
\r
660 int i = (int)c - lo;
\r
661 if (i < 0 || i >= len << 4)
\r
664 if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0)
\r
674 private bool TryMatch (ref int ref_ptr, int pc) {
\r
678 marks [groups [0]].Start = ptr;
\r
679 if (Eval (Mode.Match, ref ptr, pc)) {
\r
680 marks [groups [0]].End = ptr;
\r
688 private bool IsPosition (Position pos, int ptr) {
\r
690 case Position.Start: case Position.StartOfString:
\r
693 case Position.StartOfLine:
\r
694 return ptr == 0 || text[ptr - 1] == '\n';
\r
696 case Position.StartOfScan:
\r
697 return ptr == scan_ptr;
\r
700 return ptr == text_end ||
\r
701 (ptr == text_end - 1 && text[ptr] == '\n');
\r
703 case Position.EndOfLine:
\r
704 return ptr == text_end || text[ptr] == '\n';
\r
706 case Position.EndOfString:
\r
707 return ptr == text_end;
\r
709 case Position.Boundary:
\r
714 return IsWordChar (text[ptr]);
\r
715 else if (ptr == text_end)
\r
716 return IsWordChar (text[ptr - 1]);
\r
718 return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]);
\r
720 case Position.NonBoundary:
\r
725 return !IsWordChar (text[ptr]);
\r
726 else if (ptr == text_end)
\r
727 return !IsWordChar (text[ptr - 1]);
\r
729 return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]);
\r
736 private bool IsWordChar (char c) {
\r
737 return CategoryUtils.IsCategory (Category.Word, c);
\r
740 private string GetString (int pc) {
\r
741 int len = program[pc + 1];
\r
744 char[] cs = new char[len];
\r
745 for (int i = 0; i < len; ++ i)
\r
746 cs[i] = (char)program[str ++];
\r
748 return new string (cs);
\r
751 // capture management
\r
753 private void Open (int gid, int ptr) {
\r
754 int m = groups [gid];
\r
755 if (m < mark_start || marks [m].IsDefined) {
\r
756 m = CreateMark (m);
\r
760 marks [m].Start = ptr;
\r
763 private void Close (int gid, int ptr) {
\r
764 marks [groups [gid]].End = ptr;
\r
767 private void Balance (int gid, int balance_gid, int ptr) {
\r
768 int b = groups [balance_gid];
\r
769 Debug.Assert (marks [b].IsDefined, "Regex", "Balancing group not closed");
\r
772 Open (gid, marks [b].Index + marks [b].Length);
\r
776 groups [balance_gid] = marks [b].Previous;
\r
779 private int Checkpoint () {
\r
780 mark_start = mark_end;
\r
784 private void Backtrack (int cp) {
\r
785 Debug.Assert (cp > mark_start, "Regex", "Attempt to backtrack forwards");
\r
787 for (int i = 0; i < groups.Length; ++ i) {
\r
788 int m = groups [i];
\r
790 m = marks [m].Previous;
\r
796 private void ResetGroups () {
\r
797 int n = groups.Length;
\r
799 marks = new Mark [n * 10];
\r
801 for (int i = 0; i < n; ++ i) {
\r
804 marks [i].Start = -1;
\r
805 marks [i].End = -1;
\r
806 marks [i].Previous = -1;
\r
813 private int GetLastDefined (int gid) {
\r
814 int m = groups [gid];
\r
815 while (m >= 0 && !marks [m].IsDefined)
\r
816 m = marks [m].Previous;
\r
821 private int CreateMark (int previous) {
\r
822 if (mark_end == marks.Length) {
\r
823 Mark [] dest = new Mark [marks.Length * 2];
\r
824 marks.CopyTo (dest, 0);
\r
828 int m = mark_end ++;
\r
829 marks [m].Start = marks [m].End = -1;
\r
830 marks [m].Previous = previous;
\r
835 private Match GenerateMatch (Regex regex) {
\r
836 int[][] grps = new int[groups.Length][];
\r
837 ArrayList caps = new ArrayList ();
\r
839 for (int gid = 0; gid < groups.Length; ++ gid) {
\r
841 for (int m = groups[gid]; m >= 0; m = marks[m].Previous) {
\r
842 if (!marks[m].IsDefined)
\r
845 caps.Add (marks[m].Index);
\r
846 caps.Add (marks[m].Length);
\r
849 grps[gid] = (int[])caps.ToArray (typeof (int));
\r
852 return new Match (regex, this, text, text_end, grps);
\r
855 // interpreter attributes
\r
857 private ushort[] program; // regex program
\r
858 private int program_start; // first instruction after info block
\r
859 private string text; // input text
\r
860 private int text_end; // end of input text (last character + 1)
\r
861 private int group_count; // number of capturing groups
\r
862 private int match_min, match_max; // match width information
\r
863 private QuickSearch qs; // fast substring matcher
\r
867 private int scan_ptr; // start of scan
\r
869 private RepeatContext repeat; // current repeat context
\r
870 private RepeatContext fast; // fast repeat context
\r
872 private Mark[] marks = null; // mark stack
\r
873 private int mark_start; // start of current checkpoint
\r
874 private int mark_end; // end of checkpoint/next free mark
\r
876 private int[] groups; // current group definitions
\r
880 private struct Mark {
\r
881 public int Start, End;
\r
882 public int Previous;
\r
884 public bool IsDefined {
\r
885 get { return Start >= 0 && End >= 0; }
\r
889 get { return Start < End ? Start : End; }
\r
892 public int Length {
\r
893 get { return Start < End ? End - Start : Start - End; }
\r
897 private class RepeatContext {
\r
898 public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) {
\r
899 this.previous = previous;
\r
903 this.expr_pc = expr_pc;
\r
910 get { return count; }
\r
911 set { count = value; }
\r
915 get { return start; }
\r
916 set { start = value; }
\r
919 public bool IsMinimum {
\r
920 get { return min <= count; }
\r
923 public bool IsMaximum {
\r
924 get { return max <= count; }
\r
927 public bool IsLazy {
\r
928 get { return lazy; }
\r
931 public int Expression {
\r
932 get { return expr_pc; }
\r
935 public RepeatContext Previous {
\r
936 get { return previous; }
\r
940 private int min, max;
\r
942 private int expr_pc;
\r
943 private RepeatContext previous;
\r
948 private enum Mode {
\r