3 // namespace: System.Text.RegularExpressions
\r
4 // file: interpreter.cs
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Collections;
\r
11 using System.Diagnostics;
\r
12 using System.Globalization;
\r
14 namespace System.Text.RegularExpressions {
\r
16 class Interpreter : IMachine {
\r
17 public Interpreter (ushort[] program) {
\r
18 this.program = program;
\r
21 // process info block
\r
23 Debug.Assert ((OpCode)program[0] == OpCode.Info, "Regex", "Cant' find info block");
\r
25 this.group_count = program[1] + 1;
\r
26 this.match_min = program[2];
\r
27 this.match_max = program[3];
\r
31 this.program_start = 4;
\r
32 this.groups = new int [group_count];
\r
35 // IMachine implementation
\r
37 public Match Scan (Regex regex, string text, int start, int end) {
\r
39 this.text_end = end;
\r
40 this.scan_ptr = start;
\r
42 if (Eval (Mode.Match, ref scan_ptr, program_start))
\r
43 return GenerateMatch (regex);
\r
50 private void Reset () {
\r
52 fast = repeat = null;
\r
55 private bool Eval (Mode mode, ref int ref_ptr, int pc) {
\r
59 ushort word = program[pc];
\r
60 OpCode op = (OpCode)(word & 0x00ff);
\r
61 OpFlags flags = (OpFlags)(word & 0xff00);
\r
64 case OpCode.Anchor: {
\r
65 int skip = program[pc + 1];
\r
67 int anch_offset = program[pc + 2];
\r
68 bool anch_reverse = (flags & OpFlags.RightToLeft) != 0;
\r
69 int anch_ptr = anch_reverse ? ptr - anch_offset : ptr + anch_offset;
\r
70 int anch_end = text_end - match_min + anch_offset; // maximum anchor position
\r
76 // the general case for an anchoring expression is at the bottom, however we
\r
77 // do some checks for the common cases before to save processing time. the current
\r
78 // optimizer only outputs three types of anchoring expressions: fixed position,
\r
79 // fixed substring, and no anchor.
\r
81 OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff);
\r
82 if (anch_op == OpCode.Position && skip == 6) { // position anchor
\r
87 switch ((Position)program[pc + 4]) {
\r
88 case Position.StartOfString:
\r
89 if (anch_reverse || anch_offset == 0) {
\r
91 if (TryMatch (ref ptr, pc + skip))
\r
96 case Position.StartOfLine:
\r
98 if (anch_ptr == 0) {
\r
100 if (TryMatch (ref ptr, pc + skip))
\r
106 while ((anch_reverse && anch_ptr >= 0) || (!anch_reverse && anch_ptr <= anch_end)) {
\r
107 if (anch_ptr == 0 || text[anch_ptr - 1] == '\n') {
\r
109 ptr = anch_ptr == anch_end ? anch_ptr : anch_ptr + anch_offset;
\r
111 ptr = anch_ptr == 0 ? anch_ptr : anch_ptr - anch_offset;
\r
112 if (TryMatch (ref ptr, pc + skip))
\r
123 case Position.StartOfScan:
\r
124 if (anch_ptr == scan_ptr) {
\r
125 ptr = anch_reverse ? scan_ptr + anch_offset : scan_ptr - anch_offset;
\r
126 if (TryMatch (ref ptr, pc + skip))
\r
136 else if (qs != null ||
\r
137 (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor
\r
142 bool reverse = ((OpFlags)program[pc + 3] & OpFlags.RightToLeft) != 0;
\r
143 string substring = GetString (pc + 3);
\r
146 bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0;
\r
148 qs = new QuickSearch (substring, ignore, reverse);
\r
150 while ((anch_reverse && anch_ptr >= anch_begin)
\r
151 || (!anch_reverse && anch_ptr <= anch_end)) {
\r
155 anch_ptr = qs.Search (text, anch_ptr, anch_begin);
\r
156 if (anch_ptr != -1)
\r
157 anch_ptr += substring.Length ;
\r
161 anch_ptr = qs.Search (text, anch_ptr, anch_end);
\r
165 ptr = reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
\r
166 if (TryMatch (ref ptr, pc + skip))
\r
175 else if (anch_op == OpCode.True) { // no anchor
\r
180 while ((anch_reverse && anch_ptr >= anch_begin)
\r
181 || (!anch_reverse && anch_ptr <= anch_end)) {
\r
184 if (TryMatch (ref ptr, pc + skip))
\r
192 else { // general case
\r
197 while ((anch_reverse && anch_ptr >= anch_begin)
\r
198 || (!anch_reverse && anch_ptr <= anch_end)) {
\r
201 if (Eval (Mode.Match, ref ptr, pc + 3)) {
\r
202 // anchor expression passed: try real expression at the correct offset
\r
204 ptr = anch_reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
\r
205 if (TryMatch (ref ptr, pc + skip))
\r
219 case OpCode.False: {
\r
223 case OpCode.True: {
\r
227 case OpCode.Position: {
\r
228 if (!IsPosition ((Position)program[pc + 1], ptr))
\r
234 case OpCode.String: {
\r
235 bool reverse = (flags & OpFlags.RightToLeft) != 0;
\r
236 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
237 int len = program[pc + 1];
\r
245 if (ptr + len > text_end)
\r
249 for (int i = 0; i < len; ++ i) {
\r
250 char c = text[ptr + i];
\r
252 c = Char.ToLower (c);
\r
254 if (c != (char)program[pc ++])
\r
263 case OpCode.Reference: {
\r
264 bool reverse = (flags & OpFlags.RightToLeft) != 0;
\r
265 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
266 int m = GetLastDefined (program [pc + 1]);
\r
270 int str = marks [m].Index;
\r
271 int len = marks [m].Length;
\r
278 else if (ptr + len > text_end)
\r
282 for (int i = 0; i < len; ++ i) {
\r
284 if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i]))
\r
288 if (text[ptr + i] != text[str + i])
\r
298 case OpCode.Character: case OpCode.Category:
\r
299 case OpCode.Range: case OpCode.Set: {
\r
300 if (!EvalChar (mode, ref ptr, ref pc, false))
\r
306 int target = pc + program[pc + 1];
\r
308 if (!EvalChar (mode, ref ptr, ref pc, true))
\r
315 case OpCode.Open: {
\r
316 Open (program[pc + 1], ptr);
\r
321 case OpCode.Close: {
\r
322 Close (program[pc + 1], ptr);
\r
327 case OpCode.Balance: {
\r
328 Balance (program[pc + 1], program[pc + 2], ptr);
\r
332 case OpCode.IfDefined: {
\r
333 int m = GetLastDefined (program [pc + 2]);
\r
335 pc += program[pc + 1];
\r
342 if (!Eval (Mode.Match, ref ptr, pc + 2))
\r
345 pc += program[pc + 1];
\r
349 case OpCode.Test: {
\r
350 int cp = Checkpoint ();
\r
351 int test_ptr = ptr;
\r
352 if (Eval (Mode.Match, ref test_ptr, pc + 3))
\r
353 pc += program[pc + 1];
\r
356 pc += program[pc + 2];
\r
361 case OpCode.Branch: {
\r
364 int cp = Checkpoint ();
\r
365 if (Eval (Mode.Match, ref ptr, pc + 2))
\r
370 pc += program[pc + 1];
\r
371 branch_op = (OpCode)(program[pc] & 0xff);
\r
372 } while (branch_op != OpCode.False);
\r
377 case OpCode.Jump: {
\r
378 pc += program[pc + 1];
\r
382 case OpCode.Repeat: {
\r
383 this.repeat = new RepeatContext (
\r
384 this.repeat, // previous context
\r
385 program[pc + 2], // minimum
\r
386 program[pc + 3], // maximum
\r
387 (flags & OpFlags.Lazy) != 0, // lazy
\r
388 pc + 4 // subexpression
\r
391 if (Eval (Mode.Match, ref ptr, pc + program[pc + 1]))
\r
394 this.repeat = this.repeat.Previous;
\r
399 case OpCode.Until: {
\r
400 RepeatContext current = this.repeat;
\r
401 int start = current.Start;
\r
403 if (!current.IsMinimum) {
\r
405 current.Start = ptr;
\r
406 if (Eval (Mode.Match, ref ptr, repeat.Expression))
\r
409 current.Start = start;
\r
414 if (ptr == current.Start) {
\r
415 // degenerate match ... match tail or fail
\r
417 this.repeat = current.Previous;
\r
418 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
421 this.repeat = current;
\r
425 if (current.IsLazy) {
\r
426 // match tail first ...
\r
428 this.repeat = current.Previous;
\r
429 int cp = Checkpoint ();
\r
430 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
435 // ... then match more
\r
437 this.repeat = current;
\r
438 if (!current.IsMaximum) {
\r
440 current.Start = ptr;
\r
441 if (Eval (Mode.Match, ref ptr, current.Expression))
\r
444 current.Start = start;
\r
452 // match more first ...
\r
454 if (!current.IsMaximum) {
\r
455 int cp = Checkpoint ();
\r
457 current.Start = ptr;
\r
458 if (Eval (Mode.Match, ref ptr, current.Expression))
\r
461 current.Start = start;
\r
466 // ... then match tail
\r
468 this.repeat = current.Previous;
\r
469 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
472 this.repeat = current;
\r
477 case OpCode.FastRepeat: {
\r
478 this.fast = new RepeatContext (
\r
480 program[pc + 2], // minimum
\r
481 program[pc + 3], // maximum
\r
482 (flags & OpFlags.Lazy) != 0, // lazy
\r
483 pc + 4 // subexpression
\r
487 int cp = Checkpoint ();
\r
489 pc += program[pc + 1]; // tail expression
\r
490 ushort tail_word = program[pc];
\r
492 int c1, c2; // first character of tail operator
\r
493 int coff; // 0 or -1 depending on direction
\r
495 OpCode tail_op = (OpCode)(tail_word & 0xff);
\r
496 if (tail_op == OpCode.Character || tail_op == OpCode.String) {
\r
497 OpFlags tail_flags = (OpFlags)(tail_word & 0xff00);
\r
499 if (tail_op == OpCode.String)
\r
503 if ((tail_flags & OpFlags.RightToLeft) != 0)
\r
505 offset = program[pc + 1] - 1 ;
\r
508 c1 = program[pc + 2 + offset]; // first char of string
\r
511 c1 = program[pc + 1]; // character
\r
513 if ((tail_flags & OpFlags.IgnoreCase) != 0)
\r
514 c2 = Char.ToUpper ((char)c1); // ignore case
\r
518 if ((tail_flags & OpFlags.RightToLeft) != 0)
\r
519 coff = -1; // reverse
\r
529 if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
530 //Console.WriteLine ("lazy fast: failed mininum.");
\r
531 fast = fast.Previous;
\r
536 int p = ptr + coff;
\r
537 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
\r
538 Eval (Mode.Match, ref ptr, pc))
\r
541 if (fast.IsMaximum) {
\r
542 //Console.WriteLine ("lazy fast: failed with maximum.");
\r
543 fast = fast.Previous;
\r
548 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
549 //Console.WriteLine ("lazy fast: no more.");
\r
550 fast = fast.Previous;
\r
554 fast = fast.Previous;
\r
558 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
559 fast = fast.Previous;
\r
564 if (fast.Count > 0)
\r
565 width = (ptr - fast.Start) / fast.Count;
\r
570 int p = ptr + coff;
\r
571 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
\r
572 Eval (Mode.Match, ref ptr, pc))
\r
576 if (!fast.IsMinimum) {
\r
577 fast = fast.Previous;
\r
584 fast = fast.Previous;
\r
589 case OpCode.Info: {
\r
590 Debug.Assert (false, "Regex", "Info block found in pattern");
\r
604 if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum))
\r
607 pc = fast.Expression;
\r
618 if (!fast.IsLazy && fast.IsMinimum)
\r
621 ref_ptr = fast.Start;
\r
629 private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) {
\r
630 bool consumed = false;
\r
636 ushort word = program[pc];
\r
637 OpCode op = (OpCode)(word & 0x00ff);
\r
638 OpFlags flags = (OpFlags)(word & 0xff00);
\r
642 ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
644 // consume character: the direction of an In construct is
\r
645 // determined by the direction of its first op
\r
648 if ((flags & OpFlags.RightToLeft) != 0) {
\r
655 if (ptr >= text_end)
\r
662 c = Char.ToLower (c);
\r
669 negate = (flags & OpFlags.Negate) != 0;
\r
680 case OpCode.Character: {
\r
681 if (c == (char)program[pc ++])
\r
686 case OpCode.Category: {
\r
687 if (CategoryUtils.IsCategory ((Category)program[pc ++], c))
\r
693 case OpCode.Range: {
\r
694 int lo = (char)program[pc ++];
\r
695 int hi = (char)program[pc ++];
\r
696 if (lo <= c && c <= hi)
\r
702 int lo = (char)program[pc ++];
\r
703 int len = (char)program[pc ++];
\r
707 int i = (int)c - lo;
\r
708 if (i < 0 || i >= len << 4)
\r
712 if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0)
\r
722 private bool TryMatch (ref int ref_ptr, int pc) {
\r
726 marks [groups [0]].Start = ptr;
\r
727 if (Eval (Mode.Match, ref ptr, pc)) {
\r
728 marks [groups [0]].End = ptr;
\r
736 private bool IsPosition (Position pos, int ptr) {
\r
738 case Position.Start: case Position.StartOfString:
\r
741 case Position.StartOfLine:
\r
742 return ptr == 0 || text[ptr - 1] == '\n';
\r
744 case Position.StartOfScan:
\r
745 return ptr == scan_ptr;
\r
748 return ptr == text_end ||
\r
749 (ptr == text_end - 1 && text[ptr] == '\n');
\r
751 case Position.EndOfLine:
\r
752 return ptr == text_end || text[ptr] == '\n';
\r
754 case Position.EndOfString:
\r
755 return ptr == text_end;
\r
757 case Position.Boundary:
\r
762 return IsWordChar (text[ptr]);
\r
763 else if (ptr == text_end)
\r
764 return IsWordChar (text[ptr - 1]);
\r
766 return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]);
\r
768 case Position.NonBoundary:
\r
773 return !IsWordChar (text[ptr]);
\r
774 else if (ptr == text_end)
\r
775 return !IsWordChar (text[ptr - 1]);
\r
777 return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]);
\r
784 private bool IsWordChar (char c) {
\r
785 return CategoryUtils.IsCategory (Category.Word, c);
\r
788 private string GetString (int pc) {
\r
789 int len = program[pc + 1];
\r
792 char[] cs = new char[len];
\r
793 for (int i = 0; i < len; ++ i)
\r
794 cs[i] = (char)program[str ++];
\r
796 return new string (cs);
\r
799 // capture management
\r
801 private void Open (int gid, int ptr) {
\r
802 int m = groups [gid];
\r
803 if (m < mark_start || marks [m].IsDefined) {
\r
804 m = CreateMark (m);
\r
808 marks [m].Start = ptr;
\r
811 private void Close (int gid, int ptr) {
\r
812 marks [groups [gid]].End = ptr;
\r
815 private void Balance (int gid, int balance_gid, int ptr) {
\r
816 int b = groups [balance_gid];
\r
817 Debug.Assert (marks [b].IsDefined, "Regex", "Balancing group not closed");
\r
820 Open (gid, marks [b].Index + marks [b].Length);
\r
824 groups [balance_gid] = marks [b].Previous;
\r
827 private int Checkpoint () {
\r
828 mark_start = mark_end;
\r
832 private void Backtrack (int cp) {
\r
833 Debug.Assert (cp > mark_start, "Regex", "Attempt to backtrack forwards");
\r
835 for (int i = 0; i < groups.Length; ++ i) {
\r
836 int m = groups [i];
\r
838 m = marks [m].Previous;
\r
844 private void ResetGroups () {
\r
845 int n = groups.Length;
\r
847 marks = new Mark [n * 10];
\r
849 for (int i = 0; i < n; ++ i) {
\r
852 marks [i].Start = -1;
\r
853 marks [i].End = -1;
\r
854 marks [i].Previous = -1;
\r
861 private int GetLastDefined (int gid) {
\r
862 int m = groups [gid];
\r
863 while (m >= 0 && !marks [m].IsDefined)
\r
864 m = marks [m].Previous;
\r
869 private int CreateMark (int previous) {
\r
870 if (mark_end == marks.Length) {
\r
871 Mark [] dest = new Mark [marks.Length * 2];
\r
872 marks.CopyTo (dest, 0);
\r
876 int m = mark_end ++;
\r
877 marks [m].Start = marks [m].End = -1;
\r
878 marks [m].Previous = previous;
\r
883 private Match GenerateMatch (Regex regex) {
\r
884 int[][] grps = new int[groups.Length][];
\r
885 ArrayList caps = new ArrayList ();
\r
887 for (int gid = 0; gid < groups.Length; ++ gid) {
\r
889 for (int m = groups[gid]; m >= 0; m = marks[m].Previous) {
\r
890 if (!marks[m].IsDefined)
\r
893 caps.Add (marks[m].Index);
\r
894 caps.Add (marks[m].Length);
\r
897 grps[gid] = (int[])caps.ToArray (typeof (int));
\r
900 return new Match (regex, this, text, text_end, grps);
\r
903 // interpreter attributes
\r
905 private ushort[] program; // regex program
\r
906 private int program_start; // first instruction after info block
\r
907 private string text; // input text
\r
908 private int text_end; // end of input text (last character + 1)
\r
909 private int group_count; // number of capturing groups
\r
910 private int match_min, match_max; // match width information
\r
911 private QuickSearch qs; // fast substring matcher
\r
915 private int scan_ptr; // start of scan
\r
917 private RepeatContext repeat; // current repeat context
\r
918 private RepeatContext fast; // fast repeat context
\r
920 private Mark[] marks = null; // mark stack
\r
921 private int mark_start; // start of current checkpoint
\r
922 private int mark_end; // end of checkpoint/next free mark
\r
924 private int[] groups; // current group definitions
\r
928 private struct Mark {
\r
929 public int Start, End;
\r
930 public int Previous;
\r
932 public bool IsDefined {
\r
933 get { return Start >= 0 && End >= 0; }
\r
937 get { return Start < End ? Start : End; }
\r
940 public int Length {
\r
941 get { return Start < End ? End - Start : Start - End; }
\r
945 private class RepeatContext {
\r
946 public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) {
\r
947 this.previous = previous;
\r
951 this.expr_pc = expr_pc;
\r
958 get { return count; }
\r
959 set { count = value; }
\r
963 get { return start; }
\r
964 set { start = value; }
\r
967 public bool IsMinimum {
\r
968 get { return min <= count; }
\r
971 public bool IsMaximum {
\r
972 get { return max <= count; }
\r
975 public bool IsLazy {
\r
976 get { return lazy; }
\r
979 public int Expression {
\r
980 get { return expr_pc; }
\r
983 public RepeatContext Previous {
\r
984 get { return previous; }
\r
988 private int min, max;
\r
990 private int expr_pc;
\r
991 private RepeatContext previous;
\r
996 private enum Mode {
\r