3 // namespace: System.Text.RegularExpressions
4 // file: interpreter.cs
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
32 using System.Diagnostics;
33 using System.Globalization;
35 namespace System.Text.RegularExpressions {
37 class Interpreter : IMachine {
38 private int ReadProgramCount (int ptr)
40 int ret = program [ptr + 1];
46 public Interpreter (ushort[] program) {
47 this.program = program;
52 Debug.Assert ((OpCode)program[0] == OpCode.Info, "Regex", "Cant' find info block");
54 this.group_count = ReadProgramCount (1) + 1;
55 this.match_min = ReadProgramCount (3);
56 //this.match_max = ReadProgramCount (5);
60 this.program_start = 7;
61 this.groups = new int [group_count];
64 // IMachine implementation
66 public Match Scan (Regex regex, string text, int start, int end) {
69 this.scan_ptr = start;
71 if (Eval (Mode.Match, ref scan_ptr, program_start))
72 return GenerateMatch (regex);
79 private void Reset () {
84 private bool Eval (Mode mode, ref int ref_ptr, int pc) {
88 ushort word = program[pc];
89 OpCode op = (OpCode)(word & 0x00ff);
90 OpFlags flags = (OpFlags)(word & 0xff00);
94 int skip = program[pc + 1];
96 int anch_offset = program[pc + 2];
97 bool anch_reverse = (flags & OpFlags.RightToLeft) != 0;
98 int anch_ptr = anch_reverse ? ptr - anch_offset : ptr + anch_offset;
99 int anch_end = text_end - match_min + anch_offset; // maximum anchor position
105 // the general case for an anchoring expression is at the bottom, however we
106 // do some checks for the common cases before to save processing time. the current
107 // optimizer only outputs three types of anchoring expressions: fixed position,
108 // fixed substring, and no anchor.
110 OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff);
111 if (anch_op == OpCode.Position && skip == 6) { // position anchor
116 switch ((Position)program[pc + 4]) {
117 case Position.StartOfString:
118 if (anch_reverse || anch_offset == 0) {
121 if (TryMatch (ref ptr, pc + skip))
126 case Position.StartOfLine:
129 if (TryMatch (ref ptr, pc + skip))
135 while ((anch_reverse && anch_ptr >= 0) || (!anch_reverse && anch_ptr <= anch_end)) {
136 if (anch_ptr == 0 || text[anch_ptr - 1] == '\n') {
138 ptr = anch_ptr == anch_end ? anch_ptr : anch_ptr + anch_offset;
140 ptr = anch_ptr == 0 ? anch_ptr : anch_ptr - anch_offset;
141 if (TryMatch (ref ptr, pc + skip))
152 case Position.StartOfScan:
153 if (anch_ptr == scan_ptr) {
154 ptr = anch_reverse ? scan_ptr + anch_offset : scan_ptr - anch_offset;
155 if (TryMatch (ref ptr, pc + skip))
165 else if (qs != null ||
166 (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor
171 bool reverse = ((OpFlags)program[pc + 3] & OpFlags.RightToLeft) != 0;
172 string substring = GetString (pc + 3);
175 bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0;
177 qs = new QuickSearch (substring, ignore, reverse);
179 while ((anch_reverse && anch_ptr >= anch_begin)
180 || (!anch_reverse && anch_ptr <= anch_end)) {
184 anch_ptr = qs.Search (text, anch_ptr, anch_begin);
186 anch_ptr += substring.Length ;
190 anch_ptr = qs.Search (text, anch_ptr, anch_end);
194 ptr = reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
195 if (TryMatch (ref ptr, pc + skip))
204 else if (anch_op == OpCode.True) { // no anchor
209 while ((anch_reverse && anch_ptr >= anch_begin)
210 || (!anch_reverse && anch_ptr <= anch_end)) {
213 if (TryMatch (ref ptr, pc + skip))
221 else { // general case
226 while ((anch_reverse && anch_ptr >= anch_begin)
227 || (!anch_reverse && anch_ptr <= anch_end)) {
230 if (Eval (Mode.Match, ref ptr, pc + 3)) {
231 // anchor expression passed: try real expression at the correct offset
233 ptr = anch_reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
234 if (TryMatch (ref ptr, pc + skip))
256 case OpCode.Position: {
257 if (!IsPosition ((Position)program[pc + 1], ptr))
263 case OpCode.String: {
264 bool reverse = (flags & OpFlags.RightToLeft) != 0;
265 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
266 int len = program[pc + 1];
274 if (ptr + len > text_end)
278 for (int i = 0; i < len; ++ i) {
279 char c = text[ptr + i];
281 c = Char.ToLower (c);
283 if (c != (char)program[pc ++])
292 case OpCode.Reference: {
293 bool reverse = (flags & OpFlags.RightToLeft) != 0;
294 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
295 int m = GetLastDefined (program [pc + 1]);
299 int str = marks [m].Index;
300 int len = marks [m].Length;
307 else if (ptr + len > text_end)
311 for (int i = 0; i < len; ++ i) {
313 if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i]))
317 if (text[ptr + i] != text[str + i])
327 case OpCode.Character: case OpCode.Category: case OpCode.NotCategory:
328 case OpCode.Range: case OpCode.Set: {
329 if (!EvalChar (mode, ref ptr, ref pc, false))
335 int target = pc + program[pc + 1];
337 if (!EvalChar (mode, ref ptr, ref pc, true))
345 Open (program[pc + 1], ptr);
351 Close (program[pc + 1], ptr);
356 case OpCode.BalanceStart: {
358 int start = ptr; //point before the balancing group
360 if (!Eval (Mode.Match, ref ptr, pc + 5))
365 if(!Balance (program[pc + 1], program[pc + 2], (program[pc + 3] == 1 ? true : false) , start)) {
370 pc += program[pc + 4];
374 case OpCode.Balance: {
378 case OpCode.IfDefined: {
379 int m = GetLastDefined (program [pc + 2]);
381 pc += program[pc + 1];
388 if (!Eval (Mode.Match, ref ptr, pc + 2))
391 pc += program[pc + 1];
396 int cp = Checkpoint ();
398 if (Eval (Mode.Match, ref test_ptr, pc + 3))
399 pc += program[pc + 1];
402 pc += program[pc + 2];
407 case OpCode.Branch: {
410 int cp = Checkpoint ();
411 if (Eval (Mode.Match, ref ptr, pc + 2))
416 pc += program[pc + 1];
417 branch_op = (OpCode)(program[pc] & 0xff);
418 } while (branch_op != OpCode.False);
424 pc += program[pc + 1];
428 case OpCode.Repeat: {
429 this.repeat = new RepeatContext (
430 this.repeat, // previous context
431 ReadProgramCount (pc + 2), // minimum
432 ReadProgramCount (pc + 4), // maximum
433 (flags & OpFlags.Lazy) != 0, // lazy
434 pc + 6 // subexpression
437 if (Eval (Mode.Match, ref ptr, pc + program[pc + 1]))
440 this.repeat = this.repeat.Previous;
446 RepeatContext current = this.repeat;
449 // Can we avoid recursion?
451 // Backtracking can be forced in nested quantifiers from the tail of this quantifier.
452 // Thus, we cannot, in general, use a simple loop on repeat.Expression to handle
455 // If 'deep' was unmolested, that implies that there was no nested quantifiers.
456 // Thus, we can safely avoid recursion.
461 int start = current.Start;
462 int start_count = current.Count;
464 while (!current.IsMinimum) {
468 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
469 current.Start = start;
470 current.Count = start_count;
473 if (deep != current) // recursive mode
477 if (ptr == current.Start) {
478 // degenerate match ... match tail or fail
479 this.repeat = current.Previous;
481 if (Eval (Mode.Match, ref ptr, pc + 1))
484 this.repeat = current;
488 if (current.IsLazy) {
490 // match tail first ...
491 this.repeat = current.Previous;
493 int cp = Checkpoint ();
494 if (Eval (Mode.Match, ref ptr, pc + 1))
499 // ... then match more
500 this.repeat = current;
501 if (current.IsMaximum)
506 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
507 current.Start = start;
508 current.Count = start_count;
511 if (deep != current) // recursive mode
513 // Degenerate match: ptr has not moved since the last (failed) tail match.
514 // So, next and subsequent tail matches will fail.
515 if (ptr == current.Start)
519 int stack_size = stack.Count;
521 // match greedily as much as possible
522 while (!current.IsMaximum) {
523 int cp = Checkpoint ();
525 int old_start = current.Start;
530 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
532 current.Start = old_start;
536 if (deep != current) {
537 // recursive mode: no more backtracking, truncate the stack
538 stack.Count = stack_size;
542 stack.Push (old_ptr);
544 // Degenerate match: no point going on
545 if (ptr == current.Start)
549 // then, match the tail, backtracking as necessary.
550 this.repeat = current.Previous;
553 if (Eval (Mode.Match, ref ptr, pc + 1)) {
554 stack.Count = stack_size;
557 if (stack.Count == stack_size) {
558 this.repeat = current;
564 Backtrack (stack.Pop ());
569 case OpCode.FastRepeat: {
570 this.fast = new RepeatContext (
572 ReadProgramCount (pc + 2), // minimum
573 ReadProgramCount (pc + 4), // maximum
574 (flags & OpFlags.Lazy) != 0, // lazy
575 pc + 6 // subexpression
580 int cp = Checkpoint ();
582 pc += program[pc + 1]; // tail expression
583 ushort tail_word = program[pc];
585 int c1 = -1; // first character of tail operator
586 int c2 = -1; // ... and the same character, in upper case if ignoring case
587 int coff = 0; // 0 or -1 depending on direction
589 OpCode tail_op = (OpCode)(tail_word & 0xff);
590 if (tail_op == OpCode.Character || tail_op == OpCode.String) {
591 OpFlags tail_flags = (OpFlags)(tail_word & 0xff00);
593 if ((tail_flags & OpFlags.Negate) != 0)
596 if (tail_op == OpCode.String)
600 if ((tail_flags & OpFlags.RightToLeft) != 0)
602 offset = program[pc + 1] - 1 ;
605 c1 = program[pc + 2 + offset]; // first char of string
608 c1 = program[pc + 1]; // character
610 if ((tail_flags & OpFlags.IgnoreCase) != 0)
611 c2 = Char.ToUpper ((char)c1); // ignore case
615 if ((tail_flags & OpFlags.RightToLeft) != 0)
616 coff = -1; // reverse
623 if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) {
624 //Console.WriteLine ("lazy fast: failed mininum.");
625 fast = fast.Previous;
631 if (c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) {
633 if (Eval (Mode.Match, ref ptr, pc))
637 if (fast.IsMaximum) {
638 //Console.WriteLine ("lazy fast: failed with maximum.");
639 fast = fast.Previous;
644 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
645 //Console.WriteLine ("lazy fast: no more.");
646 fast = fast.Previous;
650 fast = fast.Previous;
654 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
655 fast = fast.Previous;
661 width = (ptr - fast.Start) / fast.Count;
667 if (c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) {
669 if (Eval (Mode.Match, ref ptr, pc))
674 if (!fast.IsMinimum) {
675 fast = fast.Previous;
682 fast = fast.Previous;
688 Debug.Assert (false, "Regex", "Info block found in pattern");
702 if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum))
705 pc = fast.Expression;
716 if (!fast.IsLazy && fast.IsMinimum)
719 ref_ptr = fast.Start;
727 private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) {
728 bool consumed = false;
734 ushort word = program[pc];
735 OpCode op = (OpCode)(word & 0x00ff);
736 OpFlags flags = (OpFlags)(word & 0xff00);
740 ignore = (flags & OpFlags.IgnoreCase) != 0;
742 // consume character: the direction of an In construct is
743 // determined by the direction of its first op
746 if ((flags & OpFlags.RightToLeft) != 0) {
760 c = Char.ToLower (c);
767 negate = (flags & OpFlags.Negate) != 0;
778 case OpCode.Character: {
779 if (c == (char)program[pc ++])
784 case OpCode.Category: {
785 if (CategoryUtils.IsCategory ((Category)program[pc ++], c))
790 case OpCode.NotCategory: {
791 if (!CategoryUtils.IsCategory ((Category)program[pc ++], c))
797 int lo = (char)program[pc ++];
798 int hi = (char)program[pc ++];
799 if (lo <= c && c <= hi)
805 int lo = (char)program[pc ++];
806 int len = (char)program[pc ++];
811 if (i < 0 || i >= len << 4)
815 if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0)
825 private bool TryMatch (ref int ref_ptr, int pc) {
829 marks [groups [0]].Start = ptr;
830 if (Eval (Mode.Match, ref ptr, pc)) {
831 marks [groups [0]].End = ptr;
839 private bool IsPosition (Position pos, int ptr) {
841 case Position.Start: case Position.StartOfString:
844 case Position.StartOfLine:
845 return ptr == 0 || text[ptr - 1] == '\n';
847 case Position.StartOfScan:
848 return ptr == scan_ptr;
851 return ptr == text_end ||
852 (ptr == text_end - 1 && text[ptr] == '\n');
854 case Position.EndOfLine:
855 return ptr == text_end || text[ptr] == '\n';
857 case Position.EndOfString:
858 return ptr == text_end;
860 case Position.Boundary:
865 return IsWordChar (text[ptr]);
866 else if (ptr == text_end)
867 return IsWordChar (text[ptr - 1]);
869 return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]);
871 case Position.NonBoundary:
876 return !IsWordChar (text[ptr]);
877 else if (ptr == text_end)
878 return !IsWordChar (text[ptr - 1]);
880 return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]);
887 private bool IsWordChar (char c) {
888 return CategoryUtils.IsCategory (Category.Word, c);
891 private string GetString (int pc) {
892 int len = program[pc + 1];
895 char[] cs = new char[len];
896 for (int i = 0; i < len; ++ i)
897 cs[i] = (char)program[str ++];
899 return new string (cs);
902 // capture management
904 private void Open (int gid, int ptr) {
905 int m = groups [gid];
906 if (m < mark_start || marks [m].IsDefined) {
911 marks [m].Start = ptr;
914 private void Close (int gid, int ptr) {
915 marks [groups [gid]].End = ptr;
918 private bool Balance (int gid, int balance_gid, bool capture, int ptr) {
919 int b = groups [balance_gid];
921 if(b == -1 || marks[b].Index < 0) {
922 //Group not previously matched
926 Debug.Assert (marks [b].IsDefined, "Regex", "Balancng group not closed");
928 if (gid > 0 && capture){
929 Open (gid, marks [b].Index + marks [b].Length);
933 groups [balance_gid] = marks[b].Previous;
938 private int Checkpoint () {
939 mark_start = mark_end;
943 private void Backtrack (int cp) {
944 Debug.Assert (cp > mark_start, "Regex", "Attempt to backtrack forwards");
946 for (int i = 0; i < groups.Length; ++ i) {
949 m = marks [m].Previous;
955 private void ResetGroups () {
956 int n = groups.Length;
958 marks = new Mark [n * 10];
960 for (int i = 0; i < n; ++ i) {
963 marks [i].Start = -1;
965 marks [i].Previous = -1;
972 private int GetLastDefined (int gid) {
973 int m = groups [gid];
974 while (m >= 0 && !marks [m].IsDefined)
975 m = marks [m].Previous;
980 private int CreateMark (int previous) {
981 if (mark_end == marks.Length) {
982 Mark [] dest = new Mark [marks.Length * 2];
983 marks.CopyTo (dest, 0);
988 marks [m].Start = marks [m].End = -1;
989 marks [m].Previous = previous;
994 private void GetGroupInfo (int gid, out int first_mark_index, out int n_caps)
996 first_mark_index = -1;
998 for (int m = groups [gid]; m >= 0; m = marks [m].Previous) {
999 if (!marks [m].IsDefined)
1001 if (first_mark_index < 0)
1002 first_mark_index = m;
1007 private void PopulateGroup (Group g, int first_mark_index, int n_caps)
1010 for (int m = marks [first_mark_index].Previous; m >= 0; m = marks [m].Previous) {
1011 if (!marks [m].IsDefined)
1013 Capture cap = new Capture (text, marks [m].Index, marks [m].Length);
1014 g.Captures.SetValue (cap, n_caps - 1 - i);
1019 private Match GenerateMatch (Regex regex)
1021 int n_caps, first_mark_index;
1023 GetGroupInfo (0, out first_mark_index, out n_caps);
1024 Match retval = new Match (regex, this, text, text_end, groups.Length,
1025 marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps);
1026 PopulateGroup (retval, first_mark_index, n_caps);
1028 for (int gid = 1; gid < groups.Length; ++ gid) {
1029 GetGroupInfo (gid, out first_mark_index, out n_caps);
1030 if (first_mark_index < 0) {
1033 g = new Group (text, marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps);
1034 PopulateGroup (g, first_mark_index, n_caps);
1036 retval.Groups.SetValue (g, gid);
1041 // interpreter attributes
1043 private ushort[] program; // regex program
1044 private int program_start; // first instruction after info block
1045 private string text; // input text
1046 private int text_end; // end of input text (last character + 1)
1047 private int group_count; // number of capturing groups
1048 private int match_min;//, match_max; // match width information
1049 private QuickSearch qs; // fast substring matcher
1053 private int scan_ptr; // start of scan
1055 private RepeatContext repeat; // current repeat context
1056 private RepeatContext fast; // fast repeat context
1058 // Repeat/Until handling
1059 private IntStack stack = new IntStack (); // utility stack
1060 private RepeatContext deep; // points to the most-nested repeat context
1062 private Mark[] marks = null; // mark stack
1063 private int mark_start; // start of current checkpoint
1064 private int mark_end; // end of checkpoint/next free mark
1066 private int[] groups; // current group definitions
1070 private struct IntStack {
1075 return values [--count];
1077 public void Push (int value)
1079 if (values == null) {
1080 values = new int [8];
1081 } else if (count == values.Length) {
1082 int new_size = values.Length;
1083 new_size += new_size >> 1;
1084 int [] new_values = new int [new_size];
1085 for (int i = 0; i < count; ++i)
1086 new_values [i] = values [i];
1087 values = new_values;
1089 values [count++] = value;
1092 get { return values [count - 1]; }
1095 get { return count; }
1098 throw new SystemException ("can only truncate the stack");
1104 private class RepeatContext {
1105 public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) {
1106 this.previous = previous;
1110 this.expr_pc = expr_pc;
1117 get { return count; }
1118 set { count = value; }
1122 get { return start; }
1123 set { start = value; }
1126 public bool IsMinimum {
1127 get { return min <= count; }
1130 public bool IsMaximum {
1131 get { return max <= count; }
1134 public bool IsLazy {
1135 get { return lazy; }
1138 public int Expression {
1139 get { return expr_pc; }
1142 public RepeatContext Previous {
1143 get { return previous; }
1147 private int min, max;
1149 private int expr_pc;
1150 private RepeatContext previous;