3 // namespace: System.Text.RegularExpressions
4 // file: interpreter.cs
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
32 using System.Diagnostics;
33 using System.Globalization;
35 namespace System.Text.RegularExpressions {
37 partial class Interpreter : BaseMachine {
38 private int ReadProgramCount (int ptr)
40 int ret = program [ptr + 1];
46 public Interpreter (ushort[] program) {
47 this.program = program;
51 Debug.Assert ((OpCode)program[0] == OpCode.Info, "Regex", "Cant' find info block");
52 this.group_count = ReadProgramCount (1) + 1;
53 this.match_min = ReadProgramCount (3);
54 //this.match_max = ReadProgramCount (5);
58 this.program_start = 7;
59 this.groups = new int [group_count];
62 // IMachine implementation
64 public override Match Scan (Regex regex, string text, int start, int end, bool substring_mode) {
65 this.regex_rtl = (regex.Options & RegexOptions.RightToLeft) != 0;
69 this.text_start = regex_rtl && substring_mode ? end : start;
70 this.text_end = regex_rtl ? substring_mode ? start : 0 : end;
71 this.initialized = true;
75 this.text_start = start;
79 this.scan_ptr = text_start;
80 this.substring_mode = substring_mode;
82 if (Eval (Mode.Match, ref scan_ptr, program_start))
83 return GenerateMatch (regex);
90 private void Reset () {
95 private bool Eval (Mode mode, ref int ref_ptr, int pc) {
99 ushort word = program[pc];
100 OpCode op = (OpCode)(word & 0x00ff);
101 OpFlags flags = (OpFlags)(word & 0xff00);
104 case OpCode.Anchor: {
105 int skip = program[pc + 1];
107 int anch_offset = program[pc + 2];
108 bool anch_reverse = (flags & OpFlags.RightToLeft) != 0;
109 int anch_ptr = anch_reverse ? ptr - anch_offset : ptr + anch_offset;
110 int anch_end = (regex_rtl ? text_start : text_end) - match_min + anch_offset; // maximum anchor position
116 // the general case for an anchoring expression is at the bottom, however we
117 // do some checks for the common cases before to save processing time. the current
118 // optimizer only outputs three types of anchoring expressions: fixed position,
119 // fixed substring, and no anchor.
121 OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff);
122 if (anch_op == OpCode.Position && skip == 6) { // position anchor
127 switch ((Position)program[pc + 4]) {
128 case Position.StartOfString:
129 if (anch_reverse || anch_offset == 0) {
132 if (TryMatch (ref ptr, pc + skip))
137 case Position.StartOfLine:
140 if (TryMatch (ref ptr, pc + skip))
146 while ((anch_reverse && anch_ptr >= 0) || (!anch_reverse && anch_ptr <= anch_end)) {
147 if (anch_ptr == 0 || text[anch_ptr - 1] == '\n') {
149 ptr = anch_ptr == anch_end ? anch_ptr : anch_ptr + anch_offset;
151 ptr = anch_ptr == 0 ? anch_ptr : anch_ptr - anch_offset;
152 if (TryMatch (ref ptr, pc + skip))
163 case Position.StartOfScan:
164 if (anch_ptr == scan_ptr) {
165 ptr = anch_reverse ? scan_ptr + anch_offset : scan_ptr - anch_offset;
166 if (TryMatch (ref ptr, pc + skip))
176 else if (qs != null ||
177 (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor
182 bool reverse = ((OpFlags)program[pc + 3] & OpFlags.RightToLeft) != 0;
185 bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0;
186 string substring = GetString (pc + 3);
187 qs = new QuickSearch (substring, ignore, reverse);
189 while ((anch_reverse && anch_ptr >= anch_begin)
190 || (!anch_reverse && anch_ptr <= anch_end)) {
194 anch_ptr = qs.Search (text, anch_ptr, anch_begin);
196 anch_ptr += qs.Length ;
200 anch_ptr = qs.Search (text, anch_ptr, anch_end);
204 ptr = reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
205 if (TryMatch (ref ptr, pc + skip))
214 else if (anch_op == OpCode.True) { // no anchor
219 while ((anch_reverse && anch_ptr >= anch_begin)
220 || (!anch_reverse && anch_ptr <= anch_end)) {
223 if (TryMatch (ref ptr, pc + skip))
231 else { // general case
236 while ((anch_reverse && anch_ptr >= anch_begin)
237 || (!anch_reverse && anch_ptr <= anch_end)) {
240 if (Eval (Mode.Match, ref ptr, pc + 3)) {
241 // anchor expression passed: try real expression at the correct offset
243 ptr = anch_reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
244 if (TryMatch (ref ptr, pc + skip))
266 case OpCode.Position: {
267 if (!IsPosition ((Position)program[pc + 1], ptr))
273 case OpCode.String: {
274 bool reverse = (flags & OpFlags.RightToLeft) != 0;
275 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
276 int len = program[pc + 1];
280 if ((!regex_rtl && ptr < 0) || (regex_rtl && ptr < text_end))
284 if (ptr + len > text_end)
288 for (int i = 0; i < len; ++ i) {
289 char c = text[ptr + i];
291 c = Char.ToLower (c);
293 if (c != (char)program[pc ++])
302 case OpCode.Reference: {
303 bool reverse = (flags & OpFlags.RightToLeft) != 0;
304 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
305 int m = GetLastDefined (program [pc + 1]);
309 int str = marks [m].Index;
310 int len = marks [m].Length;
314 if ((!regex_rtl && ptr < 0) || (regex_rtl && ptr < text_end))
317 else if (ptr + len > text_end)
322 for (int i = 0; i < len; ++ i) {
323 if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i]))
327 for (int i = 0; i < len; ++ i) {
328 if (text[ptr + i] != text[str + i])
338 case OpCode.Character: case OpCode.Category: case OpCode.NotCategory:
339 case OpCode.Range: case OpCode.Set: {
340 if (!EvalChar (mode, ref ptr, ref pc, false))
346 int target = pc + program[pc + 1];
348 if (!EvalChar (mode, ref ptr, ref pc, true))
356 Open (program[pc + 1], ptr);
362 Close (program[pc + 1], ptr);
367 case OpCode.BalanceStart: {
369 int start = ptr; //point before the balancing group
371 if (!Eval (Mode.Match, ref ptr, pc + 5))
376 if(!Balance (program[pc + 1], program[pc + 2], (program[pc + 3] == 1 ? true : false) , start)) {
381 pc += program[pc + 4];
385 case OpCode.Balance: {
389 case OpCode.IfDefined: {
390 int m = GetLastDefined (program [pc + 2]);
392 pc += program[pc + 1];
399 if (!Eval (Mode.Match, ref ptr, pc + 2))
402 pc += program[pc + 1];
407 int cp = Checkpoint ();
409 if (Eval (Mode.Match, ref test_ptr, pc + 3))
410 pc += program[pc + 1];
413 pc += program[pc + 2];
418 case OpCode.Branch: {
421 int cp = Checkpoint ();
422 if (Eval (Mode.Match, ref ptr, pc + 2))
427 pc += program[pc + 1];
428 branch_op = (OpCode)(program[pc] & 0xff);
429 } while (branch_op != OpCode.False);
435 pc += program[pc + 1];
439 case OpCode.Repeat: {
440 this.repeat = new RepeatContext (
441 this.repeat, // previous context
442 ReadProgramCount (pc + 2), // minimum
443 ReadProgramCount (pc + 4), // maximum
444 (flags & OpFlags.Lazy) != 0, // lazy
445 pc + 6 // subexpression
448 if (Eval (Mode.Match, ref ptr, pc + program[pc + 1]))
451 this.repeat = this.repeat.Previous;
457 RepeatContext current = this.repeat;
460 // Can we avoid recursion?
462 // Backtracking can be forced in nested quantifiers from the tail of this quantifier.
463 // Thus, we cannot, in general, use a simple loop on repeat.Expression to handle
466 // If 'deep' was unmolested, that implies that there was no nested quantifiers.
467 // Thus, we can safely avoid recursion.
472 int start = current.Start;
473 int start_count = current.Count;
475 while (!current.IsMinimum) {
479 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
480 current.Start = start;
481 current.Count = start_count;
484 if (deep != current) // recursive mode
488 if (ptr == current.Start) {
489 // degenerate match ... match tail or fail
490 this.repeat = current.Previous;
492 if (Eval (Mode.Match, ref ptr, pc + 1))
495 this.repeat = current;
499 if (current.IsLazy) {
501 // match tail first ...
502 this.repeat = current.Previous;
504 int cp = Checkpoint ();
505 if (Eval (Mode.Match, ref ptr, pc + 1))
510 // ... then match more
511 this.repeat = current;
512 if (current.IsMaximum)
517 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
518 current.Start = start;
519 current.Count = start_count;
522 if (deep != current) // recursive mode
524 // Degenerate match: ptr has not moved since the last (failed) tail match.
525 // So, next and subsequent tail matches will fail.
526 if (ptr == current.Start)
530 int stack_size = stack.Count;
532 // match greedily as much as possible
533 while (!current.IsMaximum) {
534 int cp = Checkpoint ();
536 int old_start = current.Start;
541 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
543 current.Start = old_start;
547 if (deep != current) {
548 // recursive mode: no more backtracking, truncate the stack
549 stack.Count = stack_size;
553 stack.Push (old_ptr);
555 // Degenerate match: no point going on
556 if (ptr == current.Start)
560 // then, match the tail, backtracking as necessary.
561 this.repeat = current.Previous;
564 if (Eval (Mode.Match, ref ptr, pc + 1)) {
565 stack.Count = stack_size;
568 if (stack.Count == stack_size) {
569 this.repeat = current;
575 Backtrack (stack.Pop ());
580 case OpCode.FastRepeat: {
581 this.fast = new RepeatContext (
583 ReadProgramCount (pc + 2), // minimum
584 ReadProgramCount (pc + 4), // maximum
585 (flags & OpFlags.Lazy) != 0, // lazy
586 pc + 6 // subexpression
591 int cp = Checkpoint ();
593 pc += program[pc + 1]; // tail expression
594 ushort tail_word = program[pc];
596 int c1 = -1; // first character of tail operator
597 int c2 = -1; // ... and the same character, in upper case if ignoring case
598 int coff = 0; // 0 or -1 depending on direction
600 OpCode tail_op = (OpCode)(tail_word & 0xff);
601 if (tail_op == OpCode.Character || tail_op == OpCode.String) {
602 OpFlags tail_flags = (OpFlags)(tail_word & 0xff00);
604 if ((tail_flags & OpFlags.Negate) != 0)
607 if (tail_op == OpCode.String)
611 if ((tail_flags & OpFlags.RightToLeft) != 0)
613 offset = program[pc + 1] - 1 ;
616 c1 = program[pc + 2 + offset]; // first char of string
619 c1 = program[pc + 1]; // character
621 if ((tail_flags & OpFlags.IgnoreCase) != 0)
622 c2 = Char.ToUpper ((char)c1); // ignore case
626 if ((tail_flags & OpFlags.RightToLeft) != 0)
627 coff = -1; // reverse
634 if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) {
635 //Console.WriteLine ("lazy fast: failed mininum.");
636 fast = fast.Previous;
642 if (c1 < 0 || (p >= 0 && ((regex_rtl && p >= text_end) || (!regex_rtl && p < text_end)) && (c1 == text[p] || c2 == text[p]))) {
644 if (Eval (Mode.Match, ref ptr, pc))
648 if (fast.IsMaximum) {
649 //Console.WriteLine ("lazy fast: failed with maximum.");
650 fast = fast.Previous;
655 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
656 //Console.WriteLine ("lazy fast: no more.");
657 fast = fast.Previous;
661 fast = fast.Previous;
665 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
666 fast = fast.Previous;
672 width = (ptr - fast.Start) / fast.Count;
678 if (c1 < 0 || (p >= 0 && ((regex_rtl && p >= text_end) || (!regex_rtl && p < text_end)) && (c1 == text[p] || c2 == text[p]))) {
680 if (Eval (Mode.Match, ref ptr, pc))
685 if (!fast.IsMinimum) {
686 fast = fast.Previous;
693 fast = fast.Previous;
699 Debug.Assert (false, "Regex", "Info block found in pattern");
713 if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum))
716 pc = fast.Expression;
727 if (!fast.IsLazy && fast.IsMinimum)
730 ref_ptr = fast.Start;
738 private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) {
739 bool consumed = false;
745 ushort word = program[pc];
746 OpCode op = (OpCode)(word & 0x00ff);
747 OpFlags flags = (OpFlags)(word & 0xff00);
751 ignore = (flags & OpFlags.IgnoreCase) != 0;
753 // consume character: the direction of an In construct is
754 // determined by the direction of its first op
757 if ((flags & OpFlags.RightToLeft) != 0) {
758 if ((substring_mode && ptr <= (regex_rtl ? text_end : text_start)) || (!substring_mode && ptr <= 0))
764 if ((!regex_rtl && ptr >= text_end) || (regex_rtl && ptr >= text_start))
771 c = Char.ToLower (c);
778 negate = (flags & OpFlags.Negate) != 0;
789 case OpCode.Character: {
790 if (c == (char)program[pc ++])
795 case OpCode.Category: {
796 if (CategoryUtils.IsCategory ((Category)program[pc ++], c))
801 case OpCode.NotCategory: {
802 if (!CategoryUtils.IsCategory ((Category)program[pc ++], c))
808 int lo = (char)program[pc ++];
809 int hi = (char)program[pc ++];
810 if (lo <= c && c <= hi)
816 int lo = (char)program[pc ++];
817 int len = (char)program[pc ++];
822 if (i < 0 || i >= len << 4)
826 if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0)
836 private bool TryMatch (ref int ref_ptr, int pc) {
840 marks [groups [0]].Start = ptr;
841 if (Eval (Mode.Match, ref ptr, pc)) {
842 marks [groups [0]].End = ptr;
850 private bool IsPosition (Position pos, int ptr) {
852 case Position.Start: case Position.StartOfString:
853 return ptr == 0 || (substring_mode && ((!regex_rtl && ptr == text_start) || (regex_rtl && ptr == text_end)));
855 case Position.StartOfLine:
856 return ptr == 0 || text[ptr - 1] == '\n' || (substring_mode && ((!regex_rtl && ptr == text_start) || (regex_rtl && ptr == text_end)));
858 case Position.StartOfScan:
859 return ptr == scan_ptr;
862 return (!regex_rtl && ptr == text_end) || (regex_rtl && ptr == text_start) ||
863 (((!regex_rtl && ptr == text_end - 1) || (regex_rtl && ptr == text_start - 1)) && text[ptr] == '\n');
865 case Position.EndOfLine:
866 return (!regex_rtl && ptr == text_end) || (regex_rtl && ptr == text_start) || text[ptr] == '\n';
868 case Position.EndOfString:
869 return (!regex_rtl && ptr == text_end) || (regex_rtl && ptr == text_start);
871 case Position.Boundary:
872 if ((!regex_rtl && text_end == 0) || (regex_rtl && text_start == 0))
876 return IsWordChar (text[ptr]);
877 else if ((!regex_rtl && ptr == text_end) || (regex_rtl && ptr == text_start))
878 return IsWordChar (text[ptr - 1]);
880 return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]);
882 case Position.NonBoundary:
883 if ((!regex_rtl && text_end == 0) || (regex_rtl && text_start == 0))
887 return !IsWordChar (text[ptr]);
888 else if ((!regex_rtl && ptr == text_end) || (regex_rtl && ptr == text_start))
889 return !IsWordChar (text[ptr - 1]);
891 return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]);
898 private bool IsWordChar (char c) {
899 return CategoryUtils.IsCategory (Category.Word, c);
902 private string GetString (int pc) {
903 int len = program[pc + 1];
906 char[] cs = new char[len];
907 for (int i = 0; i < len; ++ i)
908 cs[i] = (char)program[str ++];
910 return new string (cs);
913 // capture management
915 private void Open (int gid, int ptr) {
916 int m = groups [gid];
917 if (m < mark_start || marks [m].IsDefined) {
922 marks [m].Start = ptr;
925 private void Close (int gid, int ptr) {
926 marks [groups [gid]].End = ptr;
929 private bool Balance (int gid, int balance_gid, bool capture, int ptr) {
930 int b = groups [balance_gid];
932 if(b == -1 || marks[b].Index < 0) {
933 //Group not previously matched
936 Debug.Assert (marks [b].IsDefined, "Regex", "Balancng group not closed");
937 if (gid > 0 && capture){
938 Open (gid, marks [b].Index + marks [b].Length);
942 groups [balance_gid] = marks[b].Previous;
947 private int Checkpoint () {
948 mark_start = mark_end;
952 private void Backtrack (int cp) {
953 Debug.Assert (cp > mark_start, "Regex", "Attempt to backtrack forwards");
954 for (int i = 0; i < groups.Length; ++ i) {
957 marks [m].Start = -1;
958 m = marks [m].Previous;
966 private void ResetGroups () {
967 int n = groups.Length;
969 marks = new Mark [n * 10];
971 for (int i = 0; i < n; ++ i) {
974 marks [i].Start = -1;
976 marks [i].Previous = -1;
983 private int GetLastDefined (int gid) {
984 int m = groups [gid];
985 while (m >= 0 && !marks [m].IsDefined)
986 m = marks [m].Previous;
991 private int CreateMark (int previous) {
992 if (mark_end == marks.Length) {
993 Mark [] dest = new Mark [marks.Length * 2];
994 marks.CopyTo (dest, 0);
999 marks [m].Start = marks [m].End = -1;
1000 marks [m].Previous = previous;
1005 private void GetGroupInfo (int gid, out int first_mark_index, out int n_caps)
1007 first_mark_index = -1;
1009 for (int m = groups [gid]; m >= 0; m = marks [m].Previous) {
1010 if (!marks [m].IsDefined)
1012 if (first_mark_index < 0)
1013 first_mark_index = m;
1018 private void PopulateGroup (Group g, int first_mark_index, int n_caps)
1021 for (int m = marks [first_mark_index].Previous; m >= 0; m = marks [m].Previous) {
1022 if (!marks [m].IsDefined)
1024 Capture cap = new Capture (text, marks [m].Index, marks [m].Length);
1025 g.Captures.SetValue (cap, n_caps - 1 - i);
1030 private Match GenerateMatch (Regex regex)
1032 int n_caps, first_mark_index;
1034 GetGroupInfo (0, out first_mark_index, out n_caps);
1036 // Avoid fully populating the Match instance if not needed
1037 if (!needs_groups_or_captures)
1038 return new Match (regex, this, text, text_end, 0, marks [first_mark_index].Index, marks [first_mark_index].Length);
1040 Match retval = new Match (regex, this, text, text_end, groups.Length,
1041 marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps);
1042 PopulateGroup (retval, first_mark_index, n_caps);
1044 for (int gid = 1; gid < groups.Length; ++ gid) {
1045 GetGroupInfo (gid, out first_mark_index, out n_caps);
1046 if (first_mark_index < 0) {
1049 g = new Group (text, marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps);
1050 PopulateGroup (g, first_mark_index, n_caps);
1052 retval.Groups.SetValue (g, gid);
1057 // interpreter attributes
1059 private ushort[] program; // regex program
1060 private int program_start; // first instruction after info block
1061 private string text; // input text
1062 private int text_end; // end of input text (last character + 1)
1063 private int group_count; // number of capturing groups
1064 private int match_min;//, match_max; // match width information
1065 private QuickSearch qs; // fast substring matcher
1067 private bool regex_rtl;
1068 private int text_start;
1069 private bool substring_mode;
1070 private bool initialized;
1074 private int scan_ptr; // start of scan
1076 private RepeatContext repeat; // current repeat context
1077 private RepeatContext fast; // fast repeat context
1079 // Repeat/Until handling
1080 private IntStack stack = new IntStack (); // utility stack
1081 private RepeatContext deep; // points to the most-nested repeat context
1083 private Mark[] marks = null; // mark stack
1084 private int mark_start; // start of current checkpoint
1085 private int mark_end; // end of checkpoint/next free mark
1087 private int[] groups; // current group definitions
1091 private struct IntStack {
1096 return values [--count];
1098 public void Push (int value)
1100 if (values == null) {
1101 values = new int [8];
1102 } else if (count == values.Length) {
1103 int new_size = values.Length;
1104 new_size += new_size >> 1;
1105 int [] new_values = new int [new_size];
1106 for (int i = 0; i < count; ++i)
1107 new_values [i] = values [i];
1108 values = new_values;
1110 values [count++] = value;
1113 get { return values [count - 1]; }
1116 get { return count; }
1119 throw new SystemException ("can only truncate the stack");
1125 private class RepeatContext {
1126 public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) {
1127 this.previous = previous;
1131 this.expr_pc = expr_pc;
1138 get { return count; }
1139 set { count = value; }
1143 get { return start; }
1144 set { start = value; }
1147 public bool IsMinimum {
1148 get { return min <= count; }
1151 public bool IsMaximum {
1152 get { return max <= count; }
1155 public bool IsLazy {
1156 get { return lazy; }
1159 public int Expression {
1160 get { return expr_pc; }
1163 public RepeatContext Previous {
1164 get { return previous; }
1168 private int min, max;
1170 private int expr_pc;
1171 private RepeatContext previous;