3 // namespace: System.Text.RegularExpressions
4 // file: interpreter.cs
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
32 using System.Diagnostics;
33 using System.Globalization;
35 namespace System.Text.RegularExpressions {
37 partial class Interpreter : BaseMachine {
38 private int ReadProgramCount (int ptr)
40 int ret = program [ptr + 1];
46 public Interpreter (ushort[] program) {
47 this.program = program;
51 Debug.Assert ((OpCode)program[0] == OpCode.Info, "Regex", "Cant' find info block");
52 this.group_count = ReadProgramCount (1) + 1;
53 this.match_min = ReadProgramCount (3);
54 //this.match_max = ReadProgramCount (5);
58 this.program_start = 7;
59 this.groups = new int [group_count];
62 // IMachine implementation
64 public override Match Scan (Regex regex, string text, int start, int end) {
67 this.scan_ptr = start;
69 if (Eval (Mode.Match, ref scan_ptr, program_start))
70 return GenerateMatch (regex);
77 private void Reset () {
82 private bool Eval (Mode mode, ref int ref_ptr, int pc) {
86 ushort word = program[pc];
87 OpCode op = (OpCode)(word & 0x00ff);
88 OpFlags flags = (OpFlags)(word & 0xff00);
92 int skip = program[pc + 1];
94 int anch_offset = program[pc + 2];
95 bool anch_reverse = (flags & OpFlags.RightToLeft) != 0;
96 int anch_ptr = anch_reverse ? ptr - anch_offset : ptr + anch_offset;
97 int anch_end = text_end - match_min + anch_offset; // maximum anchor position
103 // the general case for an anchoring expression is at the bottom, however we
104 // do some checks for the common cases before to save processing time. the current
105 // optimizer only outputs three types of anchoring expressions: fixed position,
106 // fixed substring, and no anchor.
108 OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff);
109 if (anch_op == OpCode.Position && skip == 6) { // position anchor
114 switch ((Position)program[pc + 4]) {
115 case Position.StartOfString:
116 if (anch_reverse || anch_offset == 0) {
119 if (TryMatch (ref ptr, pc + skip))
124 case Position.StartOfLine:
127 if (TryMatch (ref ptr, pc + skip))
133 while ((anch_reverse && anch_ptr >= 0) || (!anch_reverse && anch_ptr <= anch_end)) {
134 if (anch_ptr == 0 || text[anch_ptr - 1] == '\n') {
136 ptr = anch_ptr == anch_end ? anch_ptr : anch_ptr + anch_offset;
138 ptr = anch_ptr == 0 ? anch_ptr : anch_ptr - anch_offset;
139 if (TryMatch (ref ptr, pc + skip))
150 case Position.StartOfScan:
151 if (anch_ptr == scan_ptr) {
152 ptr = anch_reverse ? scan_ptr + anch_offset : scan_ptr - anch_offset;
153 if (TryMatch (ref ptr, pc + skip))
163 else if (qs != null ||
164 (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor
169 bool reverse = ((OpFlags)program[pc + 3] & OpFlags.RightToLeft) != 0;
172 bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0;
173 string substring = GetString (pc + 3);
174 qs = new QuickSearch (substring, ignore, reverse);
176 while ((anch_reverse && anch_ptr >= anch_begin)
177 || (!anch_reverse && anch_ptr <= anch_end)) {
181 anch_ptr = qs.Search (text, anch_ptr, anch_begin);
183 anch_ptr += qs.Length ;
187 anch_ptr = qs.Search (text, anch_ptr, anch_end);
191 ptr = reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
192 if (TryMatch (ref ptr, pc + skip))
201 else if (anch_op == OpCode.True) { // no anchor
206 while ((anch_reverse && anch_ptr >= anch_begin)
207 || (!anch_reverse && anch_ptr <= anch_end)) {
210 if (TryMatch (ref ptr, pc + skip))
218 else { // general case
223 while ((anch_reverse && anch_ptr >= anch_begin)
224 || (!anch_reverse && anch_ptr <= anch_end)) {
227 if (Eval (Mode.Match, ref ptr, pc + 3)) {
228 // anchor expression passed: try real expression at the correct offset
230 ptr = anch_reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
231 if (TryMatch (ref ptr, pc + skip))
253 case OpCode.Position: {
254 if (!IsPosition ((Position)program[pc + 1], ptr))
260 case OpCode.String: {
261 bool reverse = (flags & OpFlags.RightToLeft) != 0;
262 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
263 int len = program[pc + 1];
271 if (ptr + len > text_end)
275 for (int i = 0; i < len; ++ i) {
276 char c = text[ptr + i];
278 c = Char.ToLower (c);
280 if (c != (char)program[pc ++])
289 case OpCode.Reference: {
290 bool reverse = (flags & OpFlags.RightToLeft) != 0;
291 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
292 int m = GetLastDefined (program [pc + 1]);
296 int str = marks [m].Index;
297 int len = marks [m].Length;
304 else if (ptr + len > text_end)
309 for (int i = 0; i < len; ++ i) {
310 if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i]))
314 for (int i = 0; i < len; ++ i) {
315 if (text[ptr + i] != text[str + i])
325 case OpCode.Character: case OpCode.Category: case OpCode.NotCategory:
326 case OpCode.Range: case OpCode.Set: {
327 if (!EvalChar (mode, ref ptr, ref pc, false))
333 int target = pc + program[pc + 1];
335 if (!EvalChar (mode, ref ptr, ref pc, true))
343 Open (program[pc + 1], ptr);
349 Close (program[pc + 1], ptr);
354 case OpCode.BalanceStart: {
356 int start = ptr; //point before the balancing group
358 if (!Eval (Mode.Match, ref ptr, pc + 5))
363 if(!Balance (program[pc + 1], program[pc + 2], (program[pc + 3] == 1 ? true : false) , start)) {
368 pc += program[pc + 4];
372 case OpCode.Balance: {
376 case OpCode.IfDefined: {
377 int m = GetLastDefined (program [pc + 2]);
379 pc += program[pc + 1];
386 if (!Eval (Mode.Match, ref ptr, pc + 2))
389 pc += program[pc + 1];
394 int cp = Checkpoint ();
396 if (Eval (Mode.Match, ref test_ptr, pc + 3))
397 pc += program[pc + 1];
400 pc += program[pc + 2];
405 case OpCode.Branch: {
408 int cp = Checkpoint ();
409 if (Eval (Mode.Match, ref ptr, pc + 2))
414 pc += program[pc + 1];
415 branch_op = (OpCode)(program[pc] & 0xff);
416 } while (branch_op != OpCode.False);
422 pc += program[pc + 1];
426 case OpCode.Repeat: {
427 this.repeat = new RepeatContext (
428 this.repeat, // previous context
429 ReadProgramCount (pc + 2), // minimum
430 ReadProgramCount (pc + 4), // maximum
431 (flags & OpFlags.Lazy) != 0, // lazy
432 pc + 6 // subexpression
435 if (Eval (Mode.Match, ref ptr, pc + program[pc + 1]))
438 this.repeat = this.repeat.Previous;
444 RepeatContext current = this.repeat;
447 // Can we avoid recursion?
449 // Backtracking can be forced in nested quantifiers from the tail of this quantifier.
450 // Thus, we cannot, in general, use a simple loop on repeat.Expression to handle
453 // If 'deep' was unmolested, that implies that there was no nested quantifiers.
454 // Thus, we can safely avoid recursion.
459 int start = current.Start;
460 int start_count = current.Count;
462 while (!current.IsMinimum) {
466 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
467 current.Start = start;
468 current.Count = start_count;
471 if (deep != current) // recursive mode
475 if (ptr == current.Start) {
476 // degenerate match ... match tail or fail
477 this.repeat = current.Previous;
479 if (Eval (Mode.Match, ref ptr, pc + 1))
482 this.repeat = current;
486 if (current.IsLazy) {
488 // match tail first ...
489 this.repeat = current.Previous;
491 int cp = Checkpoint ();
492 if (Eval (Mode.Match, ref ptr, pc + 1))
497 // ... then match more
498 this.repeat = current;
499 if (current.IsMaximum)
504 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
505 current.Start = start;
506 current.Count = start_count;
509 if (deep != current) // recursive mode
511 // Degenerate match: ptr has not moved since the last (failed) tail match.
512 // So, next and subsequent tail matches will fail.
513 if (ptr == current.Start)
517 int stack_size = stack.Count;
519 // match greedily as much as possible
520 while (!current.IsMaximum) {
521 int cp = Checkpoint ();
523 int old_start = current.Start;
528 if (!Eval (Mode.Match, ref ptr, current.Expression)) {
530 current.Start = old_start;
534 if (deep != current) {
535 // recursive mode: no more backtracking, truncate the stack
536 stack.Count = stack_size;
540 stack.Push (old_ptr);
542 // Degenerate match: no point going on
543 if (ptr == current.Start)
547 // then, match the tail, backtracking as necessary.
548 this.repeat = current.Previous;
551 if (Eval (Mode.Match, ref ptr, pc + 1)) {
552 stack.Count = stack_size;
555 if (stack.Count == stack_size) {
556 this.repeat = current;
562 Backtrack (stack.Pop ());
567 case OpCode.FastRepeat: {
568 this.fast = new RepeatContext (
570 ReadProgramCount (pc + 2), // minimum
571 ReadProgramCount (pc + 4), // maximum
572 (flags & OpFlags.Lazy) != 0, // lazy
573 pc + 6 // subexpression
578 int cp = Checkpoint ();
580 pc += program[pc + 1]; // tail expression
581 ushort tail_word = program[pc];
583 int c1 = -1; // first character of tail operator
584 int c2 = -1; // ... and the same character, in upper case if ignoring case
585 int coff = 0; // 0 or -1 depending on direction
587 OpCode tail_op = (OpCode)(tail_word & 0xff);
588 if (tail_op == OpCode.Character || tail_op == OpCode.String) {
589 OpFlags tail_flags = (OpFlags)(tail_word & 0xff00);
591 if ((tail_flags & OpFlags.Negate) != 0)
594 if (tail_op == OpCode.String)
598 if ((tail_flags & OpFlags.RightToLeft) != 0)
600 offset = program[pc + 1] - 1 ;
603 c1 = program[pc + 2 + offset]; // first char of string
606 c1 = program[pc + 1]; // character
608 if ((tail_flags & OpFlags.IgnoreCase) != 0)
609 c2 = Char.ToUpper ((char)c1); // ignore case
613 if ((tail_flags & OpFlags.RightToLeft) != 0)
614 coff = -1; // reverse
621 if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) {
622 //Console.WriteLine ("lazy fast: failed mininum.");
623 fast = fast.Previous;
629 if (c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) {
631 if (Eval (Mode.Match, ref ptr, pc))
635 if (fast.IsMaximum) {
636 //Console.WriteLine ("lazy fast: failed with maximum.");
637 fast = fast.Previous;
642 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
643 //Console.WriteLine ("lazy fast: no more.");
644 fast = fast.Previous;
648 fast = fast.Previous;
652 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
653 fast = fast.Previous;
659 width = (ptr - fast.Start) / fast.Count;
665 if (c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) {
667 if (Eval (Mode.Match, ref ptr, pc))
672 if (!fast.IsMinimum) {
673 fast = fast.Previous;
680 fast = fast.Previous;
686 Debug.Assert (false, "Regex", "Info block found in pattern");
700 if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum))
703 pc = fast.Expression;
714 if (!fast.IsLazy && fast.IsMinimum)
717 ref_ptr = fast.Start;
725 private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) {
726 bool consumed = false;
732 ushort word = program[pc];
733 OpCode op = (OpCode)(word & 0x00ff);
734 OpFlags flags = (OpFlags)(word & 0xff00);
738 ignore = (flags & OpFlags.IgnoreCase) != 0;
740 // consume character: the direction of an In construct is
741 // determined by the direction of its first op
744 if ((flags & OpFlags.RightToLeft) != 0) {
758 c = Char.ToLower (c);
765 negate = (flags & OpFlags.Negate) != 0;
776 case OpCode.Character: {
777 if (c == (char)program[pc ++])
782 case OpCode.Category: {
783 if (CategoryUtils.IsCategory ((Category)program[pc ++], c))
788 case OpCode.NotCategory: {
789 if (!CategoryUtils.IsCategory ((Category)program[pc ++], c))
795 int lo = (char)program[pc ++];
796 int hi = (char)program[pc ++];
797 if (lo <= c && c <= hi)
803 int lo = (char)program[pc ++];
804 int len = (char)program[pc ++];
809 if (i < 0 || i >= len << 4)
813 if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0)
823 private bool TryMatch (ref int ref_ptr, int pc) {
827 marks [groups [0]].Start = ptr;
828 if (Eval (Mode.Match, ref ptr, pc)) {
829 marks [groups [0]].End = ptr;
837 private bool IsPosition (Position pos, int ptr) {
839 case Position.Start: case Position.StartOfString:
842 case Position.StartOfLine:
843 return ptr == 0 || text[ptr - 1] == '\n';
845 case Position.StartOfScan:
846 return ptr == scan_ptr;
849 return ptr == text_end ||
850 (ptr == text_end - 1 && text[ptr] == '\n');
852 case Position.EndOfLine:
853 return ptr == text_end || text[ptr] == '\n';
855 case Position.EndOfString:
856 return ptr == text_end;
858 case Position.Boundary:
863 return IsWordChar (text[ptr]);
864 else if (ptr == text_end)
865 return IsWordChar (text[ptr - 1]);
867 return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]);
869 case Position.NonBoundary:
874 return !IsWordChar (text[ptr]);
875 else if (ptr == text_end)
876 return !IsWordChar (text[ptr - 1]);
878 return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]);
885 private bool IsWordChar (char c) {
886 return CategoryUtils.IsCategory (Category.Word, c);
889 private string GetString (int pc) {
890 int len = program[pc + 1];
893 char[] cs = new char[len];
894 for (int i = 0; i < len; ++ i)
895 cs[i] = (char)program[str ++];
897 return new string (cs);
900 // capture management
902 private void Open (int gid, int ptr) {
903 int m = groups [gid];
904 if (m < mark_start || marks [m].IsDefined) {
909 marks [m].Start = ptr;
912 private void Close (int gid, int ptr) {
913 marks [groups [gid]].End = ptr;
916 private bool Balance (int gid, int balance_gid, bool capture, int ptr) {
917 int b = groups [balance_gid];
919 if(b == -1 || marks[b].Index < 0) {
920 //Group not previously matched
923 Debug.Assert (marks [b].IsDefined, "Regex", "Balancng group not closed");
924 if (gid > 0 && capture){
925 Open (gid, marks [b].Index + marks [b].Length);
929 groups [balance_gid] = marks[b].Previous;
934 private int Checkpoint () {
935 mark_start = mark_end;
939 private void Backtrack (int cp) {
940 Debug.Assert (cp > mark_start, "Regex", "Attempt to backtrack forwards");
941 for (int i = 0; i < groups.Length; ++ i) {
944 marks [m].Start = -1;
945 m = marks [m].Previous;
953 private void ResetGroups () {
954 int n = groups.Length;
956 marks = new Mark [n * 10];
958 for (int i = 0; i < n; ++ i) {
961 marks [i].Start = -1;
963 marks [i].Previous = -1;
970 private int GetLastDefined (int gid) {
971 int m = groups [gid];
972 while (m >= 0 && !marks [m].IsDefined)
973 m = marks [m].Previous;
978 private int CreateMark (int previous) {
979 if (mark_end == marks.Length) {
980 Mark [] dest = new Mark [marks.Length * 2];
981 marks.CopyTo (dest, 0);
986 marks [m].Start = marks [m].End = -1;
987 marks [m].Previous = previous;
992 private void GetGroupInfo (int gid, out int first_mark_index, out int n_caps)
994 first_mark_index = -1;
996 for (int m = groups [gid]; m >= 0; m = marks [m].Previous) {
997 if (!marks [m].IsDefined)
999 if (first_mark_index < 0)
1000 first_mark_index = m;
1005 private void PopulateGroup (Group g, int first_mark_index, int n_caps)
1008 for (int m = marks [first_mark_index].Previous; m >= 0; m = marks [m].Previous) {
1009 if (!marks [m].IsDefined)
1011 Capture cap = new Capture (text, marks [m].Index, marks [m].Length);
1012 g.Captures.SetValue (cap, n_caps - 1 - i);
1017 private Match GenerateMatch (Regex regex)
1019 int n_caps, first_mark_index;
1021 GetGroupInfo (0, out first_mark_index, out n_caps);
1023 // Avoid fully populating the Match instance if not needed
1024 if (!needs_groups_or_captures)
1025 return new Match (regex, this, text, text_end, 0, marks [first_mark_index].Index, marks [first_mark_index].Length);
1027 Match retval = new Match (regex, this, text, text_end, groups.Length,
1028 marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps);
1029 PopulateGroup (retval, first_mark_index, n_caps);
1031 for (int gid = 1; gid < groups.Length; ++ gid) {
1032 GetGroupInfo (gid, out first_mark_index, out n_caps);
1033 if (first_mark_index < 0) {
1036 g = new Group (text, marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps);
1037 PopulateGroup (g, first_mark_index, n_caps);
1039 retval.Groups.SetValue (g, gid);
1044 // interpreter attributes
1046 private ushort[] program; // regex program
1047 private int program_start; // first instruction after info block
1048 private string text; // input text
1049 private int text_end; // end of input text (last character + 1)
1050 private int group_count; // number of capturing groups
1051 private int match_min;//, match_max; // match width information
1052 private QuickSearch qs; // fast substring matcher
1056 private int scan_ptr; // start of scan
1058 private RepeatContext repeat; // current repeat context
1059 private RepeatContext fast; // fast repeat context
1061 // Repeat/Until handling
1062 private IntStack stack = new IntStack (); // utility stack
1063 private RepeatContext deep; // points to the most-nested repeat context
1065 private Mark[] marks = null; // mark stack
1066 private int mark_start; // start of current checkpoint
1067 private int mark_end; // end of checkpoint/next free mark
1069 private int[] groups; // current group definitions
1073 private struct IntStack {
1078 return values [--count];
1080 public void Push (int value)
1082 if (values == null) {
1083 values = new int [8];
1084 } else if (count == values.Length) {
1085 int new_size = values.Length;
1086 new_size += new_size >> 1;
1087 int [] new_values = new int [new_size];
1088 for (int i = 0; i < count; ++i)
1089 new_values [i] = values [i];
1090 values = new_values;
1092 values [count++] = value;
1095 get { return values [count - 1]; }
1098 get { return count; }
1101 throw new SystemException ("can only truncate the stack");
1107 private class RepeatContext {
1108 public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) {
1109 this.previous = previous;
1113 this.expr_pc = expr_pc;
1120 get { return count; }
1121 set { count = value; }
1125 get { return start; }
1126 set { start = value; }
1129 public bool IsMinimum {
1130 get { return min <= count; }
1133 public bool IsMaximum {
1134 get { return max <= count; }
1137 public bool IsLazy {
1138 get { return lazy; }
1141 public int Expression {
1142 get { return expr_pc; }
1145 public RepeatContext Previous {
1146 get { return previous; }
1150 private int min, max;
1152 private int expr_pc;
1153 private RepeatContext previous;