3 // namespace: System.Text.RegularExpressions
4 // file: interpreter.cs
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
32 using System.Diagnostics;
33 using System.Globalization;
35 namespace System.Text.RegularExpressions {
37 class Interpreter : IMachine {
38 public Interpreter (ushort[] program) {
39 this.program = program;
44 Debug.Assert ((OpCode)program[0] == OpCode.Info, "Regex", "Cant' find info block");
46 this.group_count = program[1] + 1;
47 this.match_min = program[2];
48 this.match_max = program[3];
52 this.program_start = 4;
53 this.groups = new int [group_count];
56 // IMachine implementation
58 public Match Scan (Regex regex, string text, int start, int end) {
61 this.scan_ptr = start;
63 if (Eval (Mode.Match, ref scan_ptr, program_start))
64 return GenerateMatch (regex);
71 private void Reset () {
76 private bool Eval (Mode mode, ref int ref_ptr, int pc) {
80 ushort word = program[pc];
81 OpCode op = (OpCode)(word & 0x00ff);
82 OpFlags flags = (OpFlags)(word & 0xff00);
86 int skip = program[pc + 1];
88 int anch_offset = program[pc + 2];
89 bool anch_reverse = (flags & OpFlags.RightToLeft) != 0;
90 int anch_ptr = anch_reverse ? ptr - anch_offset : ptr + anch_offset;
91 int anch_end = text_end - match_min + anch_offset; // maximum anchor position
97 // the general case for an anchoring expression is at the bottom, however we
98 // do some checks for the common cases before to save processing time. the current
99 // optimizer only outputs three types of anchoring expressions: fixed position,
100 // fixed substring, and no anchor.
102 OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff);
103 if (anch_op == OpCode.Position && skip == 6) { // position anchor
108 switch ((Position)program[pc + 4]) {
109 case Position.StartOfString:
110 if (anch_reverse || anch_offset == 0) {
112 if (TryMatch (ref ptr, pc + skip))
117 case Position.StartOfLine:
121 if (TryMatch (ref ptr, pc + skip))
127 while ((anch_reverse && anch_ptr >= 0) || (!anch_reverse && anch_ptr <= anch_end)) {
128 if (anch_ptr == 0 || text[anch_ptr - 1] == '\n') {
130 ptr = anch_ptr == anch_end ? anch_ptr : anch_ptr + anch_offset;
132 ptr = anch_ptr == 0 ? anch_ptr : anch_ptr - anch_offset;
133 if (TryMatch (ref ptr, pc + skip))
144 case Position.StartOfScan:
145 if (anch_ptr == scan_ptr) {
146 ptr = anch_reverse ? scan_ptr + anch_offset : scan_ptr - anch_offset;
147 if (TryMatch (ref ptr, pc + skip))
157 else if (qs != null ||
158 (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor
163 bool reverse = ((OpFlags)program[pc + 3] & OpFlags.RightToLeft) != 0;
164 string substring = GetString (pc + 3);
167 bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0;
169 qs = new QuickSearch (substring, ignore, reverse);
171 while ((anch_reverse && anch_ptr >= anch_begin)
172 || (!anch_reverse && anch_ptr <= anch_end)) {
176 anch_ptr = qs.Search (text, anch_ptr, anch_begin);
178 anch_ptr += substring.Length ;
182 anch_ptr = qs.Search (text, anch_ptr, anch_end);
186 ptr = reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
187 if (TryMatch (ref ptr, pc + skip))
196 else if (anch_op == OpCode.True) { // no anchor
201 while ((anch_reverse && anch_ptr >= anch_begin)
202 || (!anch_reverse && anch_ptr <= anch_end)) {
205 if (TryMatch (ref ptr, pc + skip))
213 else { // general case
218 while ((anch_reverse && anch_ptr >= anch_begin)
219 || (!anch_reverse && anch_ptr <= anch_end)) {
222 if (Eval (Mode.Match, ref ptr, pc + 3)) {
223 // anchor expression passed: try real expression at the correct offset
225 ptr = anch_reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
226 if (TryMatch (ref ptr, pc + skip))
248 case OpCode.Position: {
249 if (!IsPosition ((Position)program[pc + 1], ptr))
255 case OpCode.String: {
256 bool reverse = (flags & OpFlags.RightToLeft) != 0;
257 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
258 int len = program[pc + 1];
266 if (ptr + len > text_end)
270 for (int i = 0; i < len; ++ i) {
271 char c = text[ptr + i];
273 c = Char.ToLower (c);
275 if (c != (char)program[pc ++])
284 case OpCode.Reference: {
285 bool reverse = (flags & OpFlags.RightToLeft) != 0;
286 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
287 int m = GetLastDefined (program [pc + 1]);
291 int str = marks [m].Index;
292 int len = marks [m].Length;
299 else if (ptr + len > text_end)
303 for (int i = 0; i < len; ++ i) {
305 if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i]))
309 if (text[ptr + i] != text[str + i])
319 case OpCode.Character: case OpCode.Category:
320 case OpCode.Range: case OpCode.Set: {
321 if (!EvalChar (mode, ref ptr, ref pc, false))
327 int target = pc + program[pc + 1];
329 if (!EvalChar (mode, ref ptr, ref pc, true))
337 Open (program[pc + 1], ptr);
343 Close (program[pc + 1], ptr);
348 case OpCode.BalanceStart: {
350 int start = ptr; //point before the balancing group
352 if (!Eval (Mode.Match, ref ptr, pc + 5))
357 if(!Balance (program[pc + 1], program[pc + 2], (program[pc + 3] == 1 ? true : false) , start)) {
362 pc += program[pc + 4];
366 case OpCode.Balance: {
370 case OpCode.IfDefined: {
371 int m = GetLastDefined (program [pc + 2]);
373 pc += program[pc + 1];
380 if (!Eval (Mode.Match, ref ptr, pc + 2))
383 pc += program[pc + 1];
388 int cp = Checkpoint ();
390 if (Eval (Mode.Match, ref test_ptr, pc + 3))
391 pc += program[pc + 1];
394 pc += program[pc + 2];
399 case OpCode.Branch: {
402 int cp = Checkpoint ();
403 if (Eval (Mode.Match, ref ptr, pc + 2))
408 pc += program[pc + 1];
409 branch_op = (OpCode)(program[pc] & 0xff);
410 } while (branch_op != OpCode.False);
416 pc += program[pc + 1];
420 case OpCode.Repeat: {
421 this.repeat = new RepeatContext (
422 this.repeat, // previous context
423 program[pc + 2], // minimum
424 program[pc + 3], // maximum
425 (flags & OpFlags.Lazy) != 0, // lazy
426 pc + 4 // subexpression
429 if (Eval (Mode.Match, ref ptr, pc + program[pc + 1]))
432 this.repeat = this.repeat.Previous;
438 RepeatContext current = this.repeat;
439 int start = current.Start;
441 if (!current.IsMinimum) {
444 if (Eval (Mode.Match, ref ptr, repeat.Expression))
447 current.Start = start;
452 if (ptr == current.Start) {
453 // degenerate match ... match tail or fail
455 this.repeat = current.Previous;
456 if (Eval (Mode.Match, ref ptr, pc + 1))
459 this.repeat = current;
463 if (current.IsLazy) {
464 // match tail first ...
466 this.repeat = current.Previous;
467 int cp = Checkpoint ();
468 if (Eval (Mode.Match, ref ptr, pc + 1))
473 // ... then match more
475 this.repeat = current;
476 if (!current.IsMaximum) {
479 if (Eval (Mode.Match, ref ptr, current.Expression))
482 current.Start = start;
490 // match more first ...
492 if (!current.IsMaximum) {
493 int cp = Checkpoint ();
496 if (Eval (Mode.Match, ref ptr, current.Expression))
499 current.Start = start;
504 // ... then match tail
506 this.repeat = current.Previous;
507 if (Eval (Mode.Match, ref ptr, pc + 1))
510 this.repeat = current;
515 case OpCode.FastRepeat: {
516 this.fast = new RepeatContext (
518 program[pc + 2], // minimum
519 program[pc + 3], // maximum
520 (flags & OpFlags.Lazy) != 0, // lazy
521 pc + 4 // subexpression
525 int cp = Checkpoint ();
527 pc += program[pc + 1]; // tail expression
528 ushort tail_word = program[pc];
530 int c1, c2; // first character of tail operator
531 int coff; // 0 or -1 depending on direction
533 OpCode tail_op = (OpCode)(tail_word & 0xff);
534 if (tail_op == OpCode.Character || tail_op == OpCode.String) {
535 OpFlags tail_flags = (OpFlags)(tail_word & 0xff00);
537 if (tail_op == OpCode.String)
541 if ((tail_flags & OpFlags.RightToLeft) != 0)
543 offset = program[pc + 1] - 1 ;
546 c1 = program[pc + 2 + offset]; // first char of string
549 c1 = program[pc + 1]; // character
551 if ((tail_flags & OpFlags.IgnoreCase) != 0)
552 c2 = Char.ToUpper ((char)c1); // ignore case
556 if ((tail_flags & OpFlags.RightToLeft) != 0)
557 coff = -1; // reverse
567 if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) {
568 //Console.WriteLine ("lazy fast: failed mininum.");
569 fast = fast.Previous;
575 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
576 Eval (Mode.Match, ref ptr, pc))
579 if (fast.IsMaximum) {
580 //Console.WriteLine ("lazy fast: failed with maximum.");
581 fast = fast.Previous;
586 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
587 //Console.WriteLine ("lazy fast: no more.");
588 fast = fast.Previous;
592 fast = fast.Previous;
596 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
597 fast = fast.Previous;
603 width = (ptr - fast.Start) / fast.Count;
609 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
610 Eval (Mode.Match, ref ptr, pc))
614 if (!fast.IsMinimum) {
615 fast = fast.Previous;
622 fast = fast.Previous;
628 Debug.Assert (false, "Regex", "Info block found in pattern");
642 if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum))
645 pc = fast.Expression;
656 if (!fast.IsLazy && fast.IsMinimum)
659 ref_ptr = fast.Start;
667 private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) {
668 bool consumed = false;
674 ushort word = program[pc];
675 OpCode op = (OpCode)(word & 0x00ff);
676 OpFlags flags = (OpFlags)(word & 0xff00);
680 ignore = (flags & OpFlags.IgnoreCase) != 0;
682 // consume character: the direction of an In construct is
683 // determined by the direction of its first op
686 if ((flags & OpFlags.RightToLeft) != 0) {
700 c = Char.ToLower (c);
707 negate = (flags & OpFlags.Negate) != 0;
718 case OpCode.Character: {
719 if (c == (char)program[pc ++])
724 case OpCode.Category: {
725 if (CategoryUtils.IsCategory ((Category)program[pc ++], c))
732 int lo = (char)program[pc ++];
733 int hi = (char)program[pc ++];
734 if (lo <= c && c <= hi)
740 int lo = (char)program[pc ++];
741 int len = (char)program[pc ++];
746 if (i < 0 || i >= len << 4)
750 if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0)
760 private bool TryMatch (ref int ref_ptr, int pc) {
764 marks [groups [0]].Start = ptr;
765 if (Eval (Mode.Match, ref ptr, pc)) {
766 marks [groups [0]].End = ptr;
774 private bool IsPosition (Position pos, int ptr) {
776 case Position.Start: case Position.StartOfString:
779 case Position.StartOfLine:
780 return ptr == 0 || text[ptr - 1] == '\n';
782 case Position.StartOfScan:
783 return ptr == scan_ptr;
786 return ptr == text_end ||
787 (ptr == text_end - 1 && text[ptr] == '\n');
789 case Position.EndOfLine:
790 return ptr == text_end || text[ptr] == '\n';
792 case Position.EndOfString:
793 return ptr == text_end;
795 case Position.Boundary:
800 return IsWordChar (text[ptr]);
801 else if (ptr == text_end)
802 return IsWordChar (text[ptr - 1]);
804 return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]);
806 case Position.NonBoundary:
811 return !IsWordChar (text[ptr]);
812 else if (ptr == text_end)
813 return !IsWordChar (text[ptr - 1]);
815 return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]);
822 private bool IsWordChar (char c) {
823 return CategoryUtils.IsCategory (Category.Word, c);
826 private string GetString (int pc) {
827 int len = program[pc + 1];
830 char[] cs = new char[len];
831 for (int i = 0; i < len; ++ i)
832 cs[i] = (char)program[str ++];
834 return new string (cs);
837 // capture management
839 private void Open (int gid, int ptr) {
840 int m = groups [gid];
841 if (m < mark_start || marks [m].IsDefined) {
846 marks [m].Start = ptr;
849 private void Close (int gid, int ptr) {
850 marks [groups [gid]].End = ptr;
853 private bool Balance (int gid, int balance_gid, bool capture, int ptr) {
854 int b = groups [balance_gid];
856 if(b == -1 || marks[b].Index < 0) {
857 //Group not previously matched
861 Debug.Assert (marks [b].IsDefined, "Regex", "Balancng group not closed");
863 if (gid > 0 && capture){
864 Open (gid, marks [b].Index + marks [b].Length);
868 groups [balance_gid] = marks[b].Previous;
873 private int Checkpoint () {
874 mark_start = mark_end;
878 private void Backtrack (int cp) {
879 Debug.Assert (cp > mark_start, "Regex", "Attempt to backtrack forwards");
881 for (int i = 0; i < groups.Length; ++ i) {
884 m = marks [m].Previous;
890 private void ResetGroups () {
891 int n = groups.Length;
893 marks = new Mark [n * 10];
895 for (int i = 0; i < n; ++ i) {
898 marks [i].Start = -1;
900 marks [i].Previous = -1;
907 private int GetLastDefined (int gid) {
908 int m = groups [gid];
909 while (m >= 0 && !marks [m].IsDefined)
910 m = marks [m].Previous;
915 private int CreateMark (int previous) {
916 if (mark_end == marks.Length) {
917 Mark [] dest = new Mark [marks.Length * 2];
918 marks.CopyTo (dest, 0);
923 marks [m].Start = marks [m].End = -1;
924 marks [m].Previous = previous;
929 private Match GenerateMatch (Regex regex) {
930 int[][] grps = new int[groups.Length][];
931 ArrayList caps = new ArrayList ();
933 for (int gid = 0; gid < groups.Length; ++ gid) {
935 for (int m = groups[gid]; m >= 0; m = marks[m].Previous) {
936 if (!marks[m].IsDefined)
939 caps.Add (marks[m].Index);
940 caps.Add (marks[m].Length);
943 grps[gid] = (int[])caps.ToArray (typeof (int));
946 return new Match (regex, this, text, text_end, grps);
949 // interpreter attributes
951 private ushort[] program; // regex program
952 private int program_start; // first instruction after info block
953 private string text; // input text
954 private int text_end; // end of input text (last character + 1)
955 private int group_count; // number of capturing groups
956 private int match_min, match_max; // match width information
957 private QuickSearch qs; // fast substring matcher
961 private int scan_ptr; // start of scan
963 private RepeatContext repeat; // current repeat context
964 private RepeatContext fast; // fast repeat context
966 private Mark[] marks = null; // mark stack
967 private int mark_start; // start of current checkpoint
968 private int mark_end; // end of checkpoint/next free mark
970 private int[] groups; // current group definitions
974 private struct Mark {
975 public int Start, End;
978 public bool IsDefined {
979 get { return Start >= 0 && End >= 0; }
983 get { return Start < End ? Start : End; }
987 get { return Start < End ? End - Start : Start - End; }
991 private class RepeatContext {
992 public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) {
993 this.previous = previous;
997 this.expr_pc = expr_pc;
1004 get { return count; }
1005 set { count = value; }
1009 get { return start; }
1010 set { start = value; }
1013 public bool IsMinimum {
1014 get { return min <= count; }
1017 public bool IsMaximum {
1018 get { return max <= count; }
1021 public bool IsLazy {
1022 get { return lazy; }
1025 public int Expression {
1026 get { return expr_pc; }
1029 public RepeatContext Previous {
1030 get { return previous; }
1034 private int min, max;
1036 private int expr_pc;
1037 private RepeatContext previous;