3 // namespace: System.Text.RegularExpressions
4 // file: interpreter.cs
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 using System.Collections;
11 using System.Diagnostics;
12 using System.Globalization;
14 namespace System.Text.RegularExpressions {
16 class Interpreter : IMachine {
17 public Interpreter (ushort[] program) {
18 this.program = program;
23 Debug.Assert ((OpCode)program[0] == OpCode.Info, "Regex", "Cant' find info block");
25 this.group_count = program[1] + 1;
26 this.match_min = program[2];
27 this.match_max = program[3];
31 this.program_start = 4;
32 this.groups = new int [group_count];
35 // IMachine implementation
37 public Match Scan (Regex regex, string text, int start, int end) {
40 this.scan_ptr = start;
42 if (Eval (Mode.Match, ref scan_ptr, program_start))
43 return GenerateMatch (regex);
50 private void Reset () {
55 private bool Eval (Mode mode, ref int ref_ptr, int pc) {
59 ushort word = program[pc];
60 OpCode op = (OpCode)(word & 0x00ff);
61 OpFlags flags = (OpFlags)(word & 0xff00);
65 int skip = program[pc + 1];
67 int anch_offset = program[pc + 2];
68 bool anch_reverse = (flags & OpFlags.RightToLeft) != 0;
69 int anch_ptr = anch_reverse ? ptr - anch_offset : ptr + anch_offset;
70 int anch_end = text_end - match_min + anch_offset; // maximum anchor position
76 // the general case for an anchoring expression is at the bottom, however we
77 // do some checks for the common cases before to save processing time. the current
78 // optimizer only outputs three types of anchoring expressions: fixed position,
79 // fixed substring, and no anchor.
81 OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff);
82 if (anch_op == OpCode.Position && skip == 6) { // position anchor
87 switch ((Position)program[pc + 4]) {
88 case Position.StartOfString:
89 if (anch_reverse || anch_offset == 0) {
91 if (TryMatch (ref ptr, pc + skip))
96 case Position.StartOfLine:
100 if (TryMatch (ref ptr, pc + skip))
106 while ((anch_reverse && anch_ptr >= 0) || (!anch_reverse && anch_ptr <= anch_end)) {
107 if (anch_ptr == 0 || text[anch_ptr - 1] == '\n') {
109 ptr = anch_ptr == anch_end ? anch_ptr : anch_ptr + anch_offset;
111 ptr = anch_ptr == 0 ? anch_ptr : anch_ptr - anch_offset;
112 if (TryMatch (ref ptr, pc + skip))
123 case Position.StartOfScan:
124 if (anch_ptr == scan_ptr) {
125 ptr = anch_reverse ? scan_ptr + anch_offset : scan_ptr - anch_offset;
126 if (TryMatch (ref ptr, pc + skip))
136 else if (qs != null ||
137 (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor
142 bool reverse = ((OpFlags)program[pc + 3] & OpFlags.RightToLeft) != 0;
143 string substring = GetString (pc + 3);
146 bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0;
148 qs = new QuickSearch (substring, ignore, reverse);
150 while ((anch_reverse && anch_ptr >= anch_begin)
151 || (!anch_reverse && anch_ptr <= anch_end)) {
155 anch_ptr = qs.Search (text, anch_ptr, anch_begin);
157 anch_ptr += substring.Length ;
161 anch_ptr = qs.Search (text, anch_ptr, anch_end);
165 ptr = reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
166 if (TryMatch (ref ptr, pc + skip))
175 else if (anch_op == OpCode.True) { // no anchor
180 while ((anch_reverse && anch_ptr >= anch_begin)
181 || (!anch_reverse && anch_ptr <= anch_end)) {
184 if (TryMatch (ref ptr, pc + skip))
192 else { // general case
197 while ((anch_reverse && anch_ptr >= anch_begin)
198 || (!anch_reverse && anch_ptr <= anch_end)) {
201 if (Eval (Mode.Match, ref ptr, pc + 3)) {
202 // anchor expression passed: try real expression at the correct offset
204 ptr = anch_reverse ? anch_ptr + anch_offset : anch_ptr - anch_offset;
205 if (TryMatch (ref ptr, pc + skip))
227 case OpCode.Position: {
228 if (!IsPosition ((Position)program[pc + 1], ptr))
234 case OpCode.String: {
235 bool reverse = (flags & OpFlags.RightToLeft) != 0;
236 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
237 int len = program[pc + 1];
245 if (ptr + len > text_end)
249 for (int i = 0; i < len; ++ i) {
250 char c = text[ptr + i];
252 c = Char.ToLower (c);
254 if (c != (char)program[pc ++])
263 case OpCode.Reference: {
264 bool reverse = (flags & OpFlags.RightToLeft) != 0;
265 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
266 int m = GetLastDefined (program [pc + 1]);
270 int str = marks [m].Index;
271 int len = marks [m].Length;
278 else if (ptr + len > text_end)
282 for (int i = 0; i < len; ++ i) {
284 if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i]))
288 if (text[ptr + i] != text[str + i])
298 case OpCode.Character: case OpCode.Category:
299 case OpCode.Range: case OpCode.Set: {
300 if (!EvalChar (mode, ref ptr, ref pc, false))
306 int target = pc + program[pc + 1];
308 if (!EvalChar (mode, ref ptr, ref pc, true))
316 Open (program[pc + 1], ptr);
322 Close (program[pc + 1], ptr);
327 case OpCode.BalanceStart: {
329 int start = ptr; //point before the balancing group
331 if (!Eval (Mode.Match, ref ptr, pc + 5))
336 if(!Balance (program[pc + 1], program[pc + 2], (program[pc + 3] == 1 ? true : false) , start)) {
341 pc += program[pc + 4];
345 case OpCode.Balance: {
349 case OpCode.IfDefined: {
350 int m = GetLastDefined (program [pc + 2]);
352 pc += program[pc + 1];
359 if (!Eval (Mode.Match, ref ptr, pc + 2))
362 pc += program[pc + 1];
367 int cp = Checkpoint ();
369 if (Eval (Mode.Match, ref test_ptr, pc + 3))
370 pc += program[pc + 1];
373 pc += program[pc + 2];
378 case OpCode.Branch: {
381 int cp = Checkpoint ();
382 if (Eval (Mode.Match, ref ptr, pc + 2))
387 pc += program[pc + 1];
388 branch_op = (OpCode)(program[pc] & 0xff);
389 } while (branch_op != OpCode.False);
395 pc += program[pc + 1];
399 case OpCode.Repeat: {
400 this.repeat = new RepeatContext (
401 this.repeat, // previous context
402 program[pc + 2], // minimum
403 program[pc + 3], // maximum
404 (flags & OpFlags.Lazy) != 0, // lazy
405 pc + 4 // subexpression
408 if (Eval (Mode.Match, ref ptr, pc + program[pc + 1]))
411 this.repeat = this.repeat.Previous;
417 RepeatContext current = this.repeat;
418 int start = current.Start;
420 if (!current.IsMinimum) {
423 if (Eval (Mode.Match, ref ptr, repeat.Expression))
426 current.Start = start;
431 if (ptr == current.Start) {
432 // degenerate match ... match tail or fail
434 this.repeat = current.Previous;
435 if (Eval (Mode.Match, ref ptr, pc + 1))
438 this.repeat = current;
442 if (current.IsLazy) {
443 // match tail first ...
445 this.repeat = current.Previous;
446 int cp = Checkpoint ();
447 if (Eval (Mode.Match, ref ptr, pc + 1))
452 // ... then match more
454 this.repeat = current;
455 if (!current.IsMaximum) {
458 if (Eval (Mode.Match, ref ptr, current.Expression))
461 current.Start = start;
469 // match more first ...
471 if (!current.IsMaximum) {
472 int cp = Checkpoint ();
475 if (Eval (Mode.Match, ref ptr, current.Expression))
478 current.Start = start;
483 // ... then match tail
485 this.repeat = current.Previous;
486 if (Eval (Mode.Match, ref ptr, pc + 1))
489 this.repeat = current;
494 case OpCode.FastRepeat: {
495 this.fast = new RepeatContext (
497 program[pc + 2], // minimum
498 program[pc + 3], // maximum
499 (flags & OpFlags.Lazy) != 0, // lazy
500 pc + 4 // subexpression
504 int cp = Checkpoint ();
506 pc += program[pc + 1]; // tail expression
507 ushort tail_word = program[pc];
509 int c1, c2; // first character of tail operator
510 int coff; // 0 or -1 depending on direction
512 OpCode tail_op = (OpCode)(tail_word & 0xff);
513 if (tail_op == OpCode.Character || tail_op == OpCode.String) {
514 OpFlags tail_flags = (OpFlags)(tail_word & 0xff00);
516 if (tail_op == OpCode.String)
520 if ((tail_flags & OpFlags.RightToLeft) != 0)
522 offset = program[pc + 1] - 1 ;
525 c1 = program[pc + 2 + offset]; // first char of string
528 c1 = program[pc + 1]; // character
530 if ((tail_flags & OpFlags.IgnoreCase) != 0)
531 c2 = Char.ToUpper ((char)c1); // ignore case
535 if ((tail_flags & OpFlags.RightToLeft) != 0)
536 coff = -1; // reverse
546 if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) {
547 //Console.WriteLine ("lazy fast: failed mininum.");
548 fast = fast.Previous;
554 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
555 Eval (Mode.Match, ref ptr, pc))
558 if (fast.IsMaximum) {
559 //Console.WriteLine ("lazy fast: failed with maximum.");
560 fast = fast.Previous;
565 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
566 //Console.WriteLine ("lazy fast: no more.");
567 fast = fast.Previous;
571 fast = fast.Previous;
575 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
576 fast = fast.Previous;
582 width = (ptr - fast.Start) / fast.Count;
588 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
589 Eval (Mode.Match, ref ptr, pc))
593 if (!fast.IsMinimum) {
594 fast = fast.Previous;
601 fast = fast.Previous;
607 Debug.Assert (false, "Regex", "Info block found in pattern");
621 if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum))
624 pc = fast.Expression;
635 if (!fast.IsLazy && fast.IsMinimum)
638 ref_ptr = fast.Start;
646 private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) {
647 bool consumed = false;
653 ushort word = program[pc];
654 OpCode op = (OpCode)(word & 0x00ff);
655 OpFlags flags = (OpFlags)(word & 0xff00);
659 ignore = (flags & OpFlags.IgnoreCase) != 0;
661 // consume character: the direction of an In construct is
662 // determined by the direction of its first op
665 if ((flags & OpFlags.RightToLeft) != 0) {
679 c = Char.ToLower (c);
686 negate = (flags & OpFlags.Negate) != 0;
697 case OpCode.Character: {
698 if (c == (char)program[pc ++])
703 case OpCode.Category: {
704 if (CategoryUtils.IsCategory ((Category)program[pc ++], c))
711 int lo = (char)program[pc ++];
712 int hi = (char)program[pc ++];
713 if (lo <= c && c <= hi)
719 int lo = (char)program[pc ++];
720 int len = (char)program[pc ++];
725 if (i < 0 || i >= len << 4)
729 if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0)
739 private bool TryMatch (ref int ref_ptr, int pc) {
743 marks [groups [0]].Start = ptr;
744 if (Eval (Mode.Match, ref ptr, pc)) {
745 marks [groups [0]].End = ptr;
753 private bool IsPosition (Position pos, int ptr) {
755 case Position.Start: case Position.StartOfString:
758 case Position.StartOfLine:
759 return ptr == 0 || text[ptr - 1] == '\n';
761 case Position.StartOfScan:
762 return ptr == scan_ptr;
765 return ptr == text_end ||
766 (ptr == text_end - 1 && text[ptr] == '\n');
768 case Position.EndOfLine:
769 return ptr == text_end || text[ptr] == '\n';
771 case Position.EndOfString:
772 return ptr == text_end;
774 case Position.Boundary:
779 return IsWordChar (text[ptr]);
780 else if (ptr == text_end)
781 return IsWordChar (text[ptr - 1]);
783 return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]);
785 case Position.NonBoundary:
790 return !IsWordChar (text[ptr]);
791 else if (ptr == text_end)
792 return !IsWordChar (text[ptr - 1]);
794 return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]);
801 private bool IsWordChar (char c) {
802 return CategoryUtils.IsCategory (Category.Word, c);
805 private string GetString (int pc) {
806 int len = program[pc + 1];
809 char[] cs = new char[len];
810 for (int i = 0; i < len; ++ i)
811 cs[i] = (char)program[str ++];
813 return new string (cs);
816 // capture management
818 private void Open (int gid, int ptr) {
819 int m = groups [gid];
820 if (m < mark_start || marks [m].IsDefined) {
825 marks [m].Start = ptr;
828 private void Close (int gid, int ptr) {
829 marks [groups [gid]].End = ptr;
832 private bool Balance (int gid, int balance_gid, bool capture, int ptr) {
833 int b = groups [balance_gid];
835 if(b == -1 || marks[b].Index < 0) {
836 //Group not previously matched
840 Debug.Assert (marks [b].IsDefined, "Regex", "Balancng group not closed");
842 if (gid > 0 && capture){
843 Open (gid, marks [b].Index + marks [b].Length);
847 groups [balance_gid] = marks[b].Previous;
852 private int Checkpoint () {
853 mark_start = mark_end;
857 private void Backtrack (int cp) {
858 Debug.Assert (cp > mark_start, "Regex", "Attempt to backtrack forwards");
860 for (int i = 0; i < groups.Length; ++ i) {
863 m = marks [m].Previous;
869 private void ResetGroups () {
870 int n = groups.Length;
872 marks = new Mark [n * 10];
874 for (int i = 0; i < n; ++ i) {
877 marks [i].Start = -1;
879 marks [i].Previous = -1;
886 private int GetLastDefined (int gid) {
887 int m = groups [gid];
888 while (m >= 0 && !marks [m].IsDefined)
889 m = marks [m].Previous;
894 private int CreateMark (int previous) {
895 if (mark_end == marks.Length) {
896 Mark [] dest = new Mark [marks.Length * 2];
897 marks.CopyTo (dest, 0);
902 marks [m].Start = marks [m].End = -1;
903 marks [m].Previous = previous;
908 private Match GenerateMatch (Regex regex) {
909 int[][] grps = new int[groups.Length][];
910 ArrayList caps = new ArrayList ();
912 for (int gid = 0; gid < groups.Length; ++ gid) {
914 for (int m = groups[gid]; m >= 0; m = marks[m].Previous) {
915 if (!marks[m].IsDefined)
918 caps.Add (marks[m].Index);
919 caps.Add (marks[m].Length);
922 grps[gid] = (int[])caps.ToArray (typeof (int));
925 return new Match (regex, this, text, text_end, grps);
928 // interpreter attributes
930 private ushort[] program; // regex program
931 private int program_start; // first instruction after info block
932 private string text; // input text
933 private int text_end; // end of input text (last character + 1)
934 private int group_count; // number of capturing groups
935 private int match_min, match_max; // match width information
936 private QuickSearch qs; // fast substring matcher
940 private int scan_ptr; // start of scan
942 private RepeatContext repeat; // current repeat context
943 private RepeatContext fast; // fast repeat context
945 private Mark[] marks = null; // mark stack
946 private int mark_start; // start of current checkpoint
947 private int mark_end; // end of checkpoint/next free mark
949 private int[] groups; // current group definitions
953 private struct Mark {
954 public int Start, End;
957 public bool IsDefined {
958 get { return Start >= 0 && End >= 0; }
962 get { return Start < End ? Start : End; }
966 get { return Start < End ? End - Start : Start - End; }
970 private class RepeatContext {
971 public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) {
972 this.previous = previous;
976 this.expr_pc = expr_pc;
983 get { return count; }
984 set { count = value; }
988 get { return start; }
989 set { start = value; }
992 public bool IsMinimum {
993 get { return min <= count; }
996 public bool IsMaximum {
997 get { return max <= count; }
1000 public bool IsLazy {
1001 get { return lazy; }
1004 public int Expression {
1005 get { return expr_pc; }
1008 public RepeatContext Previous {
1009 get { return previous; }
1013 private int min, max;
1015 private int expr_pc;
1016 private RepeatContext previous;