3 // namespace: System.Text.RegularExpressions
\r
4 // file: interpreter.cs
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Collections;
\r
11 using System.Globalization;
\r
13 namespace System.Text.RegularExpressions {
\r
15 class Interpreter : IMachine {
\r
16 public Interpreter (ushort[] program) {
\r
17 this.program = program;
\r
18 this.checkpoints = new Stack ();
\r
21 // process info block
\r
23 if ((OpCode)program[0] != OpCode.Info)
\r
24 throw NewInterpretException ("Can't find info block.");
\r
26 this.group_count = program[1] + 1;
\r
27 this.match_min = program[2];
\r
28 this.match_max = program[3];
\r
32 this.captures = new Capture[group_count];
\r
33 this.program_start = 4;
\r
36 // IMachine implementation
\r
38 public Match Scan (Regex regex, string text, int start, int end) {
\r
40 this.text_end = end;
\r
41 this.scan_ptr = start;
\r
43 if (Eval (Mode.Match, ref scan_ptr, program_start))
\r
44 return new Match (regex, this, end, captures);
\r
51 private void Reset () {
\r
52 for (int i = 0; i < group_count; ++ i)
\r
53 captures[i] = new Capture (text);
\r
55 checkpoints.Clear ();
\r
57 fast = repeat = null;
\r
60 private bool Eval (Mode mode, ref int ref_ptr, int pc) {
\r
64 ushort word = program[pc];
\r
65 OpCode op = (OpCode)(word & 0x00ff);
\r
66 OpFlags flags = (OpFlags)(word & 0xff00);
\r
69 case OpCode.Anchor: {
\r
70 int skip = program[pc + 1];
\r
72 int anch_offset = program[pc + 2];
\r
73 int anch_ptr = ptr + anch_offset;
\r
74 int anch_end = text_end - match_min + anch_offset; // maximum anchor position
\r
76 // the general case for an anchoring expression is at the bottom, however we
\r
77 // do some checks for the common cases before to save processing time. the current
\r
78 // optimizer only outputs three types of anchoring expressions: fixed position,
\r
79 // fixed substring, and no anchor.
\r
81 OpCode anch_op = (OpCode)(program[pc + 3] & 0x00ff);
\r
82 if (anch_op == OpCode.Position && skip == 6) { // position anchor
\r
87 switch ((Position)program[pc + 4]) {
\r
88 case Position.StartOfString:
\r
89 if (anch_ptr == 0) {
\r
91 if (TryMatch (ref ptr, pc + skip))
\r
96 case Position.StartOfLine:
\r
97 if (anch_ptr == 0) {
\r
99 if (TryMatch (ref ptr, pc + skip))
\r
105 while (anch_ptr <= anch_end) {
\r
106 if (text[anch_ptr - 1] == '\n') {
\r
107 ptr = anch_ptr - anch_offset;
\r
108 if (TryMatch (ref ptr, pc + skip))
\r
116 case Position.StartOfScan:
\r
117 if (anch_ptr == scan_ptr) {
\r
118 ptr = scan_ptr - anch_offset;
\r
119 if (TryMatch (ref ptr, pc + skip))
\r
129 else if (qs != null ||
\r
130 (anch_op == OpCode.String && skip == 6 + program[pc + 4])) { // substring anchor
\r
136 bool ignore = ((OpFlags)program[pc + 3] & OpFlags.IgnoreCase) != 0;
\r
137 string substring = GetString (pc + 3);
\r
139 qs = new QuickSearch (substring, ignore);
\r
142 while (anch_ptr <= anch_end) {
\r
143 anch_ptr = qs.Search (text, anch_ptr, anch_end);
\r
147 ptr = anch_ptr - anch_offset;
\r
148 if (TryMatch (ref ptr, pc + skip))
\r
154 else if (anch_op == OpCode.True) { // no anchor
\r
158 while (anch_ptr <= anch_end) {
\r
160 if (TryMatch (ref ptr, pc + skip))
\r
166 else { // general case
\r
171 while (anch_ptr <= anch_end) {
\r
173 if (Eval (Mode.Match, ref ptr, pc + 3)) {
\r
174 // anchor expression passed: try real expression at the correct offset
\r
176 ptr = anch_ptr - anch_offset;
\r
177 if (TryMatch (ref ptr, pc + skip))
\r
188 case OpCode.False: {
\r
192 case OpCode.True: {
\r
196 case OpCode.Position: {
\r
197 if (!IsPosition ((Position)program[pc + 1], ptr))
\r
203 case OpCode.String: {
\r
204 bool reverse = (flags & OpFlags.RightToLeft) != 0;
\r
205 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
206 int len = program[pc + 1];
\r
213 else if (ptr + len > text_end)
\r
217 for (int i = 0; i < len; ++ i) {
\r
218 char c = text[ptr + i];
\r
220 c = Char.ToLower (c);
\r
222 if (c != (char)program[pc ++])
\r
231 case OpCode.Reference: {
\r
232 bool reverse = (flags & OpFlags.RightToLeft) != 0;
\r
233 bool ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
234 Capture cap = captures[program[pc + 1]].GetLastDefined ();
\r
238 int str = cap.Index;
\r
239 int len = cap.Length;
\r
246 else if (ptr + len > text_end)
\r
250 for (int i = 0; i < len; ++ i) {
\r
252 if (Char.ToLower (text[ptr + i]) != Char.ToLower (text[str + i]))
\r
256 if (text[ptr + i] != text[str + i])
\r
266 case OpCode.Character: case OpCode.Category:
\r
267 case OpCode.Range: case OpCode.Set: {
\r
268 if (!EvalChar (mode, ref ptr, ref pc, false))
\r
274 int target = pc + program[pc + 1];
\r
276 if (!EvalChar (mode, ref ptr, ref pc, true))
\r
283 case OpCode.Open: {
\r
284 Open (program[pc + 1], ptr);
\r
289 case OpCode.Close: {
\r
290 Close (program[pc + 1], ptr);
\r
295 case OpCode.Balance: {
\r
296 Balance (program[pc + 1], program[pc + 2], ptr);
\r
300 case OpCode.IfDefined: {
\r
301 Capture cap = captures[program[pc + 2]];
\r
302 if (cap.GetLastDefined () == null)
\r
303 pc += program[pc + 1];
\r
310 if (!Eval (Mode.Match, ref ptr, pc + 2))
\r
313 pc += program[pc + 1];
\r
317 case OpCode.Test: {
\r
318 int cp = Checkpoint ();
\r
319 int test_ptr = ptr;
\r
320 if (Eval (Mode.Match, ref test_ptr, pc + 3))
\r
321 pc += program[pc + 1];
\r
324 pc += program[pc + 2];
\r
329 case OpCode.Branch: {
\r
332 int cp = Checkpoint ();
\r
333 if (Eval (mode, ref ptr, pc + 2))
\r
338 pc += program[pc + 1];
\r
339 branch_op = (OpCode)(program[pc] & 0xff);
\r
340 } while (branch_op != OpCode.False);
\r
345 case OpCode.Jump: {
\r
346 pc += program[pc + 1];
\r
350 case OpCode.Repeat: {
\r
351 this.repeat = new RepeatContext (
\r
352 this.repeat, // previous context
\r
353 program[pc + 2], // minimum
\r
354 program[pc + 3], // maximum
\r
355 (flags & OpFlags.Lazy) != 0, // lazy
\r
356 pc + 4 // subexpression
\r
359 if (Eval (Mode.Match, ref ptr, pc + program[pc + 1]))
\r
362 this.repeat = this.repeat.Previous;
\r
367 case OpCode.Until: {
\r
368 RepeatContext current = this.repeat;
\r
369 int start = current.Start;
\r
371 if (!current.IsMinimum) {
\r
373 current.Start = ptr;
\r
374 if (Eval (Mode.Match, ref ptr, repeat.Expression))
\r
377 current.Start = start;
\r
382 if (ptr == current.Start) {
\r
383 // degenerate match ... match tail or fail
\r
385 this.repeat = current.Previous;
\r
386 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
392 if (current.IsLazy) {
\r
393 // match tail first ...
\r
395 this.repeat = current.Previous;
\r
396 int cp = Checkpoint ();
\r
397 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
402 // ... then match more
\r
404 this.repeat = current;
\r
405 if (!current.IsMaximum) {
\r
407 current.Start = ptr;
\r
408 if (Eval (Mode.Match, ref ptr, current.Expression))
\r
411 current.Start = start;
\r
419 // match more first ...
\r
421 if (!current.IsMaximum) {
\r
422 int cp = Checkpoint ();
\r
424 current.Start = ptr;
\r
425 if (Eval (Mode.Match, ref ptr, current.Expression))
\r
428 current.Start = start;
\r
433 // ... then match tail
\r
435 this.repeat = current.Previous;
\r
436 if (Eval (Mode.Match, ref ptr, pc + 1))
\r
439 this.repeat = current;
\r
444 case OpCode.FastRepeat: {
\r
445 this.fast = new RepeatContext (
\r
447 program[pc + 2], // minimum
\r
448 program[pc + 3], // maximum
\r
449 (flags & OpFlags.Lazy) != 0, // lazy
\r
450 pc + 4 // subexpression
\r
454 int cp = Checkpoint ();
\r
456 pc += program[pc + 1]; // tail expression
\r
457 ushort tail_word = program[pc];
\r
459 int c1, c2; // first character of tail operator
\r
460 int coff; // 0 or -1 depending on direction
\r
462 OpCode tail_op = (OpCode)(tail_word & 0xff);
\r
463 if (tail_op == OpCode.Character || tail_op == OpCode.String) {
\r
464 OpFlags tail_flags = (OpFlags)(tail_word & 0xff00);
\r
466 if (tail_op == OpCode.String)
\r
467 c1 = program[pc + 2]; // first char of string
\r
469 c1 = program[pc + 1]; // character
\r
471 if ((tail_flags & OpFlags.IgnoreCase) != 0)
\r
472 c2 = Char.ToUpper ((char)c1); // ignore case
\r
476 if ((tail_flags & OpFlags.RightToLeft) != 0)
\r
477 coff = -1; // reverse
\r
487 if (!fast.IsMinimum && !Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
488 //Console.WriteLine ("lazy fast: failed mininum.");
\r
489 fast = fast.Previous;
\r
494 int p = ptr + coff;
\r
495 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
\r
496 Eval (Mode.Match, ref ptr, pc))
\r
499 if (fast.IsMaximum) {
\r
500 //Console.WriteLine ("lazy fast: failed with maximum.");
\r
501 fast = fast.Previous;
\r
506 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
507 //Console.WriteLine ("lazy fast: no more.");
\r
508 fast = fast.Previous;
\r
512 fast = fast.Previous;
\r
516 if (!Eval (Mode.Count, ref ptr, fast.Expression)) {
\r
517 fast = fast.Previous;
\r
522 if (fast.Count > 0)
\r
523 width = (ptr - fast.Start) / fast.Count;
\r
528 int p = ptr + coff;
\r
529 if ((c1 < 0 || (p >= 0 && p < text_end && (c1 == text[p] || c2 == text[p]))) &&
\r
530 Eval (Mode.Match, ref ptr, pc))
\r
534 if (!fast.IsMinimum) {
\r
535 fast = fast.Previous;
\r
542 fast = fast.Previous;
\r
547 case OpCode.Info: {
\r
548 throw NewInterpretException ("Info block found in pattern.");
\r
561 if (fast.IsMaximum || (fast.IsLazy && fast.IsMinimum))
\r
564 pc = fast.Expression;
\r
575 if (!fast.IsLazy && fast.IsMinimum)
\r
578 ref_ptr = fast.Start;
\r
586 private bool EvalChar (Mode mode, ref int ptr, ref int pc, bool multi) {
\r
587 bool consumed = false;
\r
592 ushort word = program[pc];
\r
593 OpCode op = (OpCode)(word & 0x00ff);
\r
594 OpFlags flags = (OpFlags)(word & 0xff00);
\r
598 ignore = (flags & OpFlags.IgnoreCase) != 0;
\r
600 // consume character: the direction of an In construct is
\r
601 // determined by the direction of its first op
\r
604 if ((flags & OpFlags.RightToLeft) != 0) {
\r
611 if (ptr >= text_end)
\r
618 c = Char.ToLower (c);
\r
625 negate = (flags & OpFlags.Negate) != 0;
\r
636 case OpCode.Character: {
\r
637 if (c == (char)program[pc ++])
\r
642 case OpCode.Category: {
\r
643 if (CategoryUtils.IsCategory ((Category)program[pc ++], c))
\r
649 case OpCode.Range: {
\r
650 int lo = (char)program[pc ++];
\r
651 int hi = (char)program[pc ++];
\r
652 if (lo <= c && c <= hi)
\r
658 int lo = (char)program[pc ++];
\r
659 int len = (char)program[pc ++];
\r
663 int i = (int)c - lo;
\r
664 if (i < 0 || i >= len << 4)
\r
667 if ((program[bits + (i >> 4)] & (1 << (i & 0xf))) != 0)
\r
677 private bool TryMatch (ref int ref_ptr, int pc) {
\r
681 captures[0].Open (ptr);
\r
682 if (Eval (Mode.Match, ref ptr, pc)) {
\r
683 captures[0].Close (ptr);
\r
691 private bool IsPosition (Position pos, int ptr) {
\r
693 case Position.Start: case Position.StartOfString:
\r
696 case Position.StartOfLine:
\r
697 return ptr == 0 || text[ptr - 1] == '\n';
\r
699 case Position.StartOfScan:
\r
700 return ptr == scan_ptr;
\r
703 return ptr == text_end ||
\r
704 (ptr == text_end - 1 && text[ptr] == '\n');
\r
706 case Position.EndOfLine:
\r
707 return ptr == text_end || text[ptr] == '\n';
\r
709 case Position.EndOfString:
\r
710 return ptr == text_end;
\r
712 case Position.Boundary:
\r
717 return IsWordChar (text[ptr]);
\r
718 else if (ptr == text_end)
\r
719 return IsWordChar (text[ptr - 1]);
\r
721 return IsWordChar (text[ptr]) != IsWordChar (text[ptr - 1]);
\r
723 case Position.NonBoundary:
\r
728 return !IsWordChar (text[ptr]);
\r
729 else if (ptr == text_end)
\r
730 return !IsWordChar (text[ptr - 1]);
\r
732 return IsWordChar (text[ptr]) == IsWordChar (text[ptr - 1]);
\r
739 private bool IsWordChar (char c) {
\r
740 return CategoryUtils.IsCategory (Category.Word, c);
\r
743 private string GetString (int pc) {
\r
744 int len = program[pc + 1];
\r
747 char[] cs = new char[len];
\r
748 for (int i = 0; i < len; ++ i)
\r
749 cs[i] = (char)program[str ++];
\r
751 return new string (cs);
\r
754 // capture management
\r
756 private void Open (int gid, int ptr) {
\r
757 Capture cap = captures[gid];
\r
758 if (cap.IsDefined || cap.Checkpoint < checkpoint) {
\r
759 cap = new Capture (cap, checkpoint);
\r
760 captures[gid] = cap;
\r
766 private void Close (int gid, int ptr) {
\r
767 captures[gid].Close (ptr);
\r
770 private void Balance (int gid, int balance_gid, int ptr) {
\r
771 Capture balance = captures[balance_gid];
\r
772 if (!balance.IsDefined)
\r
773 throw NewInterpretException ("Invalid state - balancing group not closed.");
\r
776 Open (gid, balance.Index + balance.Length);
\r
780 captures[balance_gid] = balance.Previous;
\r
783 private int Checkpoint () {
\r
784 checkpoints.Push (captures);
\r
785 captures = (Capture[])captures.Clone ();
\r
786 checkpoint = checkpoints.Count;
\r
791 private void Backtrack (int cp) {
\r
792 if (cp > checkpoints.Count)
\r
793 throw NewInterpretException ("Can't backtrack forwards");
\r
795 while (checkpoints.Count > cp)
\r
796 checkpoints.Pop ();
\r
798 captures = (Capture[])checkpoints.Peek ();
\r
801 // TODO optimize this
\r
804 private Exception NewInterpretException (string msg) {
\r
805 return new ApplicationException (msg);
\r
808 // interpreter attributes
\r
810 private ushort[] program; // regex program
\r
811 private int program_start; // first instruction after info block
\r
812 private string text; // input text
\r
813 private int text_end; // end of input text (last character + 1)
\r
814 private int group_count; // number of capturing groups
\r
815 private int match_min, match_max; // match width information
\r
816 private QuickSearch qs; // fast substring matcher
\r
820 private int scan_ptr; // start of scan
\r
822 private Capture[] captures; // current captures
\r
824 private int checkpoint; // last checkpoint
\r
825 private Stack checkpoints; // checkpointed captures
\r
827 private RepeatContext repeat; // current repeat context
\r
828 private RepeatContext fast; // fast repeat context
\r
832 private class RepeatContext {
\r
833 public RepeatContext (RepeatContext previous, int min, int max, bool lazy, int expr_pc) {
\r
834 this.previous = previous;
\r
838 this.expr_pc = expr_pc;
\r
845 get { return count; }
\r
846 set { count = value; }
\r
850 get { return start; }
\r
851 set { start = value; }
\r
854 public bool IsMinimum {
\r
855 get { return min <= count; }
\r
858 public bool IsMaximum {
\r
859 get { return max <= count; }
\r
862 public bool IsLazy {
\r
863 get { return lazy; }
\r
866 public int Expression {
\r
867 get { return expr_pc; }
\r
870 public RepeatContext Previous {
\r
871 get { return previous; }
\r
875 private int min, max;
\r
877 private int expr_pc;
\r
878 private RepeatContext previous;
\r
883 private enum Mode {
\r