3 using System.Collections;
4 using System.Globalization;
6 namespace System.Text.RegularExpressions {
8 internal delegate bool EvalDelegate (RxInterpreter interp, int strpos, ref int strpos_result);
10 sealed class RxInterpreter: BaseMachine {
18 EvalDelegate eval_del; // optimized EvalByteCode method created by the CILCompiler
20 Mark[] marks = null; // mark stack
21 int mark_start; // start of current checkpoint
22 int mark_end; // end of checkpoint/next free mark
24 static readonly bool trace_rx = Environment.GetEnvironmentVariable ("RXD") != null;
26 static int ReadInt (byte[] code, int pc)
29 val |= code [pc + 1] << 8;
30 val |= code [pc + 2] << 16;
31 val |= code [pc + 3] << 24;
35 public RxInterpreter (byte[] program, EvalDelegate eval_del)
37 this.program = program;
38 this.eval_del = eval_del;
39 group_count = 1 + (program [1] | (program [2] << 8));
40 groups = new int [group_count];
45 public override Match Scan (Regex regex, string text, int start, int end) {
52 if (eval_del != null) {
53 match = eval_del (this, start, ref res);
55 match = EvalByteCode (11, start, ref res);
57 marks [groups [0]].End = res;
59 return GenerateMatch (regex);
60 //Match m = new Match (regex, this, text, end, 0, match_start, res - match_start);
67 private void Open (int gid, int ptr) {
69 if (m < mark_start || marks [m].IsDefined) {
74 marks [m].Start = ptr;
77 private void Close (int gid, int ptr) {
78 marks [groups [gid]].End = ptr;
81 private bool Balance (int gid, int balance_gid, bool capture, int ptr) {
82 int b = groups [balance_gid];
84 if (b == -1 || marks [b].Index < 0) {
85 //Group not previously matched
88 if (gid > 0 && capture){
89 Open (gid, marks [b].Index + marks [b].Length);
93 groups [balance_gid] = marks[b].Previous;
97 private int Checkpoint () {
98 mark_start = mark_end;
102 private void Backtrack (int cp) {
103 for (int i = 0; i < groups.Length; ++ i) {
106 m = marks [m].Previous;
111 private void ResetGroups () {
112 int n = groups.Length;
114 marks = new Mark [n * 10];
116 for (int i = 0; i < n; ++ i) {
119 marks [i].Start = -1;
121 marks [i].Previous = -1;
127 private int GetLastDefined (int gid) {
128 int m = groups [gid];
129 while (m >= 0 && !marks [m].IsDefined)
130 m = marks [m].Previous;
135 private int CreateMark (int previous) {
136 if (mark_end == marks.Length) {
137 Mark [] dest = new Mark [marks.Length * 2];
138 marks.CopyTo (dest, 0);
143 marks [m].Start = marks [m].End = -1;
144 marks [m].Previous = previous;
149 private void GetGroupInfo (int gid, out int first_mark_index, out int n_caps)
151 first_mark_index = -1;
153 for (int m = groups [gid]; m >= 0; m = marks [m].Previous) {
154 if (!marks [m].IsDefined)
156 if (first_mark_index < 0)
157 first_mark_index = m;
162 private void PopulateGroup (Group g, int first_mark_index, int n_caps)
165 for (int m = marks [first_mark_index].Previous; m >= 0; m = marks [m].Previous) {
166 if (!marks [m].IsDefined)
168 Capture cap = new Capture (str, marks [m].Index, marks [m].Length);
169 g.Captures.SetValue (cap, n_caps - 1 - i);
174 private Match GenerateMatch (Regex regex)
176 int n_caps, first_mark_index;
178 GetGroupInfo (0, out first_mark_index, out n_caps);
180 // Avoid fully populating the Match instance if not needed
181 if (!needs_groups_or_captures)
182 return new Match (regex, this, str, string_end, 0, marks [first_mark_index].Index, marks [first_mark_index].Length);
184 Match retval = new Match (regex, this, str, string_end, groups.Length,
185 marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps);
186 PopulateGroup (retval, first_mark_index, n_caps);
188 for (int gid = 1; gid < groups.Length; ++ gid) {
189 GetGroupInfo (gid, out first_mark_index, out n_caps);
190 if (first_mark_index < 0) {
193 g = new Group (str, marks [first_mark_index].Index, marks [first_mark_index].Length, n_caps);
194 PopulateGroup (g, first_mark_index, n_caps);
196 retval.Groups.SetValue (g, gid);
201 // used by the IL backend
202 void SetStartOfMatch (int pos)
204 marks [groups [0]].Start = pos;
207 static bool IsWordChar (char c)
209 return Char.IsLetterOrDigit (c) || Char.GetUnicodeCategory (c) == UnicodeCategory.ConnectorPunctuation;
212 bool EvalByteCode (int pc, int strpos, ref int strpos_result)
214 // luckily the IL engine can deal with char_group_end at compile time
215 // this code offset needs to be checked only in opcodes that handle
216 // a single char and that are included in a TestCharGroup expression:
217 // the engine is supposed to jump to this offset as soons as the
218 // first opcode in the expression matches
219 // The code pattern becomes:
220 // on successfull match: check if char_group_end is nonzero and jump to
221 // test_char_group_passed after adjusting strpos
222 // on failure: try the next expression by simply advancing pc
223 int char_group_end = 0;
224 int length, start, end;
227 Console.WriteLine ("evaluating: {0} at pc: {1}, strpos: {2}, cge: {3}", (RxOp)program [pc], pc, strpos, char_group_end);
228 switch ((RxOp)program [pc]) {
230 if (char_group_end != 0) {
235 strpos_result = strpos;
239 case RxOp.AnyPosition:
242 case RxOp.StartOfString:
247 case RxOp.StartOfLine:
248 if (strpos == 0 || str [strpos - 1] == '\n') {
253 case RxOp.StartOfScan:
254 if (strpos != string_start)
259 if (strpos == string_end || (strpos == string_end - 1 && str [strpos] == '\n')) {
264 case RxOp.EndOfString:
265 if (strpos != string_end)
270 if (strpos == string_end || str [strpos] == '\n') {
275 case RxOp.WordBoundary:
279 if (IsWordChar (str [strpos])) {
283 } else if (strpos == string_end) {
284 if (IsWordChar (str [strpos - 1])) {
289 if (IsWordChar (str [strpos]) != IsWordChar (str [strpos - 1])) {
295 case RxOp.NoWordBoundary:
299 if (!IsWordChar (str [strpos])) {
303 } else if (strpos == string_end) {
304 if (!IsWordChar (str [strpos - 1])) {
309 if (IsWordChar (str [strpos]) == IsWordChar (str [strpos - 1])) {
316 // FIXME: test anchor
317 length = program [pc + 3] | (program [pc + 4] << 8);
318 pc += program [pc + 1] | (program [pc + 2] << 8);
319 // it's important to test also the end of the string
320 // position for things like: "" =~ /$/
321 end = string_end + 1;
322 while (strpos < end) {
324 if (groups.Length > 1) {
326 marks [groups [0]].Start = strpos;
328 if (EvalByteCode (pc, strpos, ref res)) {
329 // match_start = strpos;
330 marks [groups [0]].Start = strpos;
331 if (groups.Length > 1)
332 marks [groups [0]].End = res;
340 length = GetLastDefined (program [pc + 1] | (program [pc + 2] << 8));
343 start = marks [length].Index;
344 length = marks [length].Length;
345 if (strpos + length > string_end)
347 for (end = start + length; start < end; ++start) {
348 if (str [strpos] != str [start])
354 case RxOp.ReferenceIgnoreCase:
355 length = GetLastDefined (program [pc + 1] | (program [pc + 2] << 8));
358 start = marks [length].Index;
359 length = marks [length].Length;
360 if (strpos + length > string_end)
362 for (end = start + length; start < end; ++start) {
363 if (str [strpos] != str [start] && Char.ToLower (str [strpos]) != Char.ToLower (str [start]))
370 if (GetLastDefined (program [pc + 3] | (program [pc + 4] << 8)) < 0)
373 pc += program [pc + 1] | (program [pc + 2] << 8);
375 case RxOp.SubExpression: {
377 if (EvalByteCode (pc + 3, strpos, ref res)) {
378 pc += program [pc + 1] | (program [pc + 2] << 8);
386 if (EvalByteCode (pc + 5, strpos, ref res)) {
387 pc += program [pc + 1] | (program [pc + 2] << 8);
389 pc += program [pc + 3] | (program [pc + 4] << 8);
394 Open (program [pc + 1] | (program [pc + 2] << 8), strpos);
397 case RxOp.CloseGroup:
398 Close (program [pc + 1] | (program [pc + 2] << 8), strpos);
402 pc += program [pc + 1] | (program [pc + 2] << 8);
404 case RxOp.TestCharGroup:
405 char_group_end = pc + program [pc + 1] | (program [pc + 2] << 8);
410 length = program [pc + 1];
411 if (strpos + length > string_end)
413 end = start + length;
414 for (; start < end; ++start) {
415 if (str [strpos] != program [start])
421 case RxOp.StringIgnoreCase:
423 length = program [pc + 1];
424 if (strpos + length > string_end)
426 end = start + length;
427 for (; start < end; ++start) {
428 if (str [strpos] != program [start] && Char.ToLower (str [strpos]) != program [start])
434 case RxOp.UnicodeString:
436 length = program [pc + 1] | (program [pc + 2] << 8);
437 if (strpos + length > string_end)
439 end = start + length * 2;
440 for (; start < end; start += 2) {
441 int c = program [start] | (program [start + 1] << 8);
442 if (str [strpos] != c)
448 case RxOp.UnicodeStringIgnoreCase:
450 length = program [pc + 1] | (program [pc + 2] << 8);
451 if (strpos + length > string_end)
453 end = start + length * 2;
454 for (; start < end; start += 2) {
455 int c = program [start] | (program [start + 1] << 8);
456 if (str [strpos] != c && Char.ToLower (str [strpos]) != c)
463 if (strpos < string_end && (str [strpos] == program [pc + 1])) {
465 if (char_group_end != 0)
466 goto test_char_group_passed;
470 if (char_group_end == 0)
475 if (strpos < string_end && (str [strpos] != program [pc + 1])) {
477 if (char_group_end != 0)
478 goto test_char_group_passed;
482 if (char_group_end == 0)
486 case RxOp.CharIgnoreCase:
487 if (strpos < string_end && (Char.ToLower (str [strpos]) == program [pc + 1])) {
489 if (char_group_end != 0)
490 goto test_char_group_passed;
494 if (char_group_end == 0)
498 case RxOp.NoCharIgnoreCase:
499 if (strpos < string_end && (Char.ToLower (str [strpos]) != program [pc + 1])) {
501 if (char_group_end != 0)
502 goto test_char_group_passed;
506 if (char_group_end == 0)
511 if (strpos < string_end) {
512 int c = str [strpos];
513 if (c >= program [pc + 1] && c <= program [pc + 2]) {
515 if (char_group_end != 0)
516 goto test_char_group_passed;
521 if (char_group_end == 0)
526 if (strpos < string_end) {
527 int c = str [strpos];
528 if (c >= program [pc + 1] && c <= program [pc + 2]) {
529 if (char_group_end == 0)
535 if (char_group_end != 0)
536 goto test_char_group_passed;
540 if (char_group_end == 0)
544 case RxOp.RangeIgnoreCase:
545 if (strpos < string_end) {
546 int c = Char.ToLower (str [strpos]);
547 if (c >= program [pc + 1] && c <= program [pc + 2]) {
549 if (char_group_end != 0)
550 goto test_char_group_passed;
555 if (char_group_end == 0)
559 case RxOp.NoRangeIgnoreCase:
560 if (strpos < string_end) {
561 int c = Char.ToLower (str [strpos]);
562 if (c >= program [pc + 1] && c <= program [pc + 2]) {
563 if (char_group_end == 0)
569 if (char_group_end != 0)
570 goto test_char_group_passed;
574 if (char_group_end == 0)
579 if (strpos < string_end) {
580 int c = str [strpos];
581 c -= program [pc + 1];
582 length = program [pc + 2];
583 if (c < 0 || c >= (length << 3)) {
584 if (char_group_end == 0)
590 if ((program [pc + (c >> 3)] & (1 << (c & 0x7))) != 0) {
592 if (char_group_end != 0)
593 goto test_char_group_passed;
597 if (char_group_end == 0)
602 if (char_group_end == 0)
604 pc += 3 + program [pc + 2];
607 if (strpos < string_end) {
608 int c = str [strpos];
609 c -= program [pc + 1];
610 length = program [pc + 2];
611 if (c < 0 || c >= (length << 3)) {
613 if (char_group_end != 0)
614 goto test_char_group_passed;
619 if ((program [pc + (c >> 3)] & (1 << (c & 0x7))) != 0) {
620 if (char_group_end == 0)
626 if (char_group_end != 0)
627 goto test_char_group_passed;
632 if (char_group_end == 0)
634 pc += 3 + program [pc + 2];
636 case RxOp.BitmapIgnoreCase:
637 if (strpos < string_end) {
638 int c = Char.ToLower (str [strpos]);
639 c -= program [pc + 1];
640 length = program [pc + 2];
641 if (c < 0 || c >= (length << 3)) {
642 if (char_group_end == 0)
648 if ((program [pc + (c >> 3)] & (1 << (c & 0x7))) != 0) {
650 if (char_group_end != 0)
651 goto test_char_group_passed;
655 if (char_group_end == 0)
660 if (char_group_end == 0)
662 pc += 3 + program [pc + 2];
664 case RxOp.NoBitmapIgnoreCase:
665 if (strpos < string_end) {
666 int c = str [strpos];
667 c -= program [pc + 1];
668 length = program [pc + 2];
670 if (c < 0 || c >= (length << 3)) {
672 if (char_group_end != 0)
673 goto test_char_group_passed;
677 if ((program [pc + (c >> 3)] & (1 << (c & 0x7))) != 0) {
678 if (char_group_end == 0)
684 if (char_group_end != 0)
685 goto test_char_group_passed;
690 if (char_group_end == 0)
692 pc += 3 + program [pc + 2];
694 case RxOp.UnicodeChar:
695 if (strpos < string_end && (str [strpos] == (program [pc + 1] | (program [pc + 2] << 8)))) {
697 if (char_group_end != 0)
698 goto test_char_group_passed;
702 if (char_group_end == 0)
706 case RxOp.NoUnicodeChar:
707 if (strpos < string_end && (str [strpos] != (program [pc + 1] | (program [pc + 2] << 8)))) {
709 if (char_group_end != 0)
710 goto test_char_group_passed;
714 if (char_group_end == 0)
718 case RxOp.UnicodeCharIgnoreCase:
719 if (strpos < string_end && (Char.ToLower (str [strpos]) == (program [pc + 1] | (program [pc + 2] << 8)))) {
721 if (char_group_end != 0)
722 goto test_char_group_passed;
726 if (char_group_end == 0)
730 case RxOp.NoUnicodeCharIgnoreCase:
731 if (strpos < string_end && (Char.ToLower (str [strpos]) != (program [pc + 1] | (program [pc + 2] << 8)))) {
733 if (char_group_end != 0)
734 goto test_char_group_passed;
738 if (char_group_end == 0)
742 case RxOp.CategoryAny:
743 if (strpos < string_end && str [strpos] != '\n') {
745 if (char_group_end != 0)
746 goto test_char_group_passed;
750 if (char_group_end == 0)
754 case RxOp.CategoryWord:
755 if (strpos < string_end) {
756 char c = str [strpos];
757 if (Char.IsLetterOrDigit (c) || Char.GetUnicodeCategory (c) == UnicodeCategory.ConnectorPunctuation) {
759 if (char_group_end != 0)
760 goto test_char_group_passed;
765 if (char_group_end == 0)
769 case RxOp.NoCategoryWord:
770 if (strpos < string_end) {
771 char c = str [strpos];
772 if (!Char.IsLetterOrDigit (c) && Char.GetUnicodeCategory (c) != UnicodeCategory.ConnectorPunctuation) {
774 if (char_group_end != 0)
775 goto test_char_group_passed;
780 if (char_group_end == 0)
784 case RxOp.CategoryDigit:
785 if (strpos < string_end && Char.IsDigit (str [strpos])) {
787 if (char_group_end != 0)
788 goto test_char_group_passed;
792 if (char_group_end == 0)
796 case RxOp.NoCategoryDigit:
797 if (strpos < string_end && !Char.IsDigit (str [strpos])) {
799 if (char_group_end != 0)
800 goto test_char_group_passed;
804 if (char_group_end == 0)
808 case RxOp.CategoryWhiteSpace:
809 if (strpos < string_end && Char.IsWhiteSpace (str [strpos])) {
811 if (char_group_end != 0)
812 goto test_char_group_passed;
816 if (char_group_end == 0)
820 case RxOp.NoCategoryWhiteSpace:
821 if (strpos < string_end && !Char.IsWhiteSpace (str [strpos])) {
823 if (char_group_end != 0)
824 goto test_char_group_passed;
828 if (char_group_end == 0)
832 case RxOp.CategoryEcmaWord:
833 if (strpos < string_end) {
834 int c = str [strpos];
835 if ('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' || c == '_') {
837 if (char_group_end != 0)
838 goto test_char_group_passed;
843 if (char_group_end == 0)
847 case RxOp.NoCategoryEcmaWord:
848 if (strpos < string_end) {
849 int c = str [strpos];
850 if ('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' || c == '_') {
851 if (char_group_end == 0)
857 if (char_group_end != 0)
858 goto test_char_group_passed;
862 if (char_group_end == 0)
866 case RxOp.CategoryEcmaWhiteSpace:
867 if (strpos < string_end) {
868 int c = str [strpos];
869 if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v') {
871 if (char_group_end != 0)
872 goto test_char_group_passed;
877 if (char_group_end == 0)
881 case RxOp.NoCategoryEcmaWhiteSpace:
882 if (strpos < string_end) {
883 int c = str [strpos];
884 if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v') {
885 if (char_group_end == 0)
891 if (char_group_end != 0)
892 goto test_char_group_passed;
896 if (char_group_end == 0)
900 case RxOp.CategoryUnicodeSpecials:
901 if (strpos < string_end) {
902 int c = str [strpos];
903 if ('\uFEFF' <= c && c <= '\uFEFF' || '\uFFF0' <= c && c <= '\uFFFD') {
905 if (char_group_end != 0)
906 goto test_char_group_passed;
911 if (char_group_end == 0)
915 case RxOp.NoCategoryUnicodeSpecials:
916 if (strpos < string_end) {
917 int c = str [strpos];
918 if ('\uFEFF' <= c && c <= '\uFEFF' || '\uFFF0' <= c && c <= '\uFFFD') {
919 if (char_group_end == 0)
925 if (char_group_end != 0)
926 goto test_char_group_passed;
930 if (char_group_end == 0)
934 case RxOp.CategoryUnicode:
935 if (strpos < string_end && Char.GetUnicodeCategory (str [strpos]) == (UnicodeCategory)program [pc + 1]) {
937 if (char_group_end != 0)
938 goto test_char_group_passed;
942 if (char_group_end == 0)
946 case RxOp.NoCategoryUnicode:
947 if (strpos < string_end && Char.GetUnicodeCategory (str [strpos]) != (UnicodeCategory)program [pc + 1]) {
949 if (char_group_end != 0)
950 goto test_char_group_passed;
954 if (char_group_end == 0)
960 if (EvalByteCode (pc + 3, strpos, ref res)) {
964 //Console.WriteLine ("branch offset: {0}", program [pc + 1] | (program [pc + 2] << 8));
965 pc += program [pc + 1] | (program [pc + 2] << 8);
969 case RxOp.RepeatLazy: {
971 start = ReadInt (program, pc + 3);
972 end = ReadInt (program, pc + 7);
973 //Console.WriteLine ("min: {0}, max: {1}", start, end);
975 int cp = Checkpoint ();
976 while (length < end) {
977 if (!EvalByteCode (pc + 11, strpos, ref res)) {
978 if (length >= start) {
990 pc += program [pc + 1] | (program [pc + 2] << 8);
994 Console.WriteLine ("evaluating: {0} at pc: {1}, strpos: {2}", (RxOp)program [pc], pc, strpos);
995 throw new NotSupportedException ();
998 test_char_group_passed:
1002 } // end of while (true)