1 //------------------------------------------------------------------------------
2 // <copyright file="RegexParser.cs" company="Microsoft">
3 // Copyright (c) Microsoft Corporation. All rights reserved.
5 //------------------------------------------------------------------------------
7 // This RegexParser class is internal to the Regex package.
8 // It builds a tree of RegexNodes from a regular expression
10 // Implementation notes:
12 // It would be nice to get rid of the comment modes, since the
13 // ScanBlank() calls are just kind of duct-taped in.
16 namespace System.Text.RegularExpressions {
18 using System.Collections;
19 using System.Collections.Generic;
20 using System.Globalization;
22 internal sealed class RegexParser {
23 internal RegexNode _stack;
24 internal RegexNode _group;
25 internal RegexNode _alternation;
26 internal RegexNode _concatenation;
27 internal RegexNode _unit;
29 internal String _pattern;
30 internal int _currentPos;
31 internal CultureInfo _culture;
33 internal int _autocap;
34 internal int _capcount;
36 internal int _capsize;
38 internal Dictionary<Int32, Int32> _caps;
39 internal Dictionary<String, Int32> _capnames;
41 internal Hashtable _caps;
42 internal Hashtable _capnames;
44 internal Int32[] _capnumlist;
45 internal List<String> _capnamelist;
47 internal RegexOptions _options;
48 internal List<RegexOptions> _optionsStack;
50 internal bool _ignoreNextParen = false;
52 internal const int MaxValueDiv10 = Int32.MaxValue / 10;
53 internal const int MaxValueMod10 = Int32.MaxValue % 10;
56 * This static call constructs a RegexTree from a regular expression
57 * pattern string and an option string.
59 * The method creates, drives, and drops a parser instance.
61 internal static RegexTree Parse(String re, RegexOptions op) {
66 p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture);
75 if (p._capnamelist == null)
78 capnamelist = p._capnamelist.ToArray();
80 return new RegexTree(root, p._caps, p._capnumlist, p._captop, p._capnames, capnamelist, op);
84 * This static call constructs a flat concatenation node given
85 * a replacement pattern.
88 internal static RegexReplacement ParseReplacement(String rep, Dictionary<Int32, Int32> caps, int capsize, Dictionary<String, Int32> capnames, RegexOptions op) {
90 internal static RegexReplacement ParseReplacement(String rep, Hashtable caps, int capsize, Hashtable capnames, RegexOptions op) {
95 p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture);
99 p.NoteCaptures(caps, capsize, capnames);
101 root = p.ScanReplacement();
103 return new RegexReplacement(rep, root, caps);
107 * Escapes all metacharacters (including |,(,),[,{,|,^,$,*,+,?,\, spaces and #)
109 internal static String Escape(String input) {
110 for (int i = 0; i < input.Length; i++) {
111 if (IsMetachar(input[i])) {
112 StringBuilder sb = new StringBuilder();
116 sb.Append(input, 0, i);
137 while (i < input.Length) {
145 sb.Append(input, lastpos, i - lastpos);
147 } while (i < input.Length);
149 return sb.ToString();
157 * Escapes all metacharacters (including (,),[,],{,},|,^,$,*,+,?,\, spaces and #)
159 internal static String Unescape(String input) {
160 for (int i = 0; i < input.Length; i++) {
161 if (input[i] == '\\') {
162 StringBuilder sb = new StringBuilder();
163 RegexParser p = new RegexParser(CultureInfo.InvariantCulture);
167 sb.Append(input, 0, i);
171 if (i < input.Length)
172 sb.Append(p.ScanCharEscape());
175 while (i < input.Length && input[i] != '\\')
177 sb.Append(input, lastpos, i - lastpos);
179 } while (i < input.Length);
181 return sb.ToString();
189 * Private constructor.
191 private RegexParser(CultureInfo culture) {
193 _optionsStack = new List<RegexOptions>();
195 _caps = new Dictionary<Int32,Int32>();
197 _caps = new Hashtable();
203 * Drops a string into the pattern buffer.
205 internal void SetPattern(String Re) {
213 * Resets parsing to the beginning of the pattern.
215 internal void Reset(RegexOptions topopts) {
218 _ignoreNextParen = false;
220 if (_optionsStack.Count > 0)
221 _optionsStack.RemoveRange(0, _optionsStack.Count - 1);
228 * The main parsing function.
230 internal RegexNode ScanRegex() {
231 char ch = '@'; // nonspecial ch, means at beginning
232 bool isQuantifier = false;
234 StartGroup(new RegexNode(RegexNode.Capture, _options, 0, -1));
236 while (CharsRight() > 0) {
237 bool wasPrevQuantifier = isQuantifier;
238 isQuantifier = false;
242 int startpos = Textpos();
244 // move past all of the normal characters. We'll stop when we hit some kind of control character,
245 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
247 while (CharsRight() > 0 && (!IsStopperX(ch = RightChar()) || ch == '{' && !IsTrueQuantifier()))
250 while (CharsRight() > 0 && (!IsSpecial(ch = RightChar()) || ch == '{' && !IsTrueQuantifier()))
253 int endpos = Textpos();
257 if (CharsRight() == 0)
258 ch = '!'; // nonspecial, means at end
259 else if (IsSpecial(ch = RightChar())) {
260 isQuantifier = IsQuantifier(ch);
263 ch = ' '; // nonspecial, means at ordinary char
265 if (startpos < endpos) {
266 int cchUnquantified = endpos - startpos - (isQuantifier ? 1 : 0);
268 wasPrevQuantifier = false;
270 if (cchUnquantified > 0)
271 AddConcatenate(startpos, cchUnquantified, false);
274 AddUnitOne(CharAt(endpos - 1));
282 goto ContinueOuterScan;
285 AddUnitSet(ScanCharClass(UseOptionI()).ToStringClass());
293 if (null == (grouper = ScanGroupOpen())) {
305 goto ContinueOuterScan;
309 throw MakeException(SR.GetString(SR.TooManyParens));
316 goto ContinueOuterScan;
320 AddUnitNode(ScanBackslash());
324 AddUnitType(UseOptionM() ? RegexNode.Bol : RegexNode.Beginning);
328 AddUnitType(UseOptionM() ? RegexNode.Eol : RegexNode.EndZ);
333 AddUnitSet(RegexCharClass.AnyClass);
343 throw MakeException(wasPrevQuantifier ?
344 SR.GetString(SR.NestedQuantify, ch.ToString()) :
345 SR.GetString(SR.QuantifyAfterNothing));
350 throw MakeException(SR.GetString(SR.InternalError));
355 if (CharsRight() == 0 || !(isQuantifier = IsTrueQuantifier())) {
357 goto ContinueOuterScan;
360 ch = MoveRightGetChar();
362 // Handle quantifiers
363 while (Unit() != null) {
371 max = Int32.MaxValue;
381 max = Int32.MaxValue;
385 startpos = Textpos();
386 max = min = ScanDecimal();
387 if (startpos < Textpos()) {
388 if (CharsRight() > 0 && RightChar() == ',') {
390 if (CharsRight() == 0 || RightChar() == '}')
391 max = Int32.MaxValue;
397 if (startpos == Textpos() || CharsRight() == 0 || MoveRightGetChar() != '}') {
399 Textto(startpos - 1);
400 goto ContinueOuterScan;
407 throw MakeException(SR.GetString(SR.InternalError));
412 if (CharsRight() == 0 || RightChar() != '?')
420 throw MakeException(SR.GetString(SR.IllegalRange));
422 AddConcatenate(lazy, min, max);
433 throw MakeException(SR.GetString(SR.NotEnoughParens));
441 * Simple parsing for replacement patterns
443 internal RegexNode ScanReplacement() {
447 _concatenation = new RegexNode(RegexNode.Concatenate, _options);
454 startpos = Textpos();
456 while (c > 0 && RightChar() != '$') {
461 AddConcatenate(startpos, Textpos() - startpos, true);
464 if (MoveRightGetChar() == '$')
465 AddUnitNode(ScanDollar());
470 return _concatenation;
474 * Scans contents of [] (not including []'s), and converts to a
477 internal RegexCharClass ScanCharClass(bool caseInsensitive) {
478 return ScanCharClass(caseInsensitive, false);
482 * Scans contents of [] (not including []'s), and converts to a
485 internal RegexCharClass ScanCharClass(bool caseInsensitive, bool scanOnly) {
488 bool inRange = false;
489 bool firstChar = true;
494 cc = scanOnly ? null : new RegexCharClass();
496 if (CharsRight() > 0 && RightChar() == '^') {
502 for ( ; CharsRight() > 0; firstChar = false) {
503 bool fTranslatedChar = false;
504 ch = MoveRightGetChar();
511 else if (ch == '\\' && CharsRight() > 0) {
513 switch (ch = MoveRightGetChar()) {
518 throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString()));
519 cc.AddDigit(UseOptionE(), ch == 'D', _pattern);
527 throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString()));
528 cc.AddSpace(UseOptionE(), ch == 'S');
536 throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString()));
538 cc.AddWord(UseOptionE(), ch == 'W');
546 throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString()));
547 cc.AddCategoryFromName(ParseProperty(), (ch != 'p'), caseInsensitive, _pattern);
561 ch = ScanCharEscape(); // non-literal character
562 fTranslatedChar = true;
563 break; // this break will only break out of the switch
566 else if (ch == '[') {
567 // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
568 // It currently doesn't do anything other than skip the whole thing!
569 if (CharsRight() > 0 && RightChar() == ':' && !inRange) {
571 int savePos = Textpos();
574 /* name = */ ScanCapname();
575 if (CharsRight() < 2 || MoveRightGetChar() != ':' || MoveRightGetChar() != ']')
577 // else lookup name (nyi)
585 if (ch == '[' && !fTranslatedChar && !firstChar) {
586 // We thought we were in a range, but we're actually starting a subtraction.
587 // In that case, we'll add chPrev to our char class, skip the opening [, and
588 // scan the new character class recursively.
590 cc.AddSubtraction(ScanCharClass(caseInsensitive, false));
592 if (CharsRight() > 0 && RightChar() != ']')
593 throw MakeException(SR.GetString(SR.SubtractionMustBeLast));
596 // a regular range, like a-z
598 throw MakeException(SR.GetString(SR.ReversedCharRange));
599 cc.AddRange(chPrev, ch);
603 else if (CharsRight() >= 2 && RightChar() == '-' && RightChar(1) != ']') {
604 // this could be the start of a range
609 else if (CharsRight() >= 1 && ch == '-' && !fTranslatedChar && RightChar() == '[' && !firstChar) {
610 // we aren't in a range, and now there is a subtraction. Usually this happens
611 // only when a subtraction follows a range, like [a-z-[b]]
614 cc.AddSubtraction(ScanCharClass(caseInsensitive, false));
616 if (CharsRight() > 0 && RightChar() != ']')
617 throw MakeException(SR.GetString(SR.SubtractionMustBeLast));
621 ScanCharClass(caseInsensitive, true);
631 throw MakeException(SR.GetString(SR.UnterminatedBracket));
633 if (!scanOnly && caseInsensitive)
634 cc.AddLowercase(_culture);
640 * Scans chars following a '(' (not counting the '('), and returns
641 * a RegexNode for the type of group scanned, or null if the group
642 * simply changed options (?cimsx-cimsx) or was a comment (#...).
644 internal RegexNode ScanGroupOpen() {
650 // just return a RegexNode if we have:
651 // 1. "(" followed by nothing
652 // 2. "(x" where x != ?
654 if (CharsRight() == 0 || RightChar() != '?' || (RightChar() == '?' && (CharsRight() > 1 && RightChar(1) == ')'))) {
655 if (UseOptionN() || _ignoreNextParen) {
656 _ignoreNextParen = false;
657 return new RegexNode(RegexNode.Group, _options);
660 return new RegexNode(RegexNode.Capture, _options, _autocap++, -1);
666 if (CharsRight() == 0)
669 switch (ch = MoveRightGetChar()) {
671 NodeType = RegexNode.Group;
675 _options &= ~(RegexOptions.RightToLeft);
676 NodeType = RegexNode.Require;
680 _options &= ~(RegexOptions.RightToLeft);
681 NodeType = RegexNode.Prevent;
685 NodeType = RegexNode.Greedy;
694 if (CharsRight() == 0)
697 switch (ch = MoveRightGetChar()) {
702 _options |= RegexOptions.RightToLeft;
703 NodeType = RegexNode.Require;
710 _options |= RegexOptions.RightToLeft;
711 NodeType = RegexNode.Prevent;
718 bool proceed = false;
720 // grab part before -
722 if (ch >= '0' && ch <= '9') {
723 capnum = ScanDecimal();
725 if (!IsCaptureSlot(capnum))
728 // check if we have bogus characters after the number
729 if (CharsRight() > 0 && !(RightChar() == close || RightChar() == '-'))
730 throw MakeException(SR.GetString(SR.InvalidGroupName));
732 throw MakeException(SR.GetString(SR.CapnumNotZero));
734 else if (RegexCharClass.IsWordChar(ch)) {
735 String capname = ScanCapname();
737 if (IsCaptureName(capname))
738 capnum = CaptureSlotFromName(capname);
740 // check if we have bogus character after the name
741 if (CharsRight() > 0 && !(RightChar() == close || RightChar() == '-'))
742 throw MakeException(SR.GetString(SR.InvalidGroupName));
744 else if (ch == '-') {
748 // bad group name - starts with something other than a word character and isn't a number
749 throw MakeException(SR.GetString(SR.InvalidGroupName));
752 // grab part after - if any
754 if ((capnum != -1 || proceed == true) && CharsRight() > 0 && RightChar() == '-') {
758 if (ch >= '0' && ch <= '9') {
759 uncapnum = ScanDecimal();
761 if (!IsCaptureSlot(uncapnum))
762 throw MakeException(SR.GetString(SR.UndefinedBackref, uncapnum));
764 // check if we have bogus characters after the number
765 if (CharsRight() > 0 && RightChar() != close)
766 throw MakeException(SR.GetString(SR.InvalidGroupName));
768 else if (RegexCharClass.IsWordChar(ch)) {
769 String uncapname = ScanCapname();
771 if (IsCaptureName(uncapname))
772 uncapnum = CaptureSlotFromName(uncapname);
774 throw MakeException(SR.GetString(SR.UndefinedNameRef, uncapname));
776 // check if we have bogus character after the name
777 if (CharsRight() > 0 && RightChar() != close)
778 throw MakeException(SR.GetString(SR.InvalidGroupName));
781 // bad group name - starts with something other than a word character and isn't a number
782 throw MakeException(SR.GetString(SR.InvalidGroupName));
786 // actually make the node
788 if ((capnum != -1 || uncapnum != -1) && CharsRight() > 0 && MoveRightGetChar() == close) {
789 return new RegexNode(RegexNode.Capture, _options, capnum, uncapnum);
796 // alternation construct (?(...) | )
798 int parenPos = Textpos();
799 if (CharsRight() > 0)
803 // check if the alternation condition is a backref
804 if (ch >= '0' && ch <= '9') {
805 int capnum = ScanDecimal();
806 if (CharsRight() > 0 && MoveRightGetChar() == ')') {
807 if (IsCaptureSlot(capnum))
808 return new RegexNode(RegexNode.Testref, _options, capnum);
810 throw MakeException(SR.GetString(SR.UndefinedReference, capnum.ToString(CultureInfo.CurrentCulture)));
813 throw MakeException(SR.GetString(SR.MalformedReference, capnum.ToString(CultureInfo.CurrentCulture)));
816 else if (RegexCharClass.IsWordChar(ch)) {
817 String capname = ScanCapname();
819 if (IsCaptureName(capname) && CharsRight() > 0 && MoveRightGetChar() == ')')
820 return new RegexNode(RegexNode.Testref, _options, CaptureSlotFromName(capname));
824 NodeType = RegexNode.Testgroup;
825 Textto(parenPos - 1); // jump to the start of the parentheses
826 _ignoreNextParen = true; // but make sure we don't try to capture the insides
828 int charsRight = CharsRight();
829 if (charsRight >= 3 && RightChar(1) == '?') {
830 char rightchar2 = RightChar(2);
831 // disallow comments in the condition
832 if (rightchar2 == '#')
833 throw MakeException(SR.GetString(SR.AlternationCantHaveComment));
835 // disallow named capture group (?<..>..) in the condition
836 if (rightchar2 == '\'' )
837 throw MakeException(SR.GetString(SR.AlternationCantCapture));
839 if (charsRight >=4 && (rightchar2 == '<' && RightChar(3) != '!' && RightChar(3) != '='))
840 throw MakeException(SR.GetString(SR.AlternationCantCapture));
850 NodeType = RegexNode.Group;
852 if (CharsRight() == 0)
855 if ((ch = MoveRightGetChar()) == ')')
863 return new RegexNode(NodeType, _options);
868 // break Recognize comes here
870 throw MakeException(SR.GetString(SR.UnrecognizedGrouping));
874 * Scans whitespace or x-mode comments.
876 internal void ScanBlank() {
879 while (CharsRight() > 0 && IsSpace(RightChar()))
882 if (CharsRight() == 0)
885 if (RightChar() == '#') {
886 while (CharsRight() > 0 && RightChar() != '\n')
889 else if (CharsRight() >= 3 && RightChar(2) == '#' &&
890 RightChar(1) == '?' && RightChar() == '(') {
891 while (CharsRight() > 0 && RightChar() != ')')
893 if (CharsRight() == 0)
894 throw MakeException(SR.GetString(SR.UnterminatedComment));
903 if (CharsRight() < 3 || RightChar(2) != '#' ||
904 RightChar(1) != '?' || RightChar() != '(')
907 while (CharsRight() > 0 && RightChar() != ')')
909 if (CharsRight() == 0)
910 throw MakeException(SR.GetString(SR.UnterminatedComment));
917 * Scans chars following a '\' (not counting the '\'), and returns
918 * a RegexNode for the type of atom scanned.
920 internal RegexNode ScanBackslash() {
924 if (CharsRight() == 0)
925 throw MakeException(SR.GetString(SR.IllegalEndEscape));
927 switch (ch = RightChar()) {
935 return new RegexNode(TypeFromCode(ch), _options);
940 return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMAWordClass);
941 return new RegexNode(RegexNode.Set, _options, RegexCharClass.WordClass);
946 return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMAWordClass);
947 return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotWordClass);
952 return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMASpaceClass);
953 return new RegexNode(RegexNode.Set, _options, RegexCharClass.SpaceClass);
958 return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMASpaceClass);
959 return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotSpaceClass);
964 return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMADigitClass);
965 return new RegexNode(RegexNode.Set, _options, RegexCharClass.DigitClass);
970 return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMADigitClass);
971 return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotDigitClass);
976 cc = new RegexCharClass();
977 cc.AddCategoryFromName(ParseProperty(), (ch != 'p'), UseOptionI(), _pattern);
979 cc.AddLowercase(_culture);
981 return new RegexNode(RegexNode.Set, _options, cc.ToStringClass());
984 return ScanBasicBackslash();
989 * Scans \-style backreferences and character escapes
991 internal RegexNode ScanBasicBackslash() {
992 if (CharsRight() == 0)
993 throw MakeException(SR.GetString(SR.IllegalEndEscape));
1000 backpos = Textpos();
1003 // allow \k<foo> instead of \<foo>, which is now deprecated
1006 if (CharsRight() >= 2) {
1008 ch = MoveRightGetChar();
1010 if (ch == '<' || ch == '\'') {
1012 close = (ch == '\'') ? '\'' : '>';
1016 if (!angled || CharsRight() <= 0)
1017 throw MakeException(SR.GetString(SR.MalformedNameRef));
1022 // Note angle without \g <
1024 else if ((ch == '<' || ch == '\'') && CharsRight() > 1) {
1026 close = (ch == '\'') ? '\'' : '>';
1032 // Try to parse backreference: \<1> or \<cap>
1034 if (angled && ch >= '0' && ch <= '9') {
1035 int capnum = ScanDecimal();
1037 if (CharsRight() > 0 && MoveRightGetChar() == close) {
1038 if (IsCaptureSlot(capnum))
1039 return new RegexNode(RegexNode.Ref, _options, capnum);
1041 throw MakeException(SR.GetString(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture)));
1045 // Try to parse backreference or octal: \1
1047 else if (!angled && ch >= '1' && ch <= '9') {
1050 int newcapnum = (int)(ch - '0');
1051 int pos = Textpos() - 1;
1052 while (newcapnum <= _captop) {
1053 if (IsCaptureSlot(newcapnum) && (_caps == null || (int)_caps[newcapnum] < pos))
1056 if (CharsRight() == 0 || (ch = RightChar()) < '0' || ch > '9')
1058 newcapnum = newcapnum * 10 + (int)(ch - '0');
1061 return new RegexNode(RegexNode.Ref, _options, capnum);
1065 int capnum = ScanDecimal();
1066 if (IsCaptureSlot(capnum))
1067 return new RegexNode(RegexNode.Ref, _options, capnum);
1068 else if (capnum <= 9)
1069 throw MakeException(SR.GetString(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture)));
1073 else if (angled && RegexCharClass.IsWordChar(ch)) {
1074 String capname = ScanCapname();
1076 if (CharsRight() > 0 && MoveRightGetChar() == close) {
1077 if (IsCaptureName(capname))
1078 return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname));
1080 throw MakeException(SR.GetString(SR.UndefinedNameRef, capname));
1084 // Not backreference: must be char code
1087 ch = ScanCharEscape();
1090 ch = Char.ToLower(ch, _culture);
1092 return new RegexNode(RegexNode.One, _options, ch);
1096 * Scans $ patterns recognized within replacment patterns
1098 internal RegexNode ScanDollar() {
1099 if (CharsRight() == 0)
1100 return new RegexNode(RegexNode.One, _options, '$');
1102 char ch = RightChar();
1104 int backpos = Textpos();
1105 int lastEndPos = backpos;
1109 if (ch == '{' && CharsRight() > 1) {
1118 // Try to parse backreference: \1 or \{1} or \{cap}
1120 if (ch >= '0' && ch <= '9') {
1121 if (!angled && UseOptionE()) {
1123 int newcapnum = (int)(ch - '0');
1125 if (IsCaptureSlot(newcapnum)) {
1127 lastEndPos = Textpos();
1130 while (CharsRight() > 0 && (ch = RightChar()) >= '0' && ch <= '9') {
1131 int digit = (int)(ch - '0');
1132 if (newcapnum > (MaxValueDiv10) || (newcapnum == (MaxValueDiv10) && digit > (MaxValueMod10)))
1133 throw MakeException(SR.GetString(SR.CaptureGroupOutOfRange));
1135 newcapnum = newcapnum * 10 + digit;
1138 if (IsCaptureSlot(newcapnum)) {
1140 lastEndPos = Textpos();
1145 return new RegexNode(RegexNode.Ref, _options, capnum);
1149 int capnum = ScanDecimal();
1150 if (!angled || CharsRight() > 0 && MoveRightGetChar() == '}') {
1151 if (IsCaptureSlot(capnum))
1152 return new RegexNode(RegexNode.Ref, _options, capnum);
1156 else if (angled && RegexCharClass.IsWordChar(ch)) {
1157 String capname = ScanCapname();
1159 if (CharsRight() > 0 && MoveRightGetChar() == '}') {
1160 if (IsCaptureName(capname))
1161 return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname));
1170 return new RegexNode(RegexNode.One, _options, '$');
1177 capnum = RegexReplacement.LeftPortion;
1181 capnum = RegexReplacement.RightPortion;
1185 capnum = RegexReplacement.LastGroup;
1189 capnum = RegexReplacement.WholeString;
1195 return new RegexNode(RegexNode.Ref, _options, capnum);
1199 // unrecognized $: literalize
1202 return new RegexNode(RegexNode.One, _options, '$');
1206 * Scans a capture name: consumes word chars
1208 internal String ScanCapname() {
1209 int startpos = Textpos();
1211 while (CharsRight() > 0) {
1212 if (!RegexCharClass.IsWordChar(MoveRightGetChar())) {
1218 return _pattern.Substring(startpos, Textpos() - startpos);
1223 * Scans up to three octal digits (stops before exceeding 0377).
1225 internal char ScanOctal() {
1230 // Consume octal chars only up to 3 digits and value 0377
1234 if (c > CharsRight())
1237 for (i = 0; c > 0 && (uint)(d = RightChar() - '0') <= 7; c -= 1) {
1241 if (UseOptionE() && i >= 0x20)
1245 // Octal codes only go up to 255. Any larger and the behavior that Perl follows
1246 // is simply to truncate the high bits.
1253 * Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1255 internal int ScanDecimal() {
1259 while (CharsRight() > 0 && (uint)(d = (char)(RightChar() - '0')) <= 9) {
1262 if (i > (MaxValueDiv10) || (i == (MaxValueDiv10) && d > (MaxValueMod10)))
1263 throw MakeException(SR.GetString(SR.CaptureGroupOutOfRange));
1273 * Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1275 internal char ScanHex(int c) {
1281 if (CharsRight() >= c) {
1282 for (; c > 0 && ((d = HexDigit(MoveRightGetChar())) >= 0); c -= 1) {
1289 throw MakeException(SR.GetString(SR.TooFewHex));
1295 * Returns n <= 0xF for a hex digit.
1297 internal static int HexDigit(char ch) {
1300 if ((uint)(d = ch - '0') <= 9)
1303 if ((uint)(d = ch - 'a') <= 5)
1306 if ((uint)(d = ch - 'A') <= 5)
1313 * Grabs and converts an ascii control character
1315 internal char ScanControl() {
1318 if (CharsRight() <= 0)
1319 throw MakeException(SR.GetString(SR.MissingControl));
1321 ch = MoveRightGetChar();
1323 // \ca interpreted as \cA
1325 if (ch >= 'a' && ch <= 'z')
1326 ch = (char)(ch - ('a' - 'A'));
1328 if ((ch = (char)(ch - '@')) < ' ')
1331 throw MakeException(SR.GetString(SR.UnrecognizedControl));
1335 * Returns true for options allowed only at the top level
1337 internal bool IsOnlyTopOption(RegexOptions option) {
1338 return(option == RegexOptions.RightToLeft
1339 #if !(SILVERLIGHT||FULL_AOT_RUNTIME)
1340 || option == RegexOptions.Compiled
1342 || option == RegexOptions.CultureInvariant
1343 || option == RegexOptions.ECMAScript
1348 * Scans cimsx-cimsx option string, stops at the first unrecognized char.
1350 internal void ScanOptions() {
1353 RegexOptions option;
1355 for (off = false; CharsRight() > 0; MoveRight()) {
1361 else if (ch == '+') {
1365 option = OptionFromCode(ch);
1366 if (option == 0 || IsOnlyTopOption(option))
1370 _options &= ~option;
1378 * Scans \ code for escape codes that map to single unicode chars.
1380 internal char ScanCharEscape() {
1383 ch = MoveRightGetChar();
1385 if (ch >= '0' && ch <= '7') {
1412 return ScanControl();
1414 if (!UseOptionE() && RegexCharClass.IsWordChar(ch))
1415 throw MakeException(SR.GetString(SR.UnrecognizedEscape, ch.ToString()));
1421 * Scans X for \p{X} or \P{X}
1423 internal String ParseProperty() {
1424 if (CharsRight() < 3) {
1425 throw MakeException(SR.GetString(SR.IncompleteSlashP));
1427 char ch = MoveRightGetChar();
1429 throw MakeException(SR.GetString(SR.MalformedSlashP));
1432 int startpos = Textpos();
1433 while (CharsRight() > 0) {
1434 ch = MoveRightGetChar();
1435 if (!(RegexCharClass.IsWordChar(ch) || ch == '-')) {
1440 String capname = _pattern.Substring(startpos, Textpos() - startpos);
1442 if (CharsRight() == 0 || MoveRightGetChar() != '}')
1443 throw MakeException(SR.GetString(SR.IncompleteSlashP));
1449 * Returns ReNode type for zero-length assertions with a \ code.
1451 internal int TypeFromCode(char ch) {
1454 return UseOptionE() ? RegexNode.ECMABoundary : RegexNode.Boundary;
1456 return UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.Nonboundary;
1458 return RegexNode.Beginning;
1460 return RegexNode.Start;
1462 return RegexNode.EndZ;
1464 return RegexNode.End;
1466 return RegexNode.Nothing;
1471 * Returns option bit from single-char (?cimsx) code.
1473 internal static RegexOptions OptionFromCode(char ch) {
1475 if (ch >= 'A' && ch <= 'Z')
1476 ch += (char)('a' - 'A');
1479 #if !(SILVERLIGHT||FULL_AOT_RUNTIME)
1481 return RegexOptions.Compiled;
1484 return RegexOptions.IgnoreCase;
1486 return RegexOptions.RightToLeft;
1488 return RegexOptions.Multiline;
1490 return RegexOptions.ExplicitCapture;
1492 return RegexOptions.Singleline;
1494 return RegexOptions.IgnorePatternWhitespace;
1497 return RegexOptions.Debug;
1500 return RegexOptions.ECMAScript;
1507 * a prescanner for deducing the slots used for
1508 * captures by doing a partial tokenization of the pattern.
1510 internal void CountCaptures() {
1513 NoteCaptureSlot(0, 0);
1517 while (CharsRight() > 0) {
1518 int pos = Textpos();
1519 ch = MoveRightGetChar();
1522 if (CharsRight() > 0)
1534 ScanCharClass(false, true);
1538 if (!EmptyOptionsStack())
1543 if (CharsRight() >= 2 && RightChar(1) == '#' && RightChar() == '?') {
1550 if (CharsRight() > 0 && RightChar() == '?') {
1554 if (CharsRight() > 1 && (RightChar() == '<' || RightChar() == '\'')) {
1555 // named group: (?<... or (?'...
1560 if (ch != '0' && RegexCharClass.IsWordChar(ch)) {
1561 //if (_ignoreNextParen)
1562 // throw MakeException(SR.GetString(SR.AlternationCantCapture));
1563 if (ch >= '1' && ch <= '9')
1564 NoteCaptureSlot(ScanDecimal(), pos);
1566 NoteCaptureName(ScanCapname(), pos);
1572 // get the options if it's an option construct (?cimsx-cimsx...)
1575 if (CharsRight() > 0) {
1576 if (RightChar() == ')') {
1581 else if (RightChar() == '(') {
1582 // alternation construct: (?(foo)yes|no)
1583 // ignore the next paren so we don't capture the condition
1584 _ignoreNextParen = true;
1586 // break from here so we don't reset _ignoreNextParen
1593 if (!UseOptionN() && !_ignoreNextParen)
1594 NoteCaptureSlot(_autocap++, pos);
1598 _ignoreNextParen = false;
1607 * Notes a used capture slot
1609 internal void NoteCaptureSlot(int i, int pos) {
1610 if (!_caps.ContainsKey(i)) {
1611 // the rhs of the hashtable isn't used in the parser
1617 if (i == Int32.MaxValue)
1626 * Notes a used capture slot
1628 internal void NoteCaptureName(String name, int pos) {
1629 if (_capnames == null) {
1631 _capnames = new Dictionary<String, Int32>();
1633 _capnames = new Hashtable();
1635 _capnamelist = new List<String>();
1638 if (!_capnames.ContainsKey(name)) {
1639 _capnames.Add(name, pos);
1640 _capnamelist.Add(name);
1645 * For when all the used captures are known: note them all at once
1648 internal void NoteCaptures(Dictionary<Int32, Int32> caps, int capsize, Dictionary<String, Int32> capnames) {
1650 internal void NoteCaptures(Hashtable caps, int capsize, Hashtable capnames) {
1654 _capnames = capnames;
1658 * Assigns unused slot numbers to the capture names
1660 internal void AssignNameSlots() {
1661 if (_capnames != null) {
1662 for (int i = 0; i < _capnamelist.Count; i++) {
1663 while (IsCaptureSlot(_autocap))
1665 string name = _capnamelist[i];
1666 int pos = (int)_capnames[name];
1667 _capnames[name] = _autocap;
1668 NoteCaptureSlot(_autocap, pos);
1674 // if the caps array has at least one gap, construct the list of used slots
1676 if (_capcount < _captop) {
1677 _capnumlist = new Int32[_capcount];
1680 for (IDictionaryEnumerator de = _caps.GetEnumerator(); de.MoveNext(); )
1681 _capnumlist[i++] = (int)de.Key;
1683 System.Array.Sort(_capnumlist, Comparer<Int32>.Default);
1686 // merge capsnumlist into capnamelist
1688 if (_capnames != null || _capnumlist != null) {
1689 List<String> oldcapnamelist;
1693 if (_capnames == null) {
1694 oldcapnamelist = null;
1696 _capnames = new Dictionary<String, Int32>();
1698 _capnames = new Hashtable();
1700 _capnamelist = new List<String>();
1704 oldcapnamelist = _capnamelist;
1705 _capnamelist = new List<String>();
1706 next = (int)_capnames[oldcapnamelist[0]];
1709 for (int i = 0; i < _capcount; i++) {
1710 int j = (_capnumlist == null) ? i : (int)_capnumlist[i];
1713 _capnamelist.Add(oldcapnamelist[k++]);
1714 next = (k == oldcapnamelist.Count) ? -1 : (int)_capnames[oldcapnamelist[k]];
1717 String str = Convert.ToString(j, _culture);
1718 _capnamelist.Add(str);
1726 * Looks up the slot number for a given name
1728 internal int CaptureSlotFromName(String capname) {
1729 return(int)_capnames[capname];
1733 * True if the capture slot was noted
1735 internal bool IsCaptureSlot(int i) {
1737 return _caps.ContainsKey(i);
1739 return(i >= 0 && i < _capsize);
1743 * Looks up the slot number for a given name
1745 internal bool IsCaptureName(String capname) {
1746 if (_capnames == null)
1749 return _capnames.ContainsKey(capname);
1753 * True if N option disabling '(' autocapture is on.
1755 internal bool UseOptionN() {
1756 return(_options & RegexOptions.ExplicitCapture) != 0;
1760 * True if I option enabling case-insensitivity is on.
1762 internal bool UseOptionI() {
1763 return(_options & RegexOptions.IgnoreCase) != 0;
1767 * True if M option altering meaning of $ and ^ is on.
1769 internal bool UseOptionM() {
1770 return(_options & RegexOptions.Multiline) != 0;
1774 * True if S option altering meaning of . is on.
1776 internal bool UseOptionS() {
1777 return(_options & RegexOptions.Singleline) != 0;
1781 * True if X option enabling whitespace/comment mode is on.
1783 internal bool UseOptionX() {
1784 return(_options & RegexOptions.IgnorePatternWhitespace) != 0;
1788 * True if E option enabling ECMAScript behavior is on.
1790 internal bool UseOptionE() {
1791 return(_options & RegexOptions.ECMAScript) != 0;
1794 internal const byte Q = 5; // quantifier
1795 internal const byte S = 4; // ordinary stoppper
1796 internal const byte Z = 3; // ScanBlank stopper
1797 internal const byte X = 2; // whitespace
1798 internal const byte E = 1; // should be escaped
1801 * For categorizing ascii characters.
1803 internal static readonly byte[] _category = new byte[] {
1804 // 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
1805 0,0,0,0,0,0,0,0,0,X,X,0,X,X,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1806 // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
1807 X,0,0,Z,S,0,0,0,S,S,Q,Q,0,0,S,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Q,
1808 // @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
1809 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,S,S,0,S,0,
1810 // ' a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
1811 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Q,S,0,0,0};
1814 * Returns true for those characters that terminate a string of ordinary chars.
1816 internal static bool IsSpecial(char ch) {
1817 return(ch <= '|' && _category[ch] >= S);
1821 * Returns true for those characters that terminate a string of ordinary chars.
1823 internal static bool IsStopperX(char ch) {
1824 return(ch <= '|' && _category[ch] >= X);
1828 * Returns true for those characters that begin a quantifier.
1830 internal static bool IsQuantifier(char ch) {
1831 return(ch <= '{' && _category[ch] >= Q);
1834 internal bool IsTrueQuantifier() {
1835 int nChars = CharsRight();
1838 int startpos = Textpos();
1839 char ch = CharAt(startpos);
1841 return ch <= '{' && _category[ch] >= Q;
1843 while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ;
1844 if (nChars == 0 || pos - startpos == 1)
1850 while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ;
1851 return nChars > 0 && ch == '}';
1855 * Returns true for whitespace.
1857 internal static bool IsSpace(char ch) {
1858 return(ch <= ' ' && _category[ch] == X);
1862 * Returns true for chars that should be escaped.
1864 internal static bool IsMetachar(char ch) {
1865 return(ch <= '|' && _category[ch] >= E);
1870 * Add a string to the last concatenate.
1872 internal void AddConcatenate(int pos, int cch, bool isReplacement) {
1879 String str = _pattern.Substring(pos, cch);
1881 if (UseOptionI() && !isReplacement) {
1882 // We do the ToLower character by character for consistency. With surrogate chars, doing
1883 // a ToLower on the entire string could actually change the surrogate pair. This is more correct
1884 // linguistically, but since Regex doesn't support surrogates, it's more important to be
1886 StringBuilder sb = new StringBuilder(str.Length);
1887 for (int i=0; i<str.Length; i++)
1888 sb.Append(Char.ToLower(str[i], _culture));
1889 str = sb.ToString();
1892 node = new RegexNode(RegexNode.Multi, _options, str);
1895 char ch = _pattern[pos];
1897 if (UseOptionI() && !isReplacement)
1898 ch = Char.ToLower(ch, _culture);
1900 node = new RegexNode(RegexNode.One, _options, ch);
1903 _concatenation.AddChild(node);
1907 * Push the parser state (in response to an open paren)
1909 internal void PushGroup() {
1910 _group._next = _stack;
1911 _alternation._next = _group;
1912 _concatenation._next = _alternation;
1913 _stack = _concatenation;
1917 * Remember the pushed state (in response to a ')')
1919 internal void PopGroup() {
1920 _concatenation = _stack;
1921 _alternation = _concatenation._next;
1922 _group = _alternation._next;
1923 _stack = _group._next;
1925 // The first () inside a Testgroup group goes directly to the group
1926 if (_group.Type() == RegexNode.Testgroup && _group.ChildCount() == 0) {
1928 throw MakeException(SR.GetString(SR.IllegalCondition));
1930 _group.AddChild(_unit);
1936 * True if the group stack is empty.
1938 internal bool EmptyStack() {
1939 return _stack == null;
1943 * Start a new round for the parser state (in response to an open paren or string start)
1945 internal void StartGroup(RegexNode openGroup) {
1947 _alternation = new RegexNode(RegexNode.Alternate, _options);
1948 _concatenation = new RegexNode(RegexNode.Concatenate, _options);
1952 * Finish the current concatenation (in response to a |)
1954 internal void AddAlternate() {
1955 // The | parts inside a Testgroup group go directly to the group
1957 if (_group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref) {
1958 _group.AddChild(_concatenation.ReverseLeft());
1961 _alternation.AddChild(_concatenation.ReverseLeft());
1964 _concatenation = new RegexNode(RegexNode.Concatenate, _options);
1968 * Finish the current quantifiable (when a quantifier is not found or is not possible)
1970 internal void AddConcatenate() {
1971 // The first (| inside a Testgroup group goes directly to the group
1973 _concatenation.AddChild(_unit);
1978 * Finish the current quantifiable (when a quantifier is found)
1980 internal void AddConcatenate(bool lazy, int min, int max) {
1981 _concatenation.AddChild(_unit.MakeQuantifier(lazy, min, max));
1986 * Returns the current unit
1988 internal RegexNode Unit() {
1993 * Sets the current unit to a single char node
1995 internal void AddUnitOne(char ch) {
1997 ch = Char.ToLower(ch, _culture);
1999 _unit = new RegexNode(RegexNode.One, _options, ch);
2003 * Sets the current unit to a single inverse-char node
2005 internal void AddUnitNotone(char ch) {
2007 ch = Char.ToLower(ch, _culture);
2009 _unit = new RegexNode(RegexNode.Notone, _options, ch);
2013 * Sets the current unit to a single set node
2015 internal void AddUnitSet(string cc) {
2016 _unit = new RegexNode(RegexNode.Set, _options, cc);
2020 * Sets the current unit to a subtree
2022 internal void AddUnitNode(RegexNode node) {
2027 * Sets the current unit to an assertion of the specified type
2029 internal void AddUnitType(int type) {
2030 _unit = new RegexNode(type, _options);
2034 * Finish the current group (in response to a ')' or end)
2036 internal void AddGroup() {
2037 if (_group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref) {
2038 _group.AddChild(_concatenation.ReverseLeft());
2040 if (_group.Type() == RegexNode.Testref && _group.ChildCount() > 2 || _group.ChildCount() > 3)
2041 throw MakeException(SR.GetString(SR.TooManyAlternates));
2044 _alternation.AddChild(_concatenation.ReverseLeft());
2045 _group.AddChild(_alternation);
2052 * Saves options on a stack.
2054 internal void PushOptions() {
2055 _optionsStack.Add(_options);
2059 * Recalls options from the stack.
2061 internal void PopOptions() {
2062 _options = _optionsStack[_optionsStack.Count - 1];
2063 _optionsStack.RemoveAt(_optionsStack.Count - 1);
2067 * True if options stack is empty.
2069 internal bool EmptyOptionsStack() {
2070 return(_optionsStack.Count == 0);
2074 * Pops the option stack, but keeps the current options unchanged.
2076 internal void PopKeepOptions() {
2077 _optionsStack.RemoveAt(_optionsStack.Count - 1);
2081 * Fills in an ArgumentException
2083 internal ArgumentException MakeException(String message) {
2084 return new ArgumentException(SR.GetString(SR.MakeException, _pattern, message));
2088 * Returns the current parsing position.
2090 internal int Textpos() {
2095 * Zaps to a specific parsing position.
2097 internal void Textto(int pos) {
2102 * Returns the char at the right of the current parsing position and advances to the right.
2104 internal char MoveRightGetChar() {
2105 return _pattern[_currentPos++];
2109 * Moves the current position to the right.
2111 internal void MoveRight() {
2115 internal void MoveRight(int i) {
2120 * Moves the current parsing position one to the left.
2122 internal void MoveLeft() {
2127 * Returns the char left of the current parsing position.
2129 internal char CharAt(int i) {
2134 * Returns the char right of the current parsing position.
2136 internal char RightChar() {
2137 return _pattern[_currentPos];
2141 * Returns the char i chars right of the current parsing position.
2143 internal char RightChar(int i) {
2144 return _pattern[_currentPos + i];
2148 * Number of characters to the right of the current parsing position.
2150 internal int CharsRight() {
2151 return _pattern.Length - _currentPos;