3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
32 using System.Globalization;
34 namespace System.Text.RegularExpressions.Syntax {
37 public static int ParseDecimal (string str, ref int ptr) {
38 return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
41 public static int ParseOctal (string str, ref int ptr) {
42 return ParseNumber (str, ref ptr, 8, 1, 3);
45 public static int ParseHex (string str, ref int ptr, int digits) {
46 return ParseNumber (str, ref ptr, 16, digits, digits);
49 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50 int p = ptr, n = 0, digits = 0, d;
54 while (digits < max && p < str.Length) {
55 d = ParseDigit (str[p ++], b, digits);
72 public static string ParseName (string str, ref int ptr) {
73 if (Char.IsDigit (str[ptr])) {
74 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
76 return gid.ToString ();
83 if (!IsNameChar (str[ptr]))
89 return str.Substring (start, ptr - start);
94 public static string Escape (string str) {
96 for (int i = 0; i < str.Length; ++ i) {
99 case '\\': case '*': case '+': case '?': case '|':
100 case '{': case '[': case '(': case ')': case '^':
101 case '$': case '.': case '#': case ' ':
105 case '\t': result += "\\t"; break;
106 case '\n': result += "\\n"; break;
107 case '\r': result += "\\r"; break;
108 case '\f': result += "\\f"; break;
110 default: result += c; break;
117 public static string Unescape (string str) {
118 if (str.IndexOf ('\\') == -1)
120 return new Parser ().ParseString (str);
126 this.caps = new ArrayList ();
127 this.refs = new Hashtable ();
130 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
131 this.pattern = pattern;
139 RegularExpression re = new RegularExpression ();
140 ParseGroup (re, options, null);
141 ResolveReferences ();
143 re.GroupCount = num_groups;
147 catch (IndexOutOfRangeException) {
148 throw NewParseException ("Unexpected end of pattern.");
152 public int GetMapping (Hashtable mapping)
154 int end = caps.Count;
155 mapping.Add ("0", 0);
156 for (int i = 0; i < end; i++) {
157 CapturingGroup group = (CapturingGroup) caps [i];
158 string name = group.Name != null ? group.Name : group.Index.ToString ();
159 if (mapping.Contains (name)) {
160 if ((int) mapping [name] != group.Index)
161 throw new SystemException ("invalid state");
164 mapping.Add (name, group.Index);
172 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
173 bool is_top_level = group is RegularExpression;
175 Alternation alternation = null;
176 string literal = null;
178 Group current = new Group ();
179 Expression expr = null;
183 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
184 if (ptr >= pattern.Length)
187 // (1) Parse for Expressions
189 char ch = pattern[ptr ++];
194 IsMultiline (options) ? Position.StartOfLine : Position.Start;
195 expr = new PositionAssertion (pos);
201 IsMultiline (options) ? Position.EndOfLine : Position.End;
202 expr = new PositionAssertion (pos);
208 IsSingleline (options) ? Category.AnySingleline : Category.Any;
209 expr = new CharacterClass (cat, false);
214 int c = ParseEscape (false);
218 expr = ParseSpecial (options);
221 ch = pattern[ptr ++]; // default escape
227 expr = ParseCharacterClass (options);
232 bool ignore = IsIgnoreCase (options);
233 expr = ParseGroupingConstruct (ref options);
235 if (literal != null && IsIgnoreCase (options) != ignore) {
236 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
251 if (literal != null) {
252 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
256 if (assertion != null) {
257 if (assertion.TrueExpression == null)
258 assertion.TrueExpression = current;
259 else if (assertion.FalseExpression == null)
260 assertion.FalseExpression = current;
262 throw NewParseException ("Too many | in (?()|).");
265 if (alternation == null)
266 alternation = new Alternation ();
268 alternation.AddAlternative (current);
271 current = new Group ();
275 case '*': case '+': case '?': {
276 throw NewParseException ("Bad quantifier.");
280 break; // literal character
283 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
285 // (2) Check for Repetitions
287 if (ptr < pattern.Length) {
288 char k = pattern[ptr];
289 int min = 0, max = 0;
291 bool haveRep = false;
294 if (k == '?' || k == '*' || k == '+') {
299 case '?': min = 0; max = 1; break;
300 case '*': min = 0; max = 0x7fffffff; break;
301 case '+': min = 1; max = 0x7fffffff; break;
303 } else if (k == '{' && ptr + 1 < pattern.Length) {
306 haveRep = ParseRepetitionBounds (out min, out max, options);
312 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
313 if (ptr < pattern.Length && pattern[ptr] == '?') {
318 //It doesn't make sense to assert a given position more than once.
319 bool ignore_repetition = false;
320 if (expr is PositionAssertion) {
321 ignore_repetition = min > 0 && !lazy;
325 if (!ignore_repetition) {
326 Repetition repetition = new Repetition (min, max, lazy);
329 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
331 repetition.Expression = expr;
338 // (3) Append Expression and/or Literal
346 if (literal != null) {
347 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
351 current.AppendExpression (expr);
355 if (is_top_level && ptr >= pattern.Length)
360 if (is_top_level && closed)
361 throw NewParseException ("Too many )'s.");
362 if (!is_top_level && !closed)
363 throw NewParseException ("Not enough )'s.");
366 // clean up literals and alternations
369 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
371 if (assertion != null) {
372 if (assertion.TrueExpression == null)
373 assertion.TrueExpression = current;
375 assertion.FalseExpression = current;
377 group.AppendExpression (assertion);
379 else if (alternation != null) {
380 alternation.AddAlternative (current);
381 group.AppendExpression (alternation);
384 group.AppendExpression (current);
387 private Expression ParseGroupingConstruct (ref RegexOptions options) {
388 if (pattern[ptr] != '?') {
391 if (IsExplicitCapture (options))
392 group = new Group ();
394 group = new CapturingGroup ();
398 ParseGroup (group, options, null);
404 switch (pattern[ptr]) {
405 case ':': { // non-capturing group
407 Group group = new Group ();
408 ParseGroup (group, options, null);
413 case '>': { // non-backtracking group
415 Group group = new NonBacktrackingGroup ();
416 ParseGroup (group, options, null);
421 case 'i': case 'm': case 'n':
422 case 's': case 'x': case '-': { // options
423 RegexOptions o = options;
424 ParseOptions (ref o, false);
425 if (pattern[ptr] == '-') {
427 ParseOptions (ref o, true);
430 if (pattern[ptr] == ':') { // pass options to child group
432 Group group = new Group ();
433 ParseGroup (group, o, null);
436 else if (pattern[ptr] == ')') { // change options of enclosing group
442 throw NewParseException ("Bad options");
445 case '<': case '=': case '!': { // lookahead/lookbehind
446 ExpressionAssertion asn = new ExpressionAssertion ();
447 if (!ParseAssertionType (asn))
448 goto case '\''; // it's a (?<name> ) construct
450 Group test = new Group ();
451 ParseGroup (test, options, null);
453 asn.TestExpression = test;
457 case '\'': { // named/balancing group
459 if (pattern[ptr] == '<')
465 string name = ParseName ();
467 if (pattern[ptr] == delim) {
471 throw NewParseException ("Bad group name.");
474 CapturingGroup cap = new CapturingGroup ();
477 ParseGroup (cap, options, null);
481 else if (pattern[ptr] == '-') {
485 string balance_name = ParseName ();
486 if (balance_name == null || pattern[ptr] != delim)
487 throw NewParseException ("Bad balancing group name.");
490 BalancingGroup bal = new BalancingGroup ();
497 refs.Add (bal, balance_name);
499 ParseGroup (bal, options, null);
504 throw NewParseException ("Bad group name.");
507 case '(': { // expression/capture test
512 string name = ParseName ();
513 if (name == null || pattern[ptr] != ')') { // expression test
514 // FIXME MS implementation doesn't seem to
515 // implement this version of (?(x) ...)
518 ExpressionAssertion expr_asn = new ExpressionAssertion ();
520 if (pattern[ptr] == '?') {
522 if (!ParseAssertionType (expr_asn))
523 throw NewParseException ("Bad conditional.");
526 expr_asn.Negate = false;
527 expr_asn.Reverse = false;
530 Group test = new Group ();
531 ParseGroup (test, options, null);
532 expr_asn.TestExpression = test;
535 else { // capture test
537 asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
538 refs.Add (asn, name);
541 Group group = new Group ();
542 ParseGroup (group, options, asn);
546 case '#': { // comment
548 while (pattern[ptr ++] != ')') {
549 if (ptr >= pattern.Length)
550 throw NewParseException ("Unterminated (?#...) comment.");
556 throw NewParseException ("Bad grouping construct.");
560 private bool ParseAssertionType (ExpressionAssertion assertion) {
561 if (pattern[ptr] == '<') {
562 switch (pattern[ptr + 1]) {
564 assertion.Negate = false;
567 assertion.Negate = true;
573 assertion.Reverse = true;
577 switch (pattern[ptr]) {
579 assertion.Negate = false;
582 assertion.Negate = true;
588 assertion.Reverse = false;
595 private void ParseOptions (ref RegexOptions options, bool negate) {
597 switch (pattern[ptr]) {
600 options &= ~RegexOptions.IgnoreCase;
602 options |= RegexOptions.IgnoreCase;
607 options &= ~RegexOptions.Multiline;
609 options |= RegexOptions.Multiline;
614 options &= ~RegexOptions.ExplicitCapture;
616 options |= RegexOptions.ExplicitCapture;
621 options &= ~RegexOptions.Singleline;
623 options |= RegexOptions.Singleline;
628 options &= ~RegexOptions.IgnorePatternWhitespace;
630 options |= RegexOptions.IgnorePatternWhitespace;
641 private Expression ParseCharacterClass (RegexOptions options) {
643 if (pattern[ptr] == '^') {
648 bool ecma = IsECMAScript (options);
649 CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
651 if (pattern[ptr] == ']') {
652 cls.AddCharacter (']');
660 while (ptr < pattern.Length) {
668 if (c == '-' && last >= 0 && !range) {
674 c = ParseEscape (true);
676 goto char_recognized;
678 // didn't recognize escape
679 c = pattern [ptr ++];
683 goto char_recognized;
686 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
690 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
694 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
698 cls.AddCategory (ParseUnicodeCategory (), c == 'P'); // ignore ecma
701 default: // add escaped character
702 goto char_recognized;
705 // if the pattern looks like [a-\s] ...
707 throw NewParseException ("character range cannot have category \\" + c);
715 // if 'range' is true, we know that 'last >= 0'
717 throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
718 cls.AddRange ((char)last, (char)c);
724 cls.AddCharacter ((char)c);
729 throw NewParseException ("Unterminated [] set.");
732 cls.AddCharacter ('-');
737 private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
743 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
745 if (pattern[ptr] == ',') {
748 n = ParseNumber (10, 1, 0);
749 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
752 switch (pattern[ptr ++]) {
757 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
758 m = ParseNumber (10, 1, 0);
759 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
760 if (pattern[ptr ++] != '}')
767 /* check bounds and ordering */
769 if (n > 0x7fffffff || m > 0x7fffffff)
770 throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
772 throw NewParseException ("Illegal {x, y} with x > y.");
774 /* assign min and max */
785 private Category ParseUnicodeCategory () {
786 if (pattern[ptr ++] != '{')
787 throw NewParseException ("Incomplete \\p{X} character escape.");
789 string name = ParseName (pattern, ref ptr);
791 throw NewParseException ("Incomplete \\p{X} character escape.");
793 Category cat = CategoryUtils.CategoryFromName (name);
794 if (cat == Category.None)
795 throw NewParseException ("Unknown property '" + name + "'.");
797 if (pattern[ptr ++] != '}')
798 throw NewParseException ("Incomplete \\p{X} character escape.");
803 private Expression ParseSpecial (RegexOptions options) {
805 bool ecma = IsECMAScript (options);
806 Expression expr = null;
808 switch (pattern[ptr ++]) {
813 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
817 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
821 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
825 // this is odd - ECMAScript isn't supposed to support Unicode,
826 // yet \p{..} compiles and runs under the MS implementation
827 // identically to canonical mode. That's why I'm ignoring the
828 // value of ecma here.
830 expr = new CharacterClass (ParseUnicodeCategory (), false);
834 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
838 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
842 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
846 expr = new CharacterClass (ParseUnicodeCategory (), true);
851 case 'A': expr = new PositionAssertion (Position.StartOfString); break;
852 case 'Z': expr = new PositionAssertion (Position.End); break;
853 case 'z': expr = new PositionAssertion (Position.EndOfString); break;
854 case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
855 case 'b': expr = new PositionAssertion (Position.Boundary); break;
856 case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
860 case '1': case '2': case '3': case '4': case '5':
861 case '6': case '7': case '8': case '9': {
864 int n = ParseNumber (10, 1, 0);
870 // FIXME test if number is within number of assigned groups
871 // this may present a problem for right-to-left matching
873 Reference reference = new BackslashNumber (IsIgnoreCase (options), ecma);
874 refs.Add (reference, n.ToString ());
880 char delim = pattern[ptr ++];
883 else if (delim != '\'')
884 throw NewParseException ("Malformed \\k<...> named backreference.");
886 string name = ParseName ();
887 if (name == null || pattern[ptr] != delim)
888 throw NewParseException ("Malformed \\k<...> named backreference.");
891 Reference reference = new Reference (IsIgnoreCase (options));
892 refs.Add (reference, name);
908 private int ParseEscape (bool inCharacterClass) {
912 if (p >= pattern.Length)
913 throw new ArgumentException (
914 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
915 "pattern.", pattern), pattern);
917 switch (pattern[ptr ++]) {
919 // standard escapes (except \b)
921 case 'a': return '\u0007';
922 case 't': return '\u0009';
923 case 'r': return '\u000d';
924 case 'v': return '\u000b';
925 case 'f': return '\u000c';
926 case 'n': return '\u000a';
927 case 'e': return '\u001b';
928 case '\\': return '\\';
935 // Turns out that octal values can be specified
936 // without a leading zero. But also the limit
937 // of three character should include this first
942 int result = ParseOctal (pattern, ref ptr);
943 if (result == -1 && prevptr == ptr)
948 case '1': case '2': case '3': case '4': case '5':
950 if (inCharacterClass){
952 return ParseOctal (pattern, ref ptr);
957 c = ParseHex (pattern, ref ptr, 2);
959 throw NewParseException ("Insufficient hex digits");
964 c = ParseHex (pattern, ref ptr, 4);
966 throw NewParseException ("Insufficient hex digits");
970 // control characters
974 if (c >= '@' && c <= '_')
977 throw NewParseException ("Unrecognized control character.");
985 private string ParseName () {
986 return Parser.ParseName (pattern, ref ptr);
989 private static bool IsNameChar (char c) {
990 UnicodeCategory cat = Char.GetUnicodeCategory (c);
991 if (cat == UnicodeCategory.ModifierLetter)
993 if (cat == UnicodeCategory.ConnectorPunctuation)
995 return Char.IsLetterOrDigit (c);
998 private int ParseNumber (int b, int min, int max) {
999 return Parser.ParseNumber (pattern, ref ptr, b, min, max);
1002 private static int ParseDigit (char c, int b, int n) {
1005 if (c >= '0' && c <= '7')
1010 if (c >= '0' && c <= '9')
1015 if (c >= '0' && c <= '9')
1017 else if (c >= 'a' && c <= 'f')
1018 return 10 + c - 'a';
1019 else if (c >= 'A' && c <= 'F')
1020 return 10 + c - 'A';
1028 private void ConsumeWhitespace (bool ignore) {
1029 while (ptr < pattern.Length) {
1030 if (pattern[ptr] == '(') {
1031 if (ptr + 3 >= pattern.Length)
1034 if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1038 while (ptr < pattern.Length && pattern[ptr ++] != ')')
1041 else if (ignore && pattern[ptr] == '#') {
1042 while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1045 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1046 while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1054 private string ParseString (string pattern) {
1055 this.pattern = pattern;
1058 StringBuilder result = new StringBuilder (pattern.Length);
1059 while (ptr < pattern.Length) {
1060 int c = pattern[ptr ++];
1062 c = ParseEscape (false);
1065 c = pattern[ptr ++];
1070 result.Append ((char) c);
1073 return result.ToString ();
1076 private void ResolveReferences ()
1079 Hashtable dict = new Hashtable ();
1080 ArrayList explicit_numeric_groups = null;
1082 // number unnamed groups
1084 foreach (CapturingGroup group in caps) {
1085 if (group.Name != null)
1088 dict.Add (gid.ToString (), group);
1089 group.Index = gid ++;
1093 // number named groups
1095 foreach (CapturingGroup group in caps) {
1096 if (group.Name == null)
1099 if (dict.Contains (group.Name)) {
1100 CapturingGroup prev = (CapturingGroup) dict [group.Name];
1101 group.Index = prev.Index;
1103 if (group.Index == gid)
1105 else if (group.Index > gid)
1106 explicit_numeric_groups.Add (group);
1110 if (Char.IsDigit (group.Name [0])) {
1112 int group_gid = ParseDecimal (group.Name, ref ptr);
1113 if (ptr == group.Name.Length) {
1114 group.Index = group_gid;
1115 dict.Add (group.Name, group);
1118 if (group_gid == gid) {
1121 // all numbers before 'gid' are already in the dictionary. So, we know group_gid > gid
1122 if (explicit_numeric_groups == null)
1123 explicit_numeric_groups = new ArrayList (4);
1124 explicit_numeric_groups.Add (group);
1131 string gid_s = gid.ToString ();
1132 while (dict.Contains (gid_s))
1133 gid_s = (++gid).ToString ();
1135 dict.Add (gid_s, group);
1136 dict.Add (group.Name, group);
1137 group.Index = gid ++;
1141 gap = gid; // == 1 + num_groups, if explicit_numeric_groups == null
1143 if (explicit_numeric_groups != null)
1144 HandleExplicitNumericGroups (explicit_numeric_groups);
1146 // resolve references
1148 foreach (Expression expr in refs.Keys) {
1149 string name = (string) refs [expr];
1150 if (!dict.Contains (name)) {
1151 if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
1153 BackslashNumber bn = expr as BackslashNumber;
1154 if (bn != null && bn.ResolveReference (name, dict))
1156 throw NewParseException ("Reference to undefined group " +
1157 (Char.IsDigit (name[0]) ? "number " : "name ") +
1161 CapturingGroup group = (CapturingGroup)dict[name];
1162 if (expr is Reference)
1163 ((Reference)expr).CapturingGroup = group;
1164 else if (expr is CaptureAssertion)
1165 ((CaptureAssertion)expr).CapturingGroup = group;
1166 else if (expr is BalancingGroup)
1167 ((BalancingGroup)expr).Balance = group;
1171 private void HandleExplicitNumericGroups (ArrayList explicit_numeric_groups)
1175 int n_explicit = explicit_numeric_groups.Count;
1177 explicit_numeric_groups.Sort ();
1179 // move 'gap' forward to skip over all explicit groups that
1180 // turn out to match their index
1181 for (; i < n_explicit; ++i) {
1182 CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
1191 // re-index all further groups so that the indexes are contiguous
1193 for (; i < n_explicit; ++i) {
1194 CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
1195 if (g.Index == prev) {
1204 // flag helper functions
1206 private static bool IsIgnoreCase (RegexOptions options) {
1207 return (options & RegexOptions.IgnoreCase) != 0;
1210 private static bool IsMultiline (RegexOptions options) {
1211 return (options & RegexOptions.Multiline) != 0;
1214 private static bool IsExplicitCapture (RegexOptions options) {
1215 return (options & RegexOptions.ExplicitCapture) != 0;
1218 private static bool IsSingleline (RegexOptions options) {
1219 return (options & RegexOptions.Singleline) != 0;
1222 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1223 return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1226 private static bool IsECMAScript (RegexOptions options) {
1227 return (options & RegexOptions.ECMAScript) != 0;
1230 // exception creation
1232 private ArgumentException NewParseException (string msg) {
1233 msg = "parsing \"" + pattern + "\" - " + msg;
1234 return new ArgumentException (msg, pattern);
1237 private string pattern;
1240 private ArrayList caps;
1241 private Hashtable refs;
1242 private int num_groups;