3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 using System.Collections;
11 using System.Globalization;
13 namespace System.Text.RegularExpressions.Syntax {
16 public static int ParseDecimal (string str, ref int ptr) {
17 return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
20 public static int ParseOctal (string str, ref int ptr) {
21 return ParseNumber (str, ref ptr, 8, 1, 3);
24 public static int ParseHex (string str, ref int ptr, int digits) {
25 return ParseNumber (str, ref ptr, 16, digits, digits);
28 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
29 int p = ptr, n = 0, digits = 0, d;
33 while (digits < max && p < str.Length) {
34 d = ParseDigit (str[p ++], b, digits);
51 public static string ParseName (string str, ref int ptr) {
52 if (Char.IsDigit (str[ptr])) {
53 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
55 return gid.ToString ();
62 if (!IsNameChar (str[ptr]))
68 return str.Substring (start, ptr - start);
73 public static string Escape (string str) {
75 for (int i = 0; i < str.Length; ++ i) {
78 case '\\': case '*': case '+': case '?': case '|':
79 case '{': case '[': case '(': case ')': case '^':
80 case '$': case '.': case '#': case ' ':
84 case '\t': result += "\\t"; break;
85 case '\n': result += "\\n"; break;
86 case '\r': result += "\\r"; break;
87 case '\f': result += "\\f"; break;
89 default: result += c; break;
96 public static string Unescape (string str) {
97 return new Parser ().ParseString (str);
103 this.caps = new ArrayList ();
104 this.refs = new Hashtable ();
107 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
108 this.pattern = pattern;
116 RegularExpression re = new RegularExpression ();
117 ParseGroup (re, options, null);
118 ResolveReferences ();
120 re.GroupCount = num_groups;
124 catch (IndexOutOfRangeException) {
125 throw NewParseException ("Unexpected end of pattern.");
129 public IDictionary GetMapping () {
130 Hashtable mapping = new Hashtable ();
131 Hashtable numbers = new Hashtable ();
132 int end = caps.Count;
133 mapping.Add ("0", 0);
134 for (int i = 0; i < end; i++) {
135 CapturingGroup group = (CapturingGroup) caps [i];
136 if (group.Name != null && !mapping.Contains (group.Name)) {
137 mapping.Add (group.Name, group.Number);
138 numbers.Add (group.Number, group.Number);
142 for (int i = 1; i < end; i++) {
143 if (numbers [i] == null)
144 mapping.Add (i.ToString (), i);
152 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
153 bool is_top_level = group is RegularExpression;
155 Alternation alternation = null;
156 string literal = null;
158 Group current = new Group ();
159 Expression expr = null;
163 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
164 if (ptr >= pattern.Length)
167 // (1) Parse for Expressions
169 char ch = pattern[ptr ++];
174 IsMultiline (options) ? Position.StartOfLine : Position.Start;
175 expr = new PositionAssertion (pos);
181 IsMultiline (options) ? Position.EndOfLine : Position.End;
182 expr = new PositionAssertion (pos);
188 IsSingleline (options) ? Category.AnySingleline : Category.Any;
189 expr = new CharacterClass (cat, false);
194 int c = ParseEscape ();
198 expr = ParseSpecial (options);
201 ch = pattern[ptr ++]; // default escape
207 expr = ParseCharacterClass (options);
212 bool ignore = IsIgnoreCase (options);
213 expr = ParseGroupingConstruct (ref options);
215 if (literal != null && IsIgnoreCase (options) != ignore) {
216 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
231 if (literal != null) {
232 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
236 if (assertion != null) {
237 if (assertion.TrueExpression == null)
238 assertion.TrueExpression = current;
239 else if (assertion.FalseExpression == null)
240 assertion.FalseExpression = current;
242 throw NewParseException ("Too many | in (?()|).");
245 if (alternation == null)
246 alternation = new Alternation ();
248 alternation.AddAlternative (current);
251 current = new Group ();
255 case '*': case '+': case '?': {
256 throw NewParseException ("Bad quantifier.");
260 break; // literal character
263 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
265 // (2) Check for Repetitions
267 if (ptr < pattern.Length) {
268 char k = pattern[ptr];
270 if (k == '?' || k == '*' || k == '+' || k == '{') {
273 int min = 0, max = 0;
277 case '?': min = 0; max = 1; break;
278 case '*': min = 0; max = 0xffff; break;
279 case '+': min = 1; max = 0xffff; break;
280 case '{': ParseRepetitionBounds (out min, out max, options); break;
283 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
284 if (ptr < pattern.Length && pattern[ptr] == '?') {
289 Repetition repetition = new Repetition (min, max, lazy);
292 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
294 repetition.Expression = expr;
300 // (3) Append Expression and/or Literal
308 if (literal != null) {
309 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
313 current.AppendExpression (expr);
317 if (is_top_level && ptr >= pattern.Length)
322 if (is_top_level && closed)
323 throw NewParseException ("Too many )'s.");
324 if (!is_top_level && !closed)
325 throw NewParseException ("Not enough )'s.");
328 // clean up literals and alternations
331 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
333 if (assertion != null) {
334 if (assertion.TrueExpression == null)
335 assertion.TrueExpression = current;
337 assertion.FalseExpression = current;
339 group.AppendExpression (assertion);
341 else if (alternation != null) {
342 alternation.AddAlternative (current);
343 group.AppendExpression (alternation);
346 group.AppendExpression (current);
349 private Expression ParseGroupingConstruct (ref RegexOptions options) {
350 if (pattern[ptr] != '?') {
353 if (IsExplicitCapture (options))
354 group = new Group ();
356 group = new CapturingGroup ();
360 ParseGroup (group, options, null);
366 switch (pattern[ptr]) {
367 case ':': { // non-capturing group
369 Group group = new Group ();
370 ParseGroup (group, options, null);
375 case '>': { // non-backtracking group
377 Group group = new NonBacktrackingGroup ();
378 ParseGroup (group, options, null);
383 case 'i': case 'm': case 'n':
384 case 's': case 'x': case '-': { // options
385 RegexOptions o = options;
386 ParseOptions (ref o, false);
387 if (pattern[ptr] == '-') {
389 ParseOptions (ref o, true);
392 if (pattern[ptr] == ':') { // pass options to child group
394 Group group = new Group ();
395 ParseGroup (group, o, null);
398 else if (pattern[ptr] == ')') { // change options of enclosing group
404 throw NewParseException ("Bad options");
407 case '<': case '=': case '!': { // lookahead/lookbehind
408 ExpressionAssertion asn = new ExpressionAssertion ();
409 if (!ParseAssertionType (asn))
410 goto case '\''; // it's a (?<name> ) construct
412 Group test = new Group ();
413 ParseGroup (test, options, null);
415 asn.TestExpression = test;
419 case '\'': { // named/balancing group
421 if (pattern[ptr] == '<')
427 string name = ParseName ();
429 if (pattern[ptr] == delim) {
433 throw NewParseException ("Bad group name.");
436 CapturingGroup cap = new CapturingGroup ();
439 ParseGroup (cap, options, null);
443 else if (pattern[ptr] == '-') {
447 string balance_name = ParseName ();
448 if (balance_name == null || pattern[ptr] != delim)
449 throw NewParseException ("Bad balancing group name.");
452 BalancingGroup bal = new BalancingGroup ();
459 refs.Add (bal, balance_name);
461 ParseGroup (bal, options, null);
466 throw NewParseException ("Bad group name.");
469 case '(': { // expression/capture test
474 string name = ParseName ();
475 if (name == null || pattern[ptr] != ')') { // expression test
476 // FIXME MS implementation doesn't seem to
477 // implement this version of (?(x) ...)
480 ExpressionAssertion expr_asn = new ExpressionAssertion ();
482 if (pattern[ptr] == '?') {
484 if (!ParseAssertionType (expr_asn))
485 throw NewParseException ("Bad conditional.");
488 expr_asn.Negate = false;
489 expr_asn.Reverse = false;
492 Group test = new Group ();
493 ParseGroup (test, options, null);
494 expr_asn.TestExpression = test;
497 else { // capture test
499 asn = new CaptureAssertion ();
500 refs.Add (asn, name);
503 Group group = new Group ();
504 ParseGroup (group, options, asn);
508 case '#': { // comment
510 while (pattern[ptr ++] != ')') {
511 if (ptr >= pattern.Length)
512 throw NewParseException ("Unterminated (?#...) comment.");
518 throw NewParseException ("Bad grouping construct.");
522 private bool ParseAssertionType (ExpressionAssertion assertion) {
523 if (pattern[ptr] == '<') {
524 switch (pattern[ptr + 1]) {
526 assertion.Negate = false;
529 assertion.Negate = true;
535 assertion.Reverse = true;
539 switch (pattern[ptr]) {
541 assertion.Negate = false;
544 assertion.Negate = true;
550 assertion.Reverse = false;
557 private void ParseOptions (ref RegexOptions options, bool negate) {
559 switch (pattern[ptr]) {
562 options &= ~RegexOptions.IgnoreCase;
564 options |= RegexOptions.IgnoreCase;
569 options &= ~RegexOptions.Multiline;
571 options |= RegexOptions.Multiline;
576 options &= ~RegexOptions.ExplicitCapture;
578 options |= RegexOptions.ExplicitCapture;
583 options &= ~RegexOptions.Singleline;
585 options |= RegexOptions.Singleline;
590 options &= ~RegexOptions.IgnorePatternWhitespace;
592 options |= RegexOptions.IgnorePatternWhitespace;
603 private Expression ParseCharacterClass (RegexOptions options) {
605 if (pattern[ptr] == '^') {
612 ecma = IsECMAScript (options);
613 CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
615 if (pattern[ptr] == ']') {
616 cls.AddCharacter (']');
624 while (ptr < pattern.Length) {
640 // didn't recognize escape
644 case 'b': c = '\b'; break;
647 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
652 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
657 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
662 cls.AddCategory (ParseUnicodeCategory (), false); // ignore ecma
667 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
672 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
677 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
682 cls.AddCategory (ParseUnicodeCategory (), true);
686 default: break; // add escaped character
693 throw NewParseException ("[x-y] range in reverse order.");
696 cls.AddRange ((char)last, (char)c);
698 cls.AddCharacter ((char)c);
699 cls.AddCharacter ('-');
706 cls.AddCharacter ((char)c);
712 throw NewParseException ("Unterminated [] set.");
715 cls.AddCharacter ('-');
720 private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
725 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
727 if (pattern[ptr] == ',') {
730 n = ParseNumber (10, 1, 0);
731 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
734 switch (pattern[ptr ++]) {
739 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
740 m = ParseNumber (10, 1, 0);
741 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
742 if (pattern[ptr ++] != '}')
743 throw NewParseException ("Illegal {x,y} - bad value of y.");
746 throw NewParseException ("Illegal {x,y}");
749 /* check bounds and ordering */
751 if (n >= 0xffff || m >= 0xffff)
752 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
754 throw NewParseException ("Illegal {x, y} with x > y.");
756 /* assign min and max */
765 private Category ParseUnicodeCategory () {
766 if (pattern[ptr ++] != '{')
767 throw NewParseException ("Incomplete \\p{X} character escape.");
769 string name = ParseName (pattern, ref ptr);
771 throw NewParseException ("Incomplete \\p{X} character escape.");
773 Category cat = CategoryUtils.CategoryFromName (name);
774 if (cat == Category.None)
775 throw NewParseException ("Unknown property '" + name + "'.");
777 if (pattern[ptr ++] != '}')
778 throw NewParseException ("Incomplete \\p{X} character escape.");
783 private Expression ParseSpecial (RegexOptions options) {
785 bool ecma = IsECMAScript (options);
786 Expression expr = null;
788 switch (pattern[ptr ++]) {
793 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
797 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
801 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
805 // this is odd - ECMAScript isn't supposed to support Unicode,
806 // yet \p{..} compiles and runs under the MS implementation
807 // identically to canonical mode. That's why I'm ignoring the
808 // value of ecma here.
810 expr = new CharacterClass (ParseUnicodeCategory (), false);
814 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
818 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
822 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
826 expr = new CharacterClass (ParseUnicodeCategory (), true);
831 case 'A': expr = new PositionAssertion (Position.StartOfString); break;
832 case 'Z': expr = new PositionAssertion (Position.End); break;
833 case 'z': expr = new PositionAssertion (Position.EndOfString); break;
834 case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
835 case 'b': expr = new PositionAssertion (Position.Boundary); break;
836 case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
840 case '1': case '2': case '3': case '4': case '5':
841 case '6': case '7': case '8': case '9': {
843 int n = ParseNumber (10, 1, 0);
849 // FIXME test if number is within number of assigned groups
850 // this may present a problem for right-to-left matching
852 Reference reference = new Reference (IsIgnoreCase (options));
853 refs.Add (reference, n.ToString ());
859 char delim = pattern[ptr ++];
862 else if (delim != '\'')
863 throw NewParseException ("Malformed \\k<...> named backreference.");
865 string name = ParseName ();
866 if (name == null || pattern[ptr] != delim)
867 throw NewParseException ("Malformed \\k<...> named backreference.");
870 Reference reference = new Reference (IsIgnoreCase (options));
871 refs.Add (reference, name);
887 private int ParseEscape () {
891 if (p >= pattern.Length)
892 throw new ArgumentException (
893 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
894 "pattern.", pattern), pattern);
896 switch (pattern[ptr ++]) {
898 // standard escapes (except \b)
900 case 'a': return '\u0007';
901 case 't': return '\u0009';
902 case 'r': return '\u000d';
903 case 'v': return '\u000b';
904 case 'f': return '\u000c';
905 case 'n': return '\u000a';
906 case 'e': return '\u001b';
907 case '\\': return '\\';
913 int result = ParseOctal (pattern, ref ptr);
914 if (result == -1 && prevptr == ptr)
920 c = ParseHex (pattern, ref ptr, 2);
922 throw NewParseException ("Insufficient hex digits");
927 c = ParseHex (pattern, ref ptr, 4);
929 throw NewParseException ("Insufficient hex digits");
933 // control characters
937 if (c >= '@' && c <= '_')
940 throw NewParseException ("Unrecognized control character.");
950 private string ParseName () {
951 return Parser.ParseName (pattern, ref ptr);
954 private static bool IsNameChar (char c) {
955 UnicodeCategory cat = Char.GetUnicodeCategory (c);
956 if (cat == UnicodeCategory.ModifierLetter)
958 if (cat == UnicodeCategory.ConnectorPunctuation)
960 return Char.IsLetterOrDigit (c);
963 private int ParseNumber (int b, int min, int max) {
964 return Parser.ParseNumber (pattern, ref ptr, b, min, max);
967 private int ParseDecimal () {
968 return Parser.ParseDecimal (pattern, ref ptr);
971 private static int ParseDigit (char c, int b, int n) {
974 if (c >= '0' && c <= '7')
979 if (c >= '0' && c <= '9')
984 if (c >= '0' && c <= '9')
986 else if (c >= 'a' && c <= 'f')
988 else if (c >= 'A' && c <= 'F')
997 private void ConsumeWhitespace (bool ignore) {
999 if (ptr >= pattern.Length)
1002 if (pattern[ptr] == '(') {
1003 if (ptr + 3 >= pattern.Length)
1006 if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1010 while (pattern[ptr ++] != ')')
1013 else if (ignore && pattern[ptr] == '#') {
1014 while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1017 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1018 while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1026 private string ParseString (string pattern) {
1027 this.pattern = pattern;
1031 while (ptr < pattern.Length) {
1032 int c = pattern[ptr ++];
1037 c = pattern[ptr ++];
1048 private void ResolveReferences () {
1050 Hashtable dict = new Hashtable ();
1052 // number unnamed groups
1054 foreach (CapturingGroup group in caps) {
1055 if (group.Name == null) {
1056 dict.Add (gid.ToString (), group);
1057 group.Number = gid ++;
1063 // number named groups
1065 foreach (CapturingGroup group in caps) {
1066 if (group.Name != null) {
1067 if (!dict.Contains (group.Name)) {
1068 dict.Add (group.Name, group);
1069 group.Number = gid ++;
1074 CapturingGroup prev = (CapturingGroup)dict[group.Name];
1075 group.Number = prev.Number;
1080 // resolve references
1082 foreach (Expression expr in refs.Keys) {
1083 string name = (string)refs[expr];
1084 if (!dict.Contains (name)) {
1085 throw NewParseException ("Reference to undefined group " +
1086 (Char.IsDigit (name[0]) ? "number " : "name ") +
1090 CapturingGroup group = (CapturingGroup)dict[name];
1091 if (expr is Reference)
1092 ((Reference)expr).CapturingGroup = group;
1093 else if (expr is CaptureAssertion)
1094 ((CaptureAssertion)expr).CapturingGroup = group;
1095 else if (expr is BalancingGroup)
1096 ((BalancingGroup)expr).Balance = group;
1100 // flag helper functions
1102 private static bool IsIgnoreCase (RegexOptions options) {
1103 return (options & RegexOptions.IgnoreCase) != 0;
1106 private static bool IsMultiline (RegexOptions options) {
1107 return (options & RegexOptions.Multiline) != 0;
1110 private static bool IsExplicitCapture (RegexOptions options) {
1111 return (options & RegexOptions.ExplicitCapture) != 0;
1114 private static bool IsSingleline (RegexOptions options) {
1115 return (options & RegexOptions.Singleline) != 0;
1118 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1119 return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1122 private static bool IsRightToLeft (RegexOptions options) {
1123 return (options & RegexOptions.RightToLeft) != 0;
1126 private static bool IsECMAScript (RegexOptions options) {
1127 return (options & RegexOptions.ECMAScript) != 0;
1130 // exception creation
1132 private ArgumentException NewParseException (string msg) {
1133 msg = "parsing \"" + pattern + "\" - " + msg;
1134 return new ArgumentException (msg, pattern);
1137 private string pattern;
1140 private ArrayList caps;
1141 private Hashtable refs;
1142 private int num_groups;