3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
32 using System.Globalization;
34 namespace System.Text.RegularExpressions.Syntax {
37 public static int ParseDecimal (string str, ref int ptr) {
38 return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
41 public static int ParseOctal (string str, ref int ptr) {
42 return ParseNumber (str, ref ptr, 8, 1, 3);
45 public static int ParseHex (string str, ref int ptr, int digits) {
46 return ParseNumber (str, ref ptr, 16, digits, digits);
49 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50 int p = ptr, n = 0, digits = 0, d;
54 while (digits < max && p < str.Length) {
55 d = ParseDigit (str[p ++], b, digits);
72 public static string ParseName (string str, ref int ptr) {
73 if (Char.IsDigit (str[ptr])) {
74 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
76 return gid.ToString ();
83 if (!IsNameChar (str[ptr]))
89 return str.Substring (start, ptr - start);
94 public static string Escape (string str) {
96 for (int i = 0; i < str.Length; ++ i) {
99 case '\\': case '*': case '+': case '?': case '|':
100 case '{': case '[': case '(': case ')': case '^':
101 case '$': case '.': case '#': case ' ':
105 case '\t': result += "\\t"; break;
106 case '\n': result += "\\n"; break;
107 case '\r': result += "\\r"; break;
108 case '\f': result += "\\f"; break;
110 default: result += c; break;
117 public static string Unescape (string str) {
118 if (str.IndexOf ('\\') == -1)
120 return new Parser ().ParseString (str);
126 this.caps = new ArrayList ();
127 this.refs = new Hashtable ();
130 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
131 this.pattern = pattern;
139 RegularExpression re = new RegularExpression ();
140 ParseGroup (re, options, null);
141 ResolveReferences ();
143 re.GroupCount = num_groups;
147 catch (IndexOutOfRangeException) {
148 throw NewParseException ("Unexpected end of pattern.");
152 public IDictionary GetMapping () {
153 Hashtable mapping = new Hashtable ();
154 Hashtable numbers = new Hashtable ();
155 int end = caps.Count;
156 mapping.Add ("0", 0);
157 for (int i = 0; i < end; i++) {
158 CapturingGroup group = (CapturingGroup) caps [i];
159 if (group.Name != null && !mapping.Contains (group.Name)) {
160 mapping.Add (group.Name, group.Number);
161 numbers.Add (group.Number, group.Number);
165 for (int i = 1; i < end; i++) {
166 if (numbers [i] == null)
167 mapping.Add (i.ToString (), i);
175 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
176 bool is_top_level = group is RegularExpression;
178 Alternation alternation = null;
179 string literal = null;
181 Group current = new Group ();
182 Expression expr = null;
186 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
187 if (ptr >= pattern.Length)
190 // (1) Parse for Expressions
192 char ch = pattern[ptr ++];
197 IsMultiline (options) ? Position.StartOfLine : Position.Start;
198 expr = new PositionAssertion (pos);
204 IsMultiline (options) ? Position.EndOfLine : Position.End;
205 expr = new PositionAssertion (pos);
211 IsSingleline (options) ? Category.AnySingleline : Category.Any;
212 expr = new CharacterClass (cat, false);
217 int c = ParseEscape ();
221 expr = ParseSpecial (options);
224 ch = pattern[ptr ++]; // default escape
230 expr = ParseCharacterClass (options);
235 bool ignore = IsIgnoreCase (options);
236 expr = ParseGroupingConstruct (ref options);
238 if (literal != null && IsIgnoreCase (options) != ignore) {
239 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
254 if (literal != null) {
255 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
259 if (assertion != null) {
260 if (assertion.TrueExpression == null)
261 assertion.TrueExpression = current;
262 else if (assertion.FalseExpression == null)
263 assertion.FalseExpression = current;
265 throw NewParseException ("Too many | in (?()|).");
268 if (alternation == null)
269 alternation = new Alternation ();
271 alternation.AddAlternative (current);
274 current = new Group ();
278 case '*': case '+': case '?': {
279 throw NewParseException ("Bad quantifier.");
283 break; // literal character
286 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
288 // (2) Check for Repetitions
290 if (ptr < pattern.Length) {
291 char k = pattern[ptr];
292 int min = 0, max = 0;
294 bool haveRep = false;
297 if (k == '?' || k == '*' || k == '+') {
302 case '?': min = 0; max = 1; break;
303 case '*': min = 0; max = 0x7fffffff; break;
304 case '+': min = 1; max = 0x7fffffff; break;
306 } else if (k == '{' && ptr + 1 < pattern.Length) {
309 haveRep = ParseRepetitionBounds (out min, out max, options);
315 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
316 if (ptr < pattern.Length && pattern[ptr] == '?') {
321 Repetition repetition = new Repetition (min, max, lazy);
324 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
326 repetition.Expression = expr;
332 // (3) Append Expression and/or Literal
340 if (literal != null) {
341 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
345 current.AppendExpression (expr);
349 if (is_top_level && ptr >= pattern.Length)
354 if (is_top_level && closed)
355 throw NewParseException ("Too many )'s.");
356 if (!is_top_level && !closed)
357 throw NewParseException ("Not enough )'s.");
360 // clean up literals and alternations
363 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
365 if (assertion != null) {
366 if (assertion.TrueExpression == null)
367 assertion.TrueExpression = current;
369 assertion.FalseExpression = current;
371 group.AppendExpression (assertion);
373 else if (alternation != null) {
374 alternation.AddAlternative (current);
375 group.AppendExpression (alternation);
378 group.AppendExpression (current);
381 private Expression ParseGroupingConstruct (ref RegexOptions options) {
382 if (pattern[ptr] != '?') {
385 if (IsExplicitCapture (options))
386 group = new Group ();
388 group = new CapturingGroup ();
392 ParseGroup (group, options, null);
398 switch (pattern[ptr]) {
399 case ':': { // non-capturing group
401 Group group = new Group ();
402 ParseGroup (group, options, null);
407 case '>': { // non-backtracking group
409 Group group = new NonBacktrackingGroup ();
410 ParseGroup (group, options, null);
415 case 'i': case 'm': case 'n':
416 case 's': case 'x': case '-': { // options
417 RegexOptions o = options;
418 ParseOptions (ref o, false);
419 if (pattern[ptr] == '-') {
421 ParseOptions (ref o, true);
424 if (pattern[ptr] == ':') { // pass options to child group
426 Group group = new Group ();
427 ParseGroup (group, o, null);
430 else if (pattern[ptr] == ')') { // change options of enclosing group
436 throw NewParseException ("Bad options");
439 case '<': case '=': case '!': { // lookahead/lookbehind
440 ExpressionAssertion asn = new ExpressionAssertion ();
441 if (!ParseAssertionType (asn))
442 goto case '\''; // it's a (?<name> ) construct
444 Group test = new Group ();
445 ParseGroup (test, options, null);
447 asn.TestExpression = test;
451 case '\'': { // named/balancing group
453 if (pattern[ptr] == '<')
459 string name = ParseName ();
461 if (pattern[ptr] == delim) {
465 throw NewParseException ("Bad group name.");
468 CapturingGroup cap = new CapturingGroup ();
471 ParseGroup (cap, options, null);
475 else if (pattern[ptr] == '-') {
479 string balance_name = ParseName ();
480 if (balance_name == null || pattern[ptr] != delim)
481 throw NewParseException ("Bad balancing group name.");
484 BalancingGroup bal = new BalancingGroup ();
491 refs.Add (bal, balance_name);
493 ParseGroup (bal, options, null);
498 throw NewParseException ("Bad group name.");
501 case '(': { // expression/capture test
506 string name = ParseName ();
507 if (name == null || pattern[ptr] != ')') { // expression test
508 // FIXME MS implementation doesn't seem to
509 // implement this version of (?(x) ...)
512 ExpressionAssertion expr_asn = new ExpressionAssertion ();
514 if (pattern[ptr] == '?') {
516 if (!ParseAssertionType (expr_asn))
517 throw NewParseException ("Bad conditional.");
520 expr_asn.Negate = false;
521 expr_asn.Reverse = false;
524 Group test = new Group ();
525 ParseGroup (test, options, null);
526 expr_asn.TestExpression = test;
529 else { // capture test
531 asn = new CaptureAssertion ();
532 refs.Add (asn, name);
535 Group group = new Group ();
536 ParseGroup (group, options, asn);
540 case '#': { // comment
542 while (pattern[ptr ++] != ')') {
543 if (ptr >= pattern.Length)
544 throw NewParseException ("Unterminated (?#...) comment.");
550 throw NewParseException ("Bad grouping construct.");
554 private bool ParseAssertionType (ExpressionAssertion assertion) {
555 if (pattern[ptr] == '<') {
556 switch (pattern[ptr + 1]) {
558 assertion.Negate = false;
561 assertion.Negate = true;
567 assertion.Reverse = true;
571 switch (pattern[ptr]) {
573 assertion.Negate = false;
576 assertion.Negate = true;
582 assertion.Reverse = false;
589 private void ParseOptions (ref RegexOptions options, bool negate) {
591 switch (pattern[ptr]) {
594 options &= ~RegexOptions.IgnoreCase;
596 options |= RegexOptions.IgnoreCase;
601 options &= ~RegexOptions.Multiline;
603 options |= RegexOptions.Multiline;
608 options &= ~RegexOptions.ExplicitCapture;
610 options |= RegexOptions.ExplicitCapture;
615 options &= ~RegexOptions.Singleline;
617 options |= RegexOptions.Singleline;
622 options &= ~RegexOptions.IgnorePatternWhitespace;
624 options |= RegexOptions.IgnorePatternWhitespace;
635 private Expression ParseCharacterClass (RegexOptions options) {
637 if (pattern[ptr] == '^') {
642 bool ecma = IsECMAScript (options);
643 CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
645 if (pattern[ptr] == ']') {
646 cls.AddCharacter (']');
654 while (ptr < pattern.Length) {
662 if (c == '-' && last >= 0 && !range) {
670 goto char_recognized;
672 // didn't recognize escape
673 c = pattern [ptr ++];
677 goto char_recognized;
680 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
684 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
688 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
692 cls.AddCategory (ParseUnicodeCategory (), c == 'P'); // ignore ecma
695 default: // add escaped character
696 goto char_recognized;
699 // if the pattern looks like [a-\s] ...
701 throw NewParseException ("character range cannot have category \\" + c);
709 // if 'range' is true, we know that 'last >= 0'
711 throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
712 cls.AddRange ((char)last, (char)c);
718 cls.AddCharacter ((char)c);
723 throw NewParseException ("Unterminated [] set.");
726 cls.AddCharacter ('-');
731 private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
737 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
739 if (pattern[ptr] == ',') {
742 n = ParseNumber (10, 1, 0);
743 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
746 switch (pattern[ptr ++]) {
751 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
752 m = ParseNumber (10, 1, 0);
753 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
754 if (pattern[ptr ++] != '}')
761 /* check bounds and ordering */
763 if (n >= 0xffff || m >= 0xffff)
764 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
766 throw NewParseException ("Illegal {x, y} with x > y.");
768 /* assign min and max */
779 private Category ParseUnicodeCategory () {
780 if (pattern[ptr ++] != '{')
781 throw NewParseException ("Incomplete \\p{X} character escape.");
783 string name = ParseName (pattern, ref ptr);
785 throw NewParseException ("Incomplete \\p{X} character escape.");
787 Category cat = CategoryUtils.CategoryFromName (name);
788 if (cat == Category.None)
789 throw NewParseException ("Unknown property '" + name + "'.");
791 if (pattern[ptr ++] != '}')
792 throw NewParseException ("Incomplete \\p{X} character escape.");
797 private Expression ParseSpecial (RegexOptions options) {
799 bool ecma = IsECMAScript (options);
800 Expression expr = null;
802 switch (pattern[ptr ++]) {
807 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
811 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
815 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
819 // this is odd - ECMAScript isn't supposed to support Unicode,
820 // yet \p{..} compiles and runs under the MS implementation
821 // identically to canonical mode. That's why I'm ignoring the
822 // value of ecma here.
824 expr = new CharacterClass (ParseUnicodeCategory (), false);
828 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
832 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
836 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
840 expr = new CharacterClass (ParseUnicodeCategory (), true);
845 case 'A': expr = new PositionAssertion (Position.StartOfString); break;
846 case 'Z': expr = new PositionAssertion (Position.End); break;
847 case 'z': expr = new PositionAssertion (Position.EndOfString); break;
848 case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
849 case 'b': expr = new PositionAssertion (Position.Boundary); break;
850 case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
854 case '1': case '2': case '3': case '4': case '5':
855 case '6': case '7': case '8': case '9': {
857 int n = ParseNumber (10, 1, 0);
863 // FIXME test if number is within number of assigned groups
864 // this may present a problem for right-to-left matching
866 Reference reference = new Reference (IsIgnoreCase (options));
867 refs.Add (reference, n.ToString ());
873 char delim = pattern[ptr ++];
876 else if (delim != '\'')
877 throw NewParseException ("Malformed \\k<...> named backreference.");
879 string name = ParseName ();
880 if (name == null || pattern[ptr] != delim)
881 throw NewParseException ("Malformed \\k<...> named backreference.");
884 Reference reference = new Reference (IsIgnoreCase (options));
885 refs.Add (reference, name);
901 private int ParseEscape () {
905 if (p >= pattern.Length)
906 throw new ArgumentException (
907 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
908 "pattern.", pattern), pattern);
910 switch (pattern[ptr ++]) {
912 // standard escapes (except \b)
914 case 'a': return '\u0007';
915 case 't': return '\u0009';
916 case 'r': return '\u000d';
917 case 'v': return '\u000b';
918 case 'f': return '\u000c';
919 case 'n': return '\u000a';
920 case 'e': return '\u001b';
921 case '\\': return '\\';
927 // Turns out that octal values can be specified
928 // without a leading zero. But also the limit
929 // of three character should include this first
934 int result = ParseOctal (pattern, ref ptr);
935 if (result == -1 && prevptr == ptr)
941 c = ParseHex (pattern, ref ptr, 2);
943 throw NewParseException ("Insufficient hex digits");
948 c = ParseHex (pattern, ref ptr, 4);
950 throw NewParseException ("Insufficient hex digits");
954 // control characters
958 if (c >= '@' && c <= '_')
961 throw NewParseException ("Unrecognized control character.");
971 private string ParseName () {
972 return Parser.ParseName (pattern, ref ptr);
975 private static bool IsNameChar (char c) {
976 UnicodeCategory cat = Char.GetUnicodeCategory (c);
977 if (cat == UnicodeCategory.ModifierLetter)
979 if (cat == UnicodeCategory.ConnectorPunctuation)
981 return Char.IsLetterOrDigit (c);
984 private int ParseNumber (int b, int min, int max) {
985 return Parser.ParseNumber (pattern, ref ptr, b, min, max);
988 private static int ParseDigit (char c, int b, int n) {
991 if (c >= '0' && c <= '7')
996 if (c >= '0' && c <= '9')
1001 if (c >= '0' && c <= '9')
1003 else if (c >= 'a' && c <= 'f')
1004 return 10 + c - 'a';
1005 else if (c >= 'A' && c <= 'F')
1006 return 10 + c - 'A';
1014 private void ConsumeWhitespace (bool ignore) {
1015 while (ptr < pattern.Length) {
1016 if (pattern[ptr] == '(') {
1017 if (ptr + 3 >= pattern.Length)
1020 if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1024 while (ptr < pattern.Length && pattern[ptr ++] != ')')
1027 else if (ignore && pattern[ptr] == '#') {
1028 while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1031 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1032 while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1040 private string ParseString (string pattern) {
1041 this.pattern = pattern;
1044 StringBuilder result = new StringBuilder (pattern.Length);
1045 while (ptr < pattern.Length) {
1046 int c = pattern[ptr ++];
1051 c = pattern[ptr ++];
1056 result.Append ((char) c);
1059 return result.ToString ();
1062 private void ResolveReferences () {
1064 Hashtable dict = new Hashtable ();
1066 // number unnamed groups
1068 foreach (CapturingGroup group in caps) {
1069 if (group.Name == null) {
1070 dict.Add (gid.ToString (), group);
1071 group.Number = gid ++;
1077 // number named groups
1079 foreach (CapturingGroup group in caps) {
1080 if (group.Name != null) {
1081 if (!dict.Contains (group.Name)) {
1082 dict.Add (group.Name, group);
1083 group.Number = gid ++;
1088 CapturingGroup prev = (CapturingGroup)dict[group.Name];
1089 group.Number = prev.Number;
1094 // resolve references
1096 foreach (Expression expr in refs.Keys) {
1097 string name = (string)refs[expr];
1098 if (!dict.Contains (name)) {
1099 throw NewParseException ("Reference to undefined group " +
1100 (Char.IsDigit (name[0]) ? "number " : "name ") +
1104 CapturingGroup group = (CapturingGroup)dict[name];
1105 if (expr is Reference)
1106 ((Reference)expr).CapturingGroup = group;
1107 else if (expr is CaptureAssertion)
1108 ((CaptureAssertion)expr).CapturingGroup = group;
1109 else if (expr is BalancingGroup)
1110 ((BalancingGroup)expr).Balance = group;
1114 // flag helper functions
1116 private static bool IsIgnoreCase (RegexOptions options) {
1117 return (options & RegexOptions.IgnoreCase) != 0;
1120 private static bool IsMultiline (RegexOptions options) {
1121 return (options & RegexOptions.Multiline) != 0;
1124 private static bool IsExplicitCapture (RegexOptions options) {
1125 return (options & RegexOptions.ExplicitCapture) != 0;
1128 private static bool IsSingleline (RegexOptions options) {
1129 return (options & RegexOptions.Singleline) != 0;
1132 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1133 return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1136 private static bool IsECMAScript (RegexOptions options) {
1137 return (options & RegexOptions.ECMAScript) != 0;
1140 // exception creation
1142 private ArgumentException NewParseException (string msg) {
1143 msg = "parsing \"" + pattern + "\" - " + msg;
1144 return new ArgumentException (msg, pattern);
1147 private string pattern;
1150 private ArrayList caps;
1151 private Hashtable refs;
1152 private int num_groups;