3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
32 using System.Globalization;
34 namespace System.Text.RegularExpressions.Syntax {
37 public static int ParseDecimal (string str, ref int ptr) {
38 return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
41 public static int ParseOctal (string str, ref int ptr) {
42 return ParseNumber (str, ref ptr, 8, 1, 3);
45 public static int ParseHex (string str, ref int ptr, int digits) {
46 return ParseNumber (str, ref ptr, 16, digits, digits);
49 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50 int p = ptr, n = 0, digits = 0, d;
54 while (digits < max && p < str.Length) {
55 d = ParseDigit (str[p ++], b, digits);
72 public static string ParseName (string str, ref int ptr) {
73 if (Char.IsDigit (str[ptr])) {
74 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
76 return gid.ToString ();
83 if (!IsNameChar (str[ptr]))
89 return str.Substring (start, ptr - start);
94 public static string Escape (string str) {
96 for (int i = 0; i < str.Length; ++ i) {
99 case '\\': case '*': case '+': case '?': case '|':
100 case '{': case '[': case '(': case ')': case '^':
101 case '$': case '.': case '#': case ' ':
105 case '\t': result += "\\t"; break;
106 case '\n': result += "\\n"; break;
107 case '\r': result += "\\r"; break;
108 case '\f': result += "\\f"; break;
110 default: result += c; break;
117 public static string Unescape (string str) {
118 if (str.IndexOf ('\\') == -1)
120 return new Parser ().ParseString (str);
126 this.caps = new ArrayList ();
127 this.refs = new Hashtable ();
130 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
131 this.pattern = pattern;
139 RegularExpression re = new RegularExpression ();
140 ParseGroup (re, options, null);
141 ResolveReferences ();
143 re.GroupCount = num_groups;
147 catch (IndexOutOfRangeException) {
148 throw NewParseException ("Unexpected end of pattern.");
152 public IDictionary GetMapping ()
154 Hashtable mapping = new Hashtable ();
155 int end = caps.Count;
156 mapping.Add ("0", 0);
157 for (int i = 0; i < end; i++) {
158 CapturingGroup group = (CapturingGroup) caps [i];
159 if (group.Name != null) {
160 if (mapping.Contains (group.Name)) {
161 if ((int) mapping [group.Name] != group.Number)
162 throw new SystemException ("invalid state");
165 mapping.Add (group.Name, group.Number);
167 mapping.Add (group.Number.ToString (), group.Number);
176 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
177 bool is_top_level = group is RegularExpression;
179 Alternation alternation = null;
180 string literal = null;
182 Group current = new Group ();
183 Expression expr = null;
187 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
188 if (ptr >= pattern.Length)
191 // (1) Parse for Expressions
193 char ch = pattern[ptr ++];
198 IsMultiline (options) ? Position.StartOfLine : Position.Start;
199 expr = new PositionAssertion (pos);
205 IsMultiline (options) ? Position.EndOfLine : Position.End;
206 expr = new PositionAssertion (pos);
212 IsSingleline (options) ? Category.AnySingleline : Category.Any;
213 expr = new CharacterClass (cat, false);
218 int c = ParseEscape ();
222 expr = ParseSpecial (options);
225 ch = pattern[ptr ++]; // default escape
231 expr = ParseCharacterClass (options);
236 bool ignore = IsIgnoreCase (options);
237 expr = ParseGroupingConstruct (ref options);
239 if (literal != null && IsIgnoreCase (options) != ignore) {
240 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
255 if (literal != null) {
256 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
260 if (assertion != null) {
261 if (assertion.TrueExpression == null)
262 assertion.TrueExpression = current;
263 else if (assertion.FalseExpression == null)
264 assertion.FalseExpression = current;
266 throw NewParseException ("Too many | in (?()|).");
269 if (alternation == null)
270 alternation = new Alternation ();
272 alternation.AddAlternative (current);
275 current = new Group ();
279 case '*': case '+': case '?': {
280 throw NewParseException ("Bad quantifier.");
284 break; // literal character
287 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
289 // (2) Check for Repetitions
291 if (ptr < pattern.Length) {
292 char k = pattern[ptr];
293 int min = 0, max = 0;
295 bool haveRep = false;
298 if (k == '?' || k == '*' || k == '+') {
303 case '?': min = 0; max = 1; break;
304 case '*': min = 0; max = 0x7fffffff; break;
305 case '+': min = 1; max = 0x7fffffff; break;
307 } else if (k == '{' && ptr + 1 < pattern.Length) {
310 haveRep = ParseRepetitionBounds (out min, out max, options);
316 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
317 if (ptr < pattern.Length && pattern[ptr] == '?') {
322 Repetition repetition = new Repetition (min, max, lazy);
325 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
327 repetition.Expression = expr;
333 // (3) Append Expression and/or Literal
341 if (literal != null) {
342 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
346 current.AppendExpression (expr);
350 if (is_top_level && ptr >= pattern.Length)
355 if (is_top_level && closed)
356 throw NewParseException ("Too many )'s.");
357 if (!is_top_level && !closed)
358 throw NewParseException ("Not enough )'s.");
361 // clean up literals and alternations
364 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
366 if (assertion != null) {
367 if (assertion.TrueExpression == null)
368 assertion.TrueExpression = current;
370 assertion.FalseExpression = current;
372 group.AppendExpression (assertion);
374 else if (alternation != null) {
375 alternation.AddAlternative (current);
376 group.AppendExpression (alternation);
379 group.AppendExpression (current);
382 private Expression ParseGroupingConstruct (ref RegexOptions options) {
383 if (pattern[ptr] != '?') {
386 if (IsExplicitCapture (options))
387 group = new Group ();
389 group = new CapturingGroup ();
393 ParseGroup (group, options, null);
399 switch (pattern[ptr]) {
400 case ':': { // non-capturing group
402 Group group = new Group ();
403 ParseGroup (group, options, null);
408 case '>': { // non-backtracking group
410 Group group = new NonBacktrackingGroup ();
411 ParseGroup (group, options, null);
416 case 'i': case 'm': case 'n':
417 case 's': case 'x': case '-': { // options
418 RegexOptions o = options;
419 ParseOptions (ref o, false);
420 if (pattern[ptr] == '-') {
422 ParseOptions (ref o, true);
425 if (pattern[ptr] == ':') { // pass options to child group
427 Group group = new Group ();
428 ParseGroup (group, o, null);
431 else if (pattern[ptr] == ')') { // change options of enclosing group
437 throw NewParseException ("Bad options");
440 case '<': case '=': case '!': { // lookahead/lookbehind
441 ExpressionAssertion asn = new ExpressionAssertion ();
442 if (!ParseAssertionType (asn))
443 goto case '\''; // it's a (?<name> ) construct
445 Group test = new Group ();
446 ParseGroup (test, options, null);
448 asn.TestExpression = test;
452 case '\'': { // named/balancing group
454 if (pattern[ptr] == '<')
460 string name = ParseName ();
462 if (pattern[ptr] == delim) {
466 throw NewParseException ("Bad group name.");
469 CapturingGroup cap = new CapturingGroup ();
472 ParseGroup (cap, options, null);
476 else if (pattern[ptr] == '-') {
480 string balance_name = ParseName ();
481 if (balance_name == null || pattern[ptr] != delim)
482 throw NewParseException ("Bad balancing group name.");
485 BalancingGroup bal = new BalancingGroup ();
492 refs.Add (bal, balance_name);
494 ParseGroup (bal, options, null);
499 throw NewParseException ("Bad group name.");
502 case '(': { // expression/capture test
507 string name = ParseName ();
508 if (name == null || pattern[ptr] != ')') { // expression test
509 // FIXME MS implementation doesn't seem to
510 // implement this version of (?(x) ...)
513 ExpressionAssertion expr_asn = new ExpressionAssertion ();
515 if (pattern[ptr] == '?') {
517 if (!ParseAssertionType (expr_asn))
518 throw NewParseException ("Bad conditional.");
521 expr_asn.Negate = false;
522 expr_asn.Reverse = false;
525 Group test = new Group ();
526 ParseGroup (test, options, null);
527 expr_asn.TestExpression = test;
530 else { // capture test
532 asn = new CaptureAssertion ();
533 refs.Add (asn, name);
536 Group group = new Group ();
537 ParseGroup (group, options, asn);
541 case '#': { // comment
543 while (pattern[ptr ++] != ')') {
544 if (ptr >= pattern.Length)
545 throw NewParseException ("Unterminated (?#...) comment.");
551 throw NewParseException ("Bad grouping construct.");
555 private bool ParseAssertionType (ExpressionAssertion assertion) {
556 if (pattern[ptr] == '<') {
557 switch (pattern[ptr + 1]) {
559 assertion.Negate = false;
562 assertion.Negate = true;
568 assertion.Reverse = true;
572 switch (pattern[ptr]) {
574 assertion.Negate = false;
577 assertion.Negate = true;
583 assertion.Reverse = false;
590 private void ParseOptions (ref RegexOptions options, bool negate) {
592 switch (pattern[ptr]) {
595 options &= ~RegexOptions.IgnoreCase;
597 options |= RegexOptions.IgnoreCase;
602 options &= ~RegexOptions.Multiline;
604 options |= RegexOptions.Multiline;
609 options &= ~RegexOptions.ExplicitCapture;
611 options |= RegexOptions.ExplicitCapture;
616 options &= ~RegexOptions.Singleline;
618 options |= RegexOptions.Singleline;
623 options &= ~RegexOptions.IgnorePatternWhitespace;
625 options |= RegexOptions.IgnorePatternWhitespace;
636 private Expression ParseCharacterClass (RegexOptions options) {
638 if (pattern[ptr] == '^') {
643 bool ecma = IsECMAScript (options);
644 CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
646 if (pattern[ptr] == ']') {
647 cls.AddCharacter (']');
655 while (ptr < pattern.Length) {
663 if (c == '-' && last >= 0 && !range) {
671 goto char_recognized;
673 // didn't recognize escape
674 c = pattern [ptr ++];
678 goto char_recognized;
681 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
685 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
689 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
693 cls.AddCategory (ParseUnicodeCategory (), c == 'P'); // ignore ecma
696 default: // add escaped character
697 goto char_recognized;
700 // if the pattern looks like [a-\s] ...
702 throw NewParseException ("character range cannot have category \\" + c);
710 // if 'range' is true, we know that 'last >= 0'
712 throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
713 cls.AddRange ((char)last, (char)c);
719 cls.AddCharacter ((char)c);
724 throw NewParseException ("Unterminated [] set.");
727 cls.AddCharacter ('-');
732 private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
738 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
740 if (pattern[ptr] == ',') {
743 n = ParseNumber (10, 1, 0);
744 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
747 switch (pattern[ptr ++]) {
752 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
753 m = ParseNumber (10, 1, 0);
754 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
755 if (pattern[ptr ++] != '}')
762 /* check bounds and ordering */
764 if (n >= 0xffff || m >= 0xffff)
765 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
767 throw NewParseException ("Illegal {x, y} with x > y.");
769 /* assign min and max */
780 private Category ParseUnicodeCategory () {
781 if (pattern[ptr ++] != '{')
782 throw NewParseException ("Incomplete \\p{X} character escape.");
784 string name = ParseName (pattern, ref ptr);
786 throw NewParseException ("Incomplete \\p{X} character escape.");
788 Category cat = CategoryUtils.CategoryFromName (name);
789 if (cat == Category.None)
790 throw NewParseException ("Unknown property '" + name + "'.");
792 if (pattern[ptr ++] != '}')
793 throw NewParseException ("Incomplete \\p{X} character escape.");
798 private Expression ParseSpecial (RegexOptions options) {
800 bool ecma = IsECMAScript (options);
801 Expression expr = null;
803 switch (pattern[ptr ++]) {
808 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
812 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
816 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
820 // this is odd - ECMAScript isn't supposed to support Unicode,
821 // yet \p{..} compiles and runs under the MS implementation
822 // identically to canonical mode. That's why I'm ignoring the
823 // value of ecma here.
825 expr = new CharacterClass (ParseUnicodeCategory (), false);
829 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
833 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
837 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
841 expr = new CharacterClass (ParseUnicodeCategory (), true);
846 case 'A': expr = new PositionAssertion (Position.StartOfString); break;
847 case 'Z': expr = new PositionAssertion (Position.End); break;
848 case 'z': expr = new PositionAssertion (Position.EndOfString); break;
849 case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
850 case 'b': expr = new PositionAssertion (Position.Boundary); break;
851 case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
855 case '1': case '2': case '3': case '4': case '5':
856 case '6': case '7': case '8': case '9': {
858 int n = ParseNumber (10, 1, 0);
864 // FIXME test if number is within number of assigned groups
865 // this may present a problem for right-to-left matching
867 Reference reference = new Reference (IsIgnoreCase (options));
868 refs.Add (reference, n.ToString ());
874 char delim = pattern[ptr ++];
877 else if (delim != '\'')
878 throw NewParseException ("Malformed \\k<...> named backreference.");
880 string name = ParseName ();
881 if (name == null || pattern[ptr] != delim)
882 throw NewParseException ("Malformed \\k<...> named backreference.");
885 Reference reference = new Reference (IsIgnoreCase (options));
886 refs.Add (reference, name);
902 private int ParseEscape () {
906 if (p >= pattern.Length)
907 throw new ArgumentException (
908 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
909 "pattern.", pattern), pattern);
911 switch (pattern[ptr ++]) {
913 // standard escapes (except \b)
915 case 'a': return '\u0007';
916 case 't': return '\u0009';
917 case 'r': return '\u000d';
918 case 'v': return '\u000b';
919 case 'f': return '\u000c';
920 case 'n': return '\u000a';
921 case 'e': return '\u001b';
922 case '\\': return '\\';
928 // Turns out that octal values can be specified
929 // without a leading zero. But also the limit
930 // of three character should include this first
935 int result = ParseOctal (pattern, ref ptr);
936 if (result == -1 && prevptr == ptr)
942 c = ParseHex (pattern, ref ptr, 2);
944 throw NewParseException ("Insufficient hex digits");
949 c = ParseHex (pattern, ref ptr, 4);
951 throw NewParseException ("Insufficient hex digits");
955 // control characters
959 if (c >= '@' && c <= '_')
962 throw NewParseException ("Unrecognized control character.");
972 private string ParseName () {
973 return Parser.ParseName (pattern, ref ptr);
976 private static bool IsNameChar (char c) {
977 UnicodeCategory cat = Char.GetUnicodeCategory (c);
978 if (cat == UnicodeCategory.ModifierLetter)
980 if (cat == UnicodeCategory.ConnectorPunctuation)
982 return Char.IsLetterOrDigit (c);
985 private int ParseNumber (int b, int min, int max) {
986 return Parser.ParseNumber (pattern, ref ptr, b, min, max);
989 private static int ParseDigit (char c, int b, int n) {
992 if (c >= '0' && c <= '7')
997 if (c >= '0' && c <= '9')
1002 if (c >= '0' && c <= '9')
1004 else if (c >= 'a' && c <= 'f')
1005 return 10 + c - 'a';
1006 else if (c >= 'A' && c <= 'F')
1007 return 10 + c - 'A';
1015 private void ConsumeWhitespace (bool ignore) {
1016 while (ptr < pattern.Length) {
1017 if (pattern[ptr] == '(') {
1018 if (ptr + 3 >= pattern.Length)
1021 if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1025 while (ptr < pattern.Length && pattern[ptr ++] != ')')
1028 else if (ignore && pattern[ptr] == '#') {
1029 while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1032 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1033 while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1041 private string ParseString (string pattern) {
1042 this.pattern = pattern;
1045 StringBuilder result = new StringBuilder (pattern.Length);
1046 while (ptr < pattern.Length) {
1047 int c = pattern[ptr ++];
1052 c = pattern[ptr ++];
1057 result.Append ((char) c);
1060 return result.ToString ();
1063 private void ResolveReferences () {
1065 Hashtable dict = new Hashtable ();
1067 // number unnamed groups
1069 foreach (CapturingGroup group in caps) {
1070 if (group.Name == null) {
1071 dict.Add (gid.ToString (), group);
1072 group.Number = gid ++;
1078 // number named groups
1080 foreach (CapturingGroup group in caps) {
1081 if (group.Name != null) {
1082 if (!dict.Contains (group.Name)) {
1083 dict.Add (group.Name, group);
1084 group.Number = gid ++;
1089 CapturingGroup prev = (CapturingGroup)dict[group.Name];
1090 group.Number = prev.Number;
1095 // resolve references
1097 foreach (Expression expr in refs.Keys) {
1098 string name = (string)refs[expr];
1099 if (!dict.Contains (name)) {
1100 throw NewParseException ("Reference to undefined group " +
1101 (Char.IsDigit (name[0]) ? "number " : "name ") +
1105 CapturingGroup group = (CapturingGroup)dict[name];
1106 if (expr is Reference)
1107 ((Reference)expr).CapturingGroup = group;
1108 else if (expr is CaptureAssertion)
1109 ((CaptureAssertion)expr).CapturingGroup = group;
1110 else if (expr is BalancingGroup)
1111 ((BalancingGroup)expr).Balance = group;
1115 // flag helper functions
1117 private static bool IsIgnoreCase (RegexOptions options) {
1118 return (options & RegexOptions.IgnoreCase) != 0;
1121 private static bool IsMultiline (RegexOptions options) {
1122 return (options & RegexOptions.Multiline) != 0;
1125 private static bool IsExplicitCapture (RegexOptions options) {
1126 return (options & RegexOptions.ExplicitCapture) != 0;
1129 private static bool IsSingleline (RegexOptions options) {
1130 return (options & RegexOptions.Singleline) != 0;
1133 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1134 return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1137 private static bool IsECMAScript (RegexOptions options) {
1138 return (options & RegexOptions.ECMAScript) != 0;
1141 // exception creation
1143 private ArgumentException NewParseException (string msg) {
1144 msg = "parsing \"" + pattern + "\" - " + msg;
1145 return new ArgumentException (msg, pattern);
1148 private string pattern;
1151 private ArrayList caps;
1152 private Hashtable refs;
1153 private int num_groups;