3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
32 using System.Globalization;
34 namespace System.Text.RegularExpressions.Syntax {
37 public static int ParseDecimal (string str, ref int ptr) {
38 return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
41 public static int ParseOctal (string str, ref int ptr) {
42 return ParseNumber (str, ref ptr, 8, 1, 3);
45 public static int ParseHex (string str, ref int ptr, int digits) {
46 return ParseNumber (str, ref ptr, 16, digits, digits);
49 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50 int p = ptr, n = 0, digits = 0, d;
54 while (digits < max && p < str.Length) {
55 d = ParseDigit (str[p ++], b, digits);
72 public static string ParseName (string str, ref int ptr) {
73 if (Char.IsDigit (str[ptr])) {
74 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
76 return gid.ToString ();
83 if (!IsNameChar (str[ptr]))
89 return str.Substring (start, ptr - start);
94 public static string Escape (string str) {
96 for (int i = 0; i < str.Length; ++ i) {
99 case '\\': case '*': case '+': case '?': case '|':
100 case '{': case '[': case '(': case ')': case '^':
101 case '$': case '.': case '#': case ' ':
105 case '\t': result += "\\t"; break;
106 case '\n': result += "\\n"; break;
107 case '\r': result += "\\r"; break;
108 case '\f': result += "\\f"; break;
110 default: result += c; break;
117 public static string Unescape (string str) {
118 return new Parser ().ParseString (str);
124 this.caps = new ArrayList ();
125 this.refs = new Hashtable ();
128 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
129 this.pattern = pattern;
137 RegularExpression re = new RegularExpression ();
138 ParseGroup (re, options, null);
139 ResolveReferences ();
141 re.GroupCount = num_groups;
145 catch (IndexOutOfRangeException) {
146 throw NewParseException ("Unexpected end of pattern.");
150 public IDictionary GetMapping () {
151 Hashtable mapping = new Hashtable ();
152 Hashtable numbers = new Hashtable ();
153 int end = caps.Count;
154 mapping.Add ("0", 0);
155 for (int i = 0; i < end; i++) {
156 CapturingGroup group = (CapturingGroup) caps [i];
157 if (group.Name != null && !mapping.Contains (group.Name)) {
158 mapping.Add (group.Name, group.Number);
159 numbers.Add (group.Number, group.Number);
163 for (int i = 1; i < end; i++) {
164 if (numbers [i] == null)
165 mapping.Add (i.ToString (), i);
173 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
174 bool is_top_level = group is RegularExpression;
176 Alternation alternation = null;
177 string literal = null;
179 Group current = new Group ();
180 Expression expr = null;
184 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
185 if (ptr >= pattern.Length)
188 // (1) Parse for Expressions
190 char ch = pattern[ptr ++];
195 IsMultiline (options) ? Position.StartOfLine : Position.Start;
196 expr = new PositionAssertion (pos);
202 IsMultiline (options) ? Position.EndOfLine : Position.End;
203 expr = new PositionAssertion (pos);
209 IsSingleline (options) ? Category.AnySingleline : Category.Any;
210 expr = new CharacterClass (cat, false);
215 int c = ParseEscape ();
219 expr = ParseSpecial (options);
222 ch = pattern[ptr ++]; // default escape
228 expr = ParseCharacterClass (options);
233 bool ignore = IsIgnoreCase (options);
234 expr = ParseGroupingConstruct (ref options);
236 if (literal != null && IsIgnoreCase (options) != ignore) {
237 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
252 if (literal != null) {
253 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
257 if (assertion != null) {
258 if (assertion.TrueExpression == null)
259 assertion.TrueExpression = current;
260 else if (assertion.FalseExpression == null)
261 assertion.FalseExpression = current;
263 throw NewParseException ("Too many | in (?()|).");
266 if (alternation == null)
267 alternation = new Alternation ();
269 alternation.AddAlternative (current);
272 current = new Group ();
276 case '*': case '+': case '?': {
277 throw NewParseException ("Bad quantifier.");
281 break; // literal character
284 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
286 // (2) Check for Repetitions
288 if (ptr < pattern.Length) {
289 char k = pattern[ptr];
291 if (k == '?' || k == '*' || k == '+' || k == '{') {
294 int min = 0, max = 0;
298 case '?': min = 0; max = 1; break;
299 case '*': min = 0; max = 0xffff; break;
300 case '+': min = 1; max = 0xffff; break;
301 case '{': ParseRepetitionBounds (out min, out max, options); break;
304 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
305 if (ptr < pattern.Length && pattern[ptr] == '?') {
310 Repetition repetition = new Repetition (min, max, lazy);
313 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
315 repetition.Expression = expr;
321 // (3) Append Expression and/or Literal
329 if (literal != null) {
330 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
334 current.AppendExpression (expr);
338 if (is_top_level && ptr >= pattern.Length)
343 if (is_top_level && closed)
344 throw NewParseException ("Too many )'s.");
345 if (!is_top_level && !closed)
346 throw NewParseException ("Not enough )'s.");
349 // clean up literals and alternations
352 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
354 if (assertion != null) {
355 if (assertion.TrueExpression == null)
356 assertion.TrueExpression = current;
358 assertion.FalseExpression = current;
360 group.AppendExpression (assertion);
362 else if (alternation != null) {
363 alternation.AddAlternative (current);
364 group.AppendExpression (alternation);
367 group.AppendExpression (current);
370 private Expression ParseGroupingConstruct (ref RegexOptions options) {
371 if (pattern[ptr] != '?') {
374 if (IsExplicitCapture (options))
375 group = new Group ();
377 group = new CapturingGroup ();
381 ParseGroup (group, options, null);
387 switch (pattern[ptr]) {
388 case ':': { // non-capturing group
390 Group group = new Group ();
391 ParseGroup (group, options, null);
396 case '>': { // non-backtracking group
398 Group group = new NonBacktrackingGroup ();
399 ParseGroup (group, options, null);
404 case 'i': case 'm': case 'n':
405 case 's': case 'x': case '-': { // options
406 RegexOptions o = options;
407 ParseOptions (ref o, false);
408 if (pattern[ptr] == '-') {
410 ParseOptions (ref o, true);
413 if (pattern[ptr] == ':') { // pass options to child group
415 Group group = new Group ();
416 ParseGroup (group, o, null);
419 else if (pattern[ptr] == ')') { // change options of enclosing group
425 throw NewParseException ("Bad options");
428 case '<': case '=': case '!': { // lookahead/lookbehind
429 ExpressionAssertion asn = new ExpressionAssertion ();
430 if (!ParseAssertionType (asn))
431 goto case '\''; // it's a (?<name> ) construct
433 Group test = new Group ();
434 ParseGroup (test, options, null);
436 asn.TestExpression = test;
440 case '\'': { // named/balancing group
442 if (pattern[ptr] == '<')
448 string name = ParseName ();
450 if (pattern[ptr] == delim) {
454 throw NewParseException ("Bad group name.");
457 CapturingGroup cap = new CapturingGroup ();
460 ParseGroup (cap, options, null);
464 else if (pattern[ptr] == '-') {
468 string balance_name = ParseName ();
469 if (balance_name == null || pattern[ptr] != delim)
470 throw NewParseException ("Bad balancing group name.");
473 BalancingGroup bal = new BalancingGroup ();
480 refs.Add (bal, balance_name);
482 ParseGroup (bal, options, null);
487 throw NewParseException ("Bad group name.");
490 case '(': { // expression/capture test
495 string name = ParseName ();
496 if (name == null || pattern[ptr] != ')') { // expression test
497 // FIXME MS implementation doesn't seem to
498 // implement this version of (?(x) ...)
501 ExpressionAssertion expr_asn = new ExpressionAssertion ();
503 if (pattern[ptr] == '?') {
505 if (!ParseAssertionType (expr_asn))
506 throw NewParseException ("Bad conditional.");
509 expr_asn.Negate = false;
510 expr_asn.Reverse = false;
513 Group test = new Group ();
514 ParseGroup (test, options, null);
515 expr_asn.TestExpression = test;
518 else { // capture test
520 asn = new CaptureAssertion ();
521 refs.Add (asn, name);
524 Group group = new Group ();
525 ParseGroup (group, options, asn);
529 case '#': { // comment
531 while (pattern[ptr ++] != ')') {
532 if (ptr >= pattern.Length)
533 throw NewParseException ("Unterminated (?#...) comment.");
539 throw NewParseException ("Bad grouping construct.");
543 private bool ParseAssertionType (ExpressionAssertion assertion) {
544 if (pattern[ptr] == '<') {
545 switch (pattern[ptr + 1]) {
547 assertion.Negate = false;
550 assertion.Negate = true;
556 assertion.Reverse = true;
560 switch (pattern[ptr]) {
562 assertion.Negate = false;
565 assertion.Negate = true;
571 assertion.Reverse = false;
578 private void ParseOptions (ref RegexOptions options, bool negate) {
580 switch (pattern[ptr]) {
583 options &= ~RegexOptions.IgnoreCase;
585 options |= RegexOptions.IgnoreCase;
590 options &= ~RegexOptions.Multiline;
592 options |= RegexOptions.Multiline;
597 options &= ~RegexOptions.ExplicitCapture;
599 options |= RegexOptions.ExplicitCapture;
604 options &= ~RegexOptions.Singleline;
606 options |= RegexOptions.Singleline;
611 options &= ~RegexOptions.IgnorePatternWhitespace;
613 options |= RegexOptions.IgnorePatternWhitespace;
624 private Expression ParseCharacterClass (RegexOptions options) {
626 if (pattern[ptr] == '^') {
633 ecma = IsECMAScript (options);
634 CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
636 if (pattern[ptr] == ']') {
637 cls.AddCharacter (']');
645 while (ptr < pattern.Length) {
661 // didn't recognize escape
665 case 'b': c = '\b'; break;
668 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
673 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
678 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
683 cls.AddCategory (ParseUnicodeCategory (), false); // ignore ecma
688 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
693 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
698 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
703 cls.AddCategory (ParseUnicodeCategory (), true);
707 default: break; // add escaped character
714 throw NewParseException ("[x-y] range in reverse order.");
717 cls.AddRange ((char)last, (char)c);
719 cls.AddCharacter ((char)c);
720 cls.AddCharacter ('-');
727 cls.AddCharacter ((char)c);
733 throw NewParseException ("Unterminated [] set.");
736 cls.AddCharacter ('-');
741 private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
746 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
748 if (pattern[ptr] == ',') {
751 n = ParseNumber (10, 1, 0);
752 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
755 switch (pattern[ptr ++]) {
760 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
761 m = ParseNumber (10, 1, 0);
762 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
763 if (pattern[ptr ++] != '}')
764 throw NewParseException ("Illegal {x,y} - bad value of y.");
767 throw NewParseException ("Illegal {x,y}");
770 /* check bounds and ordering */
772 if (n >= 0xffff || m >= 0xffff)
773 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
775 throw NewParseException ("Illegal {x, y} with x > y.");
777 /* assign min and max */
786 private Category ParseUnicodeCategory () {
787 if (pattern[ptr ++] != '{')
788 throw NewParseException ("Incomplete \\p{X} character escape.");
790 string name = ParseName (pattern, ref ptr);
792 throw NewParseException ("Incomplete \\p{X} character escape.");
794 Category cat = CategoryUtils.CategoryFromName (name);
795 if (cat == Category.None)
796 throw NewParseException ("Unknown property '" + name + "'.");
798 if (pattern[ptr ++] != '}')
799 throw NewParseException ("Incomplete \\p{X} character escape.");
804 private Expression ParseSpecial (RegexOptions options) {
806 bool ecma = IsECMAScript (options);
807 Expression expr = null;
809 switch (pattern[ptr ++]) {
814 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
818 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
822 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
826 // this is odd - ECMAScript isn't supposed to support Unicode,
827 // yet \p{..} compiles and runs under the MS implementation
828 // identically to canonical mode. That's why I'm ignoring the
829 // value of ecma here.
831 expr = new CharacterClass (ParseUnicodeCategory (), false);
835 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
839 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
843 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
847 expr = new CharacterClass (ParseUnicodeCategory (), true);
852 case 'A': expr = new PositionAssertion (Position.StartOfString); break;
853 case 'Z': expr = new PositionAssertion (Position.End); break;
854 case 'z': expr = new PositionAssertion (Position.EndOfString); break;
855 case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
856 case 'b': expr = new PositionAssertion (Position.Boundary); break;
857 case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
861 case '1': case '2': case '3': case '4': case '5':
862 case '6': case '7': case '8': case '9': {
864 int n = ParseNumber (10, 1, 0);
870 // FIXME test if number is within number of assigned groups
871 // this may present a problem for right-to-left matching
873 Reference reference = new Reference (IsIgnoreCase (options));
874 refs.Add (reference, n.ToString ());
880 char delim = pattern[ptr ++];
883 else if (delim != '\'')
884 throw NewParseException ("Malformed \\k<...> named backreference.");
886 string name = ParseName ();
887 if (name == null || pattern[ptr] != delim)
888 throw NewParseException ("Malformed \\k<...> named backreference.");
891 Reference reference = new Reference (IsIgnoreCase (options));
892 refs.Add (reference, name);
908 private int ParseEscape () {
912 if (p >= pattern.Length)
913 throw new ArgumentException (
914 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
915 "pattern.", pattern), pattern);
917 switch (pattern[ptr ++]) {
919 // standard escapes (except \b)
921 case 'a': return '\u0007';
922 case 't': return '\u0009';
923 case 'r': return '\u000d';
924 case 'v': return '\u000b';
925 case 'f': return '\u000c';
926 case 'n': return '\u000a';
927 case 'e': return '\u001b';
928 case '\\': return '\\';
934 // Turns out that octal values can be specified
935 // without a leading zero. But also the limit
936 // of three character should include this first
941 int result = ParseOctal (pattern, ref ptr);
942 if (result == -1 && prevptr == ptr)
948 c = ParseHex (pattern, ref ptr, 2);
950 throw NewParseException ("Insufficient hex digits");
955 c = ParseHex (pattern, ref ptr, 4);
957 throw NewParseException ("Insufficient hex digits");
961 // control characters
965 if (c >= '@' && c <= '_')
968 throw NewParseException ("Unrecognized control character.");
978 private string ParseName () {
979 return Parser.ParseName (pattern, ref ptr);
982 private static bool IsNameChar (char c) {
983 UnicodeCategory cat = Char.GetUnicodeCategory (c);
984 if (cat == UnicodeCategory.ModifierLetter)
986 if (cat == UnicodeCategory.ConnectorPunctuation)
988 return Char.IsLetterOrDigit (c);
991 private int ParseNumber (int b, int min, int max) {
992 return Parser.ParseNumber (pattern, ref ptr, b, min, max);
995 private int ParseDecimal () {
996 return Parser.ParseDecimal (pattern, ref ptr);
999 private static int ParseDigit (char c, int b, int n) {
1002 if (c >= '0' && c <= '7')
1007 if (c >= '0' && c <= '9')
1012 if (c >= '0' && c <= '9')
1014 else if (c >= 'a' && c <= 'f')
1015 return 10 + c - 'a';
1016 else if (c >= 'A' && c <= 'F')
1017 return 10 + c - 'A';
1025 private void ConsumeWhitespace (bool ignore) {
1027 if (ptr >= pattern.Length)
1030 if (pattern[ptr] == '(') {
1031 if (ptr + 3 >= pattern.Length)
1034 if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1038 while (pattern[ptr ++] != ')')
1041 else if (ignore && pattern[ptr] == '#') {
1042 while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1045 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1046 while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1054 private string ParseString (string pattern) {
1055 this.pattern = pattern;
1058 StringBuilder result = new StringBuilder (pattern.Length);
1059 while (ptr < pattern.Length) {
1060 int c = pattern[ptr ++];
1065 c = pattern[ptr ++];
1070 result.Append ((char) c);
1073 return result.ToString ();
1076 private void ResolveReferences () {
1078 Hashtable dict = new Hashtable ();
1080 // number unnamed groups
1082 foreach (CapturingGroup group in caps) {
1083 if (group.Name == null) {
1084 dict.Add (gid.ToString (), group);
1085 group.Number = gid ++;
1091 // number named groups
1093 foreach (CapturingGroup group in caps) {
1094 if (group.Name != null) {
1095 if (!dict.Contains (group.Name)) {
1096 dict.Add (group.Name, group);
1097 group.Number = gid ++;
1102 CapturingGroup prev = (CapturingGroup)dict[group.Name];
1103 group.Number = prev.Number;
1108 // resolve references
1110 foreach (Expression expr in refs.Keys) {
1111 string name = (string)refs[expr];
1112 if (!dict.Contains (name)) {
1113 throw NewParseException ("Reference to undefined group " +
1114 (Char.IsDigit (name[0]) ? "number " : "name ") +
1118 CapturingGroup group = (CapturingGroup)dict[name];
1119 if (expr is Reference)
1120 ((Reference)expr).CapturingGroup = group;
1121 else if (expr is CaptureAssertion)
1122 ((CaptureAssertion)expr).CapturingGroup = group;
1123 else if (expr is BalancingGroup)
1124 ((BalancingGroup)expr).Balance = group;
1128 // flag helper functions
1130 private static bool IsIgnoreCase (RegexOptions options) {
1131 return (options & RegexOptions.IgnoreCase) != 0;
1134 private static bool IsMultiline (RegexOptions options) {
1135 return (options & RegexOptions.Multiline) != 0;
1138 private static bool IsExplicitCapture (RegexOptions options) {
1139 return (options & RegexOptions.ExplicitCapture) != 0;
1142 private static bool IsSingleline (RegexOptions options) {
1143 return (options & RegexOptions.Singleline) != 0;
1146 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1147 return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1150 private static bool IsRightToLeft (RegexOptions options) {
1151 return (options & RegexOptions.RightToLeft) != 0;
1154 private static bool IsECMAScript (RegexOptions options) {
1155 return (options & RegexOptions.ECMAScript) != 0;
1158 // exception creation
1160 private ArgumentException NewParseException (string msg) {
1161 msg = "parsing \"" + pattern + "\" - " + msg;
1162 return new ArgumentException (msg, pattern);
1165 private string pattern;
1168 private ArrayList caps;
1169 private Hashtable refs;
1170 private int num_groups;