3 // namespace: System.Text.RegularExpressions
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Collections;
\r
11 using System.Globalization;
\r
13 namespace System.Text.RegularExpressions.Syntax {
\r
16 public static int ParseDecimal (string str, ref int ptr) {
\r
17 return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
\r
20 public static int ParseOctal (string str, ref int ptr) {
\r
21 return ParseNumber (str, ref ptr, 8, 1, 3);
\r
24 public static int ParseHex (string str, ref int ptr, int digits) {
\r
25 return ParseNumber (str, ref ptr, 16, digits, digits);
\r
28 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
\r
29 int p = ptr, n = 0, digits = 0, d;
\r
31 max = Int32.MaxValue;
\r
33 while (digits < max && p < str.Length) {
\r
34 d = ParseDigit (str[p ++], b, digits);
\r
51 public static string ParseName (string str, ref int ptr) {
\r
52 if (Char.IsDigit (str[ptr])) {
\r
53 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
\r
55 return gid.ToString ();
\r
62 if (!IsNameChar (str[ptr]))
\r
67 if (ptr - start > 0)
\r
68 return str.Substring (start, ptr - start);
\r
73 public static string Escape (string str) {
\r
75 for (int i = 0; i < str.Length; ++ i) {
\r
78 case '\\': case '*': case '+': case '?': case '|':
\r
79 case '{': case '[': case '(': case ')': case '^':
\r
80 case '$': case '.': case '#': case ' ':
\r
84 case '\t': result += "\\t"; break;
\r
85 case '\n': result += "\\n"; break;
\r
86 case '\r': result += "\\r"; break;
\r
87 case '\f': result += "\\f"; break;
\r
89 default: result += c; break;
\r
96 public static string Unescape (string str) {
\r
97 return new Parser ().ParseString (str);
\r
103 this.caps = new ArrayList ();
\r
104 this.refs = new Hashtable ();
\r
107 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
\r
108 this.pattern = pattern;
\r
113 this.num_groups = 0;
\r
116 RegularExpression re = new RegularExpression ();
\r
117 ParseGroup (re, options, null);
\r
118 ResolveReferences ();
\r
120 re.GroupCount = num_groups;
\r
124 catch (IndexOutOfRangeException) {
\r
125 throw NewParseException ("Unexpected end of pattern.");
\r
129 public IDictionary GetMapping () {
\r
130 Hashtable mapping = new Hashtable ();
\r
131 int end = caps.Count;
\r
132 mapping.Add ("0", 0);
\r
133 for (int i = 0; i < end;) {
\r
134 CapturingGroup group = (CapturingGroup) caps [i];
\r
136 if (group.Name != null && !mapping.Contains (group.Name))
\r
137 mapping.Add (group.Name, group.Number);
\r
139 mapping.Add (group.Number.ToString (), group.Number);
\r
147 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
\r
148 bool is_top_level = group is RegularExpression;
\r
150 Alternation alternation = null;
\r
151 string literal = null;
\r
153 Group current = new Group ();
\r
154 Expression expr = null;
\r
155 bool closed = false;
\r
158 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
\r
159 if (ptr >= pattern.Length)
\r
162 // (1) Parse for Expressions
\r
164 char ch = pattern[ptr ++];
\r
169 IsMultiline (options) ? Position.StartOfLine : Position.Start;
\r
170 expr = new PositionAssertion (pos);
\r
176 IsMultiline (options) ? Position.EndOfLine : Position.End;
\r
177 expr = new PositionAssertion (pos);
\r
183 IsSingleline (options) ? Category.AnySingleline : Category.Any;
\r
184 expr = new CharacterClass (cat, false);
\r
189 int c = ParseEscape ();
\r
193 expr = ParseSpecial (options);
\r
196 ch = pattern[ptr ++]; // default escape
\r
202 expr = ParseCharacterClass (options);
\r
207 bool ignore = IsIgnoreCase (options);
\r
208 expr = ParseGroupingConstruct (ref options);
\r
209 if (expr == null) {
\r
210 if (literal != null && IsIgnoreCase (options) != ignore) {
\r
211 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
\r
226 if (literal != null) {
\r
227 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
\r
231 if (assertion != null) {
\r
232 if (assertion.TrueExpression == null)
\r
233 assertion.TrueExpression = current;
\r
234 else if (assertion.FalseExpression == null)
\r
235 assertion.FalseExpression = current;
\r
237 throw NewParseException ("Too many | in (?()|).");
\r
240 if (alternation == null)
\r
241 alternation = new Alternation ();
\r
243 alternation.AddAlternative (current);
\r
246 current = new Group ();
\r
250 case '*': case '+': case '?': {
\r
251 throw NewParseException ("Bad quantifier.");
\r
255 break; // literal character
\r
258 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
\r
260 // (2) Check for Repetitions
\r
262 if (ptr < pattern.Length) {
\r
263 char k = pattern[ptr];
\r
265 if (k == '?' || k == '*' || k == '+' || k == '{') {
\r
268 int min = 0, max = 0;
\r
272 case '?': min = 0; max = 1; break;
\r
273 case '*': min = 0; max = 0xffff; break;
\r
274 case '+': min = 1; max = 0xffff; break;
\r
275 case '{': ParseRepetitionBounds (out min, out max, options); break;
\r
278 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
\r
279 if (ptr < pattern.Length && pattern[ptr] == '?') {
\r
284 Repetition repetition = new Repetition (min, max, lazy);
\r
287 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
\r
289 repetition.Expression = expr;
\r
295 // (3) Append Expression and/or Literal
\r
297 if (expr == null) {
\r
298 if (literal == null)
\r
303 if (literal != null) {
\r
304 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
\r
308 current.AppendExpression (expr);
\r
312 if (is_top_level && ptr >= pattern.Length)
\r
317 if (is_top_level && closed)
\r
318 throw NewParseException ("Too many )'s.");
\r
319 if (!is_top_level && !closed)
\r
320 throw NewParseException ("Not enough )'s.");
\r
323 // clean up literals and alternations
\r
325 if (literal != null)
\r
326 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
\r
328 if (assertion != null) {
\r
329 if (assertion.TrueExpression == null)
\r
330 assertion.TrueExpression = current;
\r
332 assertion.FalseExpression = current;
\r
334 group.AppendExpression (assertion);
\r
336 else if (alternation != null) {
\r
337 alternation.AddAlternative (current);
\r
338 group.AppendExpression (alternation);
\r
341 group.AppendExpression (current);
\r
344 private Expression ParseGroupingConstruct (ref RegexOptions options) {
\r
345 if (pattern[ptr] != '?') {
\r
348 if (IsExplicitCapture (options))
\r
349 group = new Group ();
\r
351 group = new CapturingGroup ();
\r
355 ParseGroup (group, options, null);
\r
361 switch (pattern[ptr]) {
\r
362 case ':': { // non-capturing group
\r
364 Group group = new Group ();
\r
365 ParseGroup (group, options, null);
\r
370 case '>': { // non-backtracking group
\r
372 Group group = new NonBacktrackingGroup ();
\r
373 ParseGroup (group, options, null);
\r
378 case 'i': case 'm': case 'n':
\r
379 case 's': case 'x': case '-': { // options
\r
380 RegexOptions o = options;
\r
381 ParseOptions (ref o, false);
\r
382 if (pattern[ptr] == '-') {
\r
384 ParseOptions (ref o, true);
\r
387 if (pattern[ptr] == ':') { // pass options to child group
\r
389 Group group = new Group ();
\r
390 ParseGroup (group, o, null);
\r
393 else if (pattern[ptr] == ')') { // change options of enclosing group
\r
399 throw NewParseException ("Bad options");
\r
402 case '<': case '=': case '!': { // lookahead/lookbehind
\r
403 ExpressionAssertion asn = new ExpressionAssertion ();
\r
404 if (!ParseAssertionType (asn))
\r
405 goto case '\''; // it's a (?<name> ) construct
\r
407 Group test = new Group ();
\r
408 ParseGroup (test, options, null);
\r
410 asn.TestExpression = test;
\r
414 case '\'': { // named/balancing group
\r
416 if (pattern[ptr] == '<')
\r
422 string name = ParseName ();
\r
424 if (pattern[ptr] == delim) {
\r
428 throw NewParseException ("Bad group name.");
\r
431 CapturingGroup cap = new CapturingGroup ();
\r
434 ParseGroup (cap, options, null);
\r
438 else if (pattern[ptr] == '-') {
\r
442 string balance_name = ParseName ();
\r
443 if (balance_name == null || pattern[ptr] != delim)
\r
444 throw NewParseException ("Bad balancing group name.");
\r
447 BalancingGroup bal = new BalancingGroup ();
\r
450 refs.Add (bal, balance_name);
\r
455 throw NewParseException ("Bad group name.");
\r
458 case '(': { // expression/capture test
\r
463 string name = ParseName ();
\r
464 if (name == null || pattern[ptr] != ')') { // expression test
\r
465 // FIXME MS implementation doesn't seem to
\r
466 // implement this version of (?(x) ...)
\r
469 ExpressionAssertion expr_asn = new ExpressionAssertion ();
\r
471 if (pattern[ptr] == '?') {
\r
473 if (!ParseAssertionType (expr_asn))
\r
474 throw NewParseException ("Bad conditional.");
\r
477 expr_asn.Negate = false;
\r
478 expr_asn.Reverse = false;
\r
481 Group test = new Group ();
\r
482 ParseGroup (test, options, null);
\r
483 expr_asn.TestExpression = test;
\r
486 else { // capture test
\r
488 asn = new CaptureAssertion ();
\r
489 refs.Add (asn, name);
\r
492 Group group = new Group ();
\r
493 ParseGroup (group, options, asn);
\r
497 case '#': { // comment
\r
499 while (pattern[ptr ++] != ')') {
\r
500 if (ptr >= pattern.Length)
\r
501 throw NewParseException ("Unterminated (?#...) comment.");
\r
507 throw NewParseException ("Bad grouping construct.");
\r
511 private bool ParseAssertionType (ExpressionAssertion assertion) {
\r
512 if (pattern[ptr] == '<') {
\r
513 switch (pattern[ptr + 1]) {
\r
515 assertion.Negate = false;
\r
518 assertion.Negate = true;
\r
524 assertion.Reverse = true;
\r
528 switch (pattern[ptr]) {
\r
530 assertion.Negate = false;
\r
533 assertion.Negate = true;
\r
539 assertion.Reverse = false;
\r
546 private void ParseOptions (ref RegexOptions options, bool negate) {
\r
548 switch (pattern[ptr]) {
\r
551 options &= ~RegexOptions.IgnoreCase;
\r
553 options |= RegexOptions.IgnoreCase;
\r
558 options &= ~RegexOptions.Multiline;
\r
560 options |= RegexOptions.Multiline;
\r
565 options &= ~RegexOptions.ExplicitCapture;
\r
567 options |= RegexOptions.ExplicitCapture;
\r
572 options &= ~RegexOptions.Singleline;
\r
574 options |= RegexOptions.Singleline;
\r
579 options &= ~RegexOptions.IgnorePatternWhitespace;
\r
581 options |= RegexOptions.IgnorePatternWhitespace;
\r
592 private Expression ParseCharacterClass (RegexOptions options) {
\r
594 if (pattern[ptr] == '^') {
\r
601 ecma = IsECMAScript (options);
\r
602 CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
\r
604 if (pattern[ptr] == ']') {
\r
605 cls.AddCharacter (']');
\r
611 bool range = false;
\r
612 bool closed = false;
\r
613 while (ptr < pattern.Length) {
\r
614 c = pattern[ptr ++];
\r
627 c = ParseEscape ();
\r
629 // didn't recognize escape
\r
631 c = pattern[ptr ++];
\r
633 case 'b': c = '\b'; break;
\r
636 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
\r
641 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
\r
646 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
\r
651 cls.AddCategory (ParseUnicodeCategory (), false); // ignore ecma
\r
656 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
\r
661 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
\r
666 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
\r
671 cls.AddCategory (ParseUnicodeCategory (), true);
\r
675 default: break; // add escaped character
\r
682 throw NewParseException ("[x-y] range in reverse order.");
\r
685 cls.AddRange ((char)last, (char)c);
\r
687 cls.AddCharacter ((char)c);
\r
688 cls.AddCharacter ('-');
\r
695 cls.AddCharacter ((char)c);
\r
701 throw NewParseException ("Unterminated [] set.");
\r
704 cls.AddCharacter ('-');
\r
709 private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
\r
714 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
\r
715 n = ParseNumber (10, 1, 0);
\r
717 throw NewParseException ("Illegal {x,y} - bad value of x.");
\r
719 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
\r
720 switch (pattern[ptr ++]) {
\r
725 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
\r
726 m = ParseNumber (10, 1, 0);
\r
727 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
\r
728 if (pattern[ptr ++] != '}')
\r
729 throw NewParseException ("Illegal {x,y} - bad value of y.");
\r
732 throw NewParseException ("Illegal {x,y}");
\r
735 /* check bounds and ordering */
\r
737 if (n >= 0xffff || m >= 0xffff)
\r
738 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
\r
739 if (m >= 0 && m < n)
\r
740 throw NewParseException ("Illegal {x, y} with x > y.");
\r
742 /* assign min and max */
\r
751 private Category ParseUnicodeCategory () {
\r
752 if (pattern[ptr ++] != '{')
\r
753 throw NewParseException ("Incomplete \\p{X} character escape.");
\r
755 string name = ParseName (pattern, ref ptr);
\r
757 throw NewParseException ("Incomplete \\p{X} character escape.");
\r
759 Category cat = CategoryUtils.CategoryFromName (name);
\r
760 if (cat == Category.None)
\r
761 throw NewParseException ("Unknown property '" + name + "'.");
\r
763 if (pattern[ptr ++] != '}')
\r
764 throw NewParseException ("Incomplete \\p{X} character escape.");
\r
769 private Expression ParseSpecial (RegexOptions options) {
\r
771 bool ecma = IsECMAScript (options);
\r
772 Expression expr = null;
\r
774 switch (pattern[ptr ++]) {
\r
779 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
\r
783 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
\r
787 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
\r
791 // this is odd - ECMAScript isn't supposed to support Unicode,
\r
792 // yet \p{..} compiles and runs under the MS implementation
\r
793 // identically to canonical mode. That's why I'm ignoring the
\r
794 // value of ecma here.
\r
796 expr = new CharacterClass (ParseUnicodeCategory (), false);
\r
800 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
\r
804 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
\r
808 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
\r
812 expr = new CharacterClass (ParseUnicodeCategory (), true);
\r
817 case 'A': expr = new PositionAssertion (Position.StartOfString); break;
\r
818 case 'Z': expr = new PositionAssertion (Position.End); break;
\r
819 case 'z': expr = new PositionAssertion (Position.EndOfString); break;
\r
820 case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
\r
821 case 'b': expr = new PositionAssertion (Position.Boundary); break;
\r
822 case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
\r
826 case '1': case '2': case '3': case '4': case '5':
\r
827 case '6': case '7': case '8': case '9': {
\r
829 int n = ParseNumber (10, 1, 0);
\r
835 // FIXME test if number is within number of assigned groups
\r
836 // this may present a problem for right-to-left matching
\r
838 Reference reference = new Reference (IsIgnoreCase (options));
\r
839 refs.Add (reference, n.ToString ());
\r
845 char delim = pattern[ptr ++];
\r
848 else if (delim != '\'')
\r
849 throw NewParseException ("Malformed \\k<...> named backreference.");
\r
851 string name = ParseName ();
\r
852 if (name == null || pattern[ptr] != delim)
\r
853 throw NewParseException ("Malformed \\k<...> named backreference.");
\r
856 Reference reference = new Reference (IsIgnoreCase (options));
\r
857 refs.Add (reference, name);
\r
873 private int ParseEscape () {
\r
877 if (p >= pattern.Length)
\r
878 throw new ArgumentException (
\r
879 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
\r
880 "pattern.", pattern), pattern);
\r
882 switch (pattern[ptr ++]) {
\r
884 // standard escapes (except \b)
\r
886 case 'a': return '\u0007';
\r
887 case 't': return '\u0009';
\r
888 case 'r': return '\u000d';
\r
889 case 'v': return '\u000b';
\r
890 case 'f': return '\u000c';
\r
891 case 'n': return '\u000a';
\r
892 case 'e': return '\u001b';
\r
893 case '\\': return '\\';
\r
899 int result = ParseOctal (pattern, ref ptr);
\r
900 if (result == -1 && prevptr == ptr)
\r
906 c = ParseHex (pattern, ref ptr, 2);
\r
908 throw NewParseException ("Insufficient hex digits");
\r
913 c = ParseHex (pattern, ref ptr, 4);
\r
915 throw NewParseException ("Insufficient hex digits");
\r
919 // control characters
\r
923 if (c >= 'A' && c <= 'Z')
\r
925 else if (c >= '@' && c <= '_')
\r
928 throw NewParseException ("Unrecognized control character.");
\r
938 private string ParseName () {
\r
939 return Parser.ParseName (pattern, ref ptr);
\r
942 private static bool IsNameChar (char c) {
\r
943 UnicodeCategory cat = Char.GetUnicodeCategory (c);
\r
944 if (cat == UnicodeCategory.ModifierLetter)
\r
946 if (cat == UnicodeCategory.ConnectorPunctuation)
\r
948 return Char.IsLetterOrDigit (c);
\r
951 private int ParseNumber (int b, int min, int max) {
\r
952 return Parser.ParseNumber (pattern, ref ptr, b, min, max);
\r
955 private int ParseDecimal () {
\r
956 return Parser.ParseDecimal (pattern, ref ptr);
\r
959 private static int ParseDigit (char c, int b, int n) {
\r
962 if (c >= '0' && c <= '7')
\r
967 if (c >= '0' && c <= '9')
\r
972 if (c >= '0' && c <= '9')
\r
974 else if (c >= 'a' && c <= 'f')
\r
975 return 10 + c - 'a';
\r
976 else if (c >= 'A' && c <= 'F')
\r
977 return 10 + c - 'A';
\r
985 private void ConsumeWhitespace (bool ignore) {
\r
987 if (ptr >= pattern.Length)
\r
990 if (pattern[ptr] == '(') {
\r
991 if (ptr + 3 >= pattern.Length)
\r
994 if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
\r
998 while (pattern[ptr ++] != ')')
\r
1001 else if (ignore && pattern[ptr] == '#') {
\r
1002 while (ptr < pattern.Length && pattern[ptr ++] != '\n')
\r
1005 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
\r
1006 while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
\r
1014 private string ParseString (string pattern) {
\r
1015 this.pattern = pattern;
\r
1018 string result = "";
\r
1019 while (ptr < pattern.Length) {
\r
1020 int c = pattern[ptr];
\r
1022 c = ParseEscape ();
\r
1024 result += (char)c;
\r
1030 private void ResolveReferences () {
\r
1032 Hashtable dict = new Hashtable ();
\r
1034 // number unnamed groups
\r
1036 foreach (CapturingGroup group in caps) {
\r
1037 if (group.Name == null) {
\r
1038 dict.Add (gid.ToString (), group);
\r
1039 group.Number = gid ++;
\r
1045 // number named groups
\r
1047 foreach (CapturingGroup group in caps) {
\r
1048 if (group.Name != null) {
\r
1049 if (!dict.Contains (group.Name)) {
\r
1050 dict.Add (group.Name, group);
\r
1051 group.Number = gid ++;
\r
1056 CapturingGroup prev = (CapturingGroup)dict[group.Name];
\r
1057 group.Number = prev.Number;
\r
1062 // resolve references
\r
1064 foreach (Expression expr in refs.Keys) {
\r
1065 string name = (string)refs[expr];
\r
1066 if (!dict.Contains (name)) {
\r
1067 throw NewParseException ("Reference to undefined group " +
\r
1068 (Char.IsDigit (name[0]) ? "number " : "name ") +
\r
1072 CapturingGroup group = (CapturingGroup)dict[name];
\r
1073 if (expr is Reference)
\r
1074 ((Reference)expr).CapturingGroup = group;
\r
1075 else if (expr is CaptureAssertion)
\r
1076 ((CaptureAssertion)expr).CapturingGroup = group;
\r
1077 else if (expr is BalancingGroup)
\r
1078 ((BalancingGroup)expr).Balance = group;
\r
1082 // flag helper functions
\r
1084 private static bool IsIgnoreCase (RegexOptions options) {
\r
1085 return (options & RegexOptions.IgnoreCase) != 0;
\r
1088 private static bool IsMultiline (RegexOptions options) {
\r
1089 return (options & RegexOptions.Multiline) != 0;
\r
1092 private static bool IsExplicitCapture (RegexOptions options) {
\r
1093 return (options & RegexOptions.ExplicitCapture) != 0;
\r
1096 private static bool IsSingleline (RegexOptions options) {
\r
1097 return (options & RegexOptions.Singleline) != 0;
\r
1100 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
\r
1101 return (options & RegexOptions.IgnorePatternWhitespace) != 0;
\r
1104 private static bool IsRightToLeft (RegexOptions options) {
\r
1105 return (options & RegexOptions.RightToLeft) != 0;
\r
1108 private static bool IsECMAScript (RegexOptions options) {
\r
1109 return (options & RegexOptions.ECMAScript) != 0;
\r
1112 // exception creation
\r
1114 private ArgumentException NewParseException (string msg) {
\r
1115 msg = "parsing \"" + pattern + "\" - " + msg;
\r
1116 return new ArgumentException (msg, pattern);
\r
1119 private string pattern;
\r
1122 private ArrayList caps;
\r
1123 private Hashtable refs;
\r
1124 private int num_groups;
\r