Bug 10670 fix.
[mono.git] / mcs / class / System / System.Text.RegularExpressions / parser.cs
1 //
2 // assembly:    System
3 // namespace:   System.Text.RegularExpressions
4 // file:        parser.cs
5 //
6 // author:      Dan Lewis (dlewis@gmx.co.uk)
7 //              (c) 2002
8
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
17 // 
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
20 // 
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 //
29
30 using System;
31 using System.Collections;
32 using System.Globalization;
33
34 namespace System.Text.RegularExpressions.Syntax {
35
36         class Parser {
37                 public static int ParseDecimal (string str, ref int ptr) {
38                         return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
39                 }
40
41                 public static int ParseOctal (string str, ref int ptr) {
42                         return ParseNumber (str, ref ptr, 8, 1, 3);
43                 }
44
45                 public static int ParseHex (string str, ref int ptr, int digits) {
46                         return ParseNumber (str, ref ptr, 16, digits, digits);
47                 }
48
49                 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50                         int p = ptr, n = 0, digits = 0, d;
51                         if (max < min)
52                                 max = Int32.MaxValue;
53
54                         while (digits < max && p < str.Length) {
55                                 d = ParseDigit (str[p ++], b, digits);
56                                 if (d < 0) {
57                                         -- p;
58                                         break;
59                                 }
60
61                                 n = n * b + d;
62                                 ++ digits;
63                         }
64
65                         if (digits < min)
66                                 return -1;
67
68                         ptr = p;
69                         return n;
70                 }
71
72                 public static string ParseName (string str, ref int ptr) {
73                         if (Char.IsDigit (str[ptr])) {
74                                 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
75                                 if (gid > 0)
76                                         return gid.ToString ();
77                                 
78                                 return null;
79                         }
80
81                         int start = ptr;
82                         for (;;) {
83                                 if (!IsNameChar (str[ptr]))
84                                         break;
85                                 ++ ptr;
86                         }
87
88                         if (ptr - start > 0)
89                                 return str.Substring (start, ptr - start);
90
91                         return null;
92                 }
93
94                 public static string Escape (string str) {
95                         string result = "";
96                         for (int i = 0; i < str.Length; ++ i) {
97                                 char c = str[i];
98                                 switch (c) {
99                                 case '\\': case '*': case '+': case '?': case '|':
100                                 case '{': case '[': case '(': case ')': case '^':
101                                 case '$': case '.': case '#': case ' ':
102                                         result += "\\" + c;
103                                         break;
104
105                                 case '\t': result += "\\t"; break;
106                                 case '\n': result += "\\n"; break;
107                                 case '\r': result += "\\r"; break;
108                                 case '\f': result += "\\f"; break;
109
110                                 default: result += c; break;
111                                 }
112                         }
113
114                         return result;
115                 }
116
117                 public static string Unescape (string str) {
118                         if (str.IndexOf ('\\') == -1)
119                                 return str;
120                         return new Parser ().ParseString (str);
121                 }
122
123                 // public instance
124
125                 public Parser () {
126                         this.caps = new ArrayList ();
127                         this.refs = new Hashtable ();
128                 }
129
130                 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
131                         this.pattern = pattern;
132                         this.ptr = 0;
133
134                         caps.Clear ();
135                         refs.Clear ();
136                         this.num_groups = 0;
137
138                         try {
139                                 RegularExpression re = new RegularExpression ();
140                                 ParseGroup (re, options, null);
141                                 ResolveReferences ();
142
143                                 re.GroupCount = num_groups;
144                                 
145                                 return re;
146                         }
147                         catch (IndexOutOfRangeException) {
148                                 throw NewParseException ("Unexpected end of pattern.");
149                         }
150                 }
151
152                 public IDictionary GetMapping ()
153                 {
154                         Hashtable mapping = new Hashtable ();
155                         int end = caps.Count;
156                         mapping.Add ("0", 0);
157                         for (int i = 0; i < end; i++) {
158                                 CapturingGroup group = (CapturingGroup) caps [i];
159                                 if (group.Name != null) {
160                                         if (mapping.Contains (group.Name)) {
161                                                 if ((int) mapping [group.Name] != group.Number)
162                                                         throw new SystemException ("invalid state");
163                                                 continue;
164                                         }
165                                         mapping.Add (group.Name, group.Number);
166                                 } else {
167                                         mapping.Add (group.Number.ToString (), group.Number);
168                                 }
169                         }
170
171                         return mapping;
172                 }
173
174                 // private methods
175
176                 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
177                         bool is_top_level = group is RegularExpression;
178                 
179                         Alternation alternation = null;
180                         string literal = null;
181
182                         Group current = new Group ();
183                         Expression expr = null;
184                         bool closed = false;
185
186                         while (true) {
187                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
188                                 if (ptr >= pattern.Length)
189                                         break;
190                                 
191                                 // (1) Parse for Expressions
192                         
193                                 char ch = pattern[ptr ++];
194                                 
195                                 switch (ch) {
196                                 case '^': {
197                                         Position pos =
198                                                 IsMultiline (options) ? Position.StartOfLine : Position.Start;
199                                         expr = new PositionAssertion (pos);
200                                         break;
201                                 }
202
203                                 case '$': {
204                                         Position pos =
205                                                 IsMultiline (options) ? Position.EndOfLine : Position.End;
206                                         expr = new PositionAssertion (pos);
207                                         break;
208                                 }
209
210                                 case '.': {
211                                         Category cat =
212                                                 IsSingleline (options) ? Category.AnySingleline : Category.Any;
213                                         expr = new CharacterClass (cat, false);
214                                         break;
215                                 }
216
217                                 case '\\': {
218                                         int c = ParseEscape ();
219                                         if (c >= 0)
220                                                 ch = (char)c;
221                                         else {
222                                                 expr = ParseSpecial (options);
223
224                                                 if (expr == null)
225                                                         ch = pattern[ptr ++];           // default escape
226                                         }
227                                         break;
228                                 }
229
230                                 case '[': {
231                                         expr = ParseCharacterClass (options);
232                                         break;
233                                 }
234
235                                 case '(': {
236                                         bool ignore = IsIgnoreCase (options);
237                                         expr = ParseGroupingConstruct (ref options);
238                                         if (expr == null) {
239                                                 if (literal != null && IsIgnoreCase (options) != ignore) {
240                                                         current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
241                                                         literal = null;
242                                                 }
243
244                                                 continue;
245                                         }
246                                         break;
247                                 }
248
249                                 case ')': {
250                                         closed = true;
251                                         goto EndOfGroup;
252                                 }
253
254                                 case '|': {
255                                         if (literal != null) {
256                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
257                                                 literal = null;
258                                         }
259
260                                         if (assertion != null) {
261                                                 if (assertion.TrueExpression == null)
262                                                         assertion.TrueExpression = current;
263                                                 else if (assertion.FalseExpression == null)
264                                                         assertion.FalseExpression = current;
265                                                 else
266                                                         throw NewParseException ("Too many | in (?()|).");
267                                         }
268                                         else {
269                                                 if (alternation == null)
270                                                         alternation = new Alternation ();
271
272                                                 alternation.AddAlternative (current);
273                                         }
274
275                                         current = new Group ();
276                                         continue;
277                                 }
278
279                                 case '*': case '+': case '?': {
280                                         throw NewParseException ("Bad quantifier.");
281                                 }
282
283                                 default: 
284                                         break;          // literal character
285                                 }
286
287                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
288                                 
289                                 // (2) Check for Repetitions
290                                 
291                                 if (ptr < pattern.Length) {
292                                         char k = pattern[ptr];
293                                         int min = 0, max = 0;
294                                         bool lazy = false;
295                                         bool haveRep = false;
296
297
298                                         if (k == '?' || k == '*' || k == '+') {
299                                                 ++ ptr;
300                                                 haveRep = true;
301
302                                                 switch (k) {
303                                                 case '?': min = 0; max = 1; break;
304                                                 case '*': min = 0; max = 0x7fffffff; break;
305                                                 case '+': min = 1; max = 0x7fffffff; break;
306                                                 }
307                                         } else if (k == '{' && ptr + 1 < pattern.Length) {
308                                                 int saved_ptr = ptr;
309                                                 ++ptr;
310                                                 haveRep = ParseRepetitionBounds (out min, out max, options);
311                                                 if (!haveRep)
312                                                         ptr = saved_ptr;
313                                         }
314
315                                         if (haveRep) {
316                                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
317                                                 if (ptr < pattern.Length && pattern[ptr] == '?') {
318                                                         ++ ptr;
319                                                         lazy = true;
320                                                 }
321
322                                                 Repetition repetition = new Repetition (min, max, lazy);
323
324                                                 if (expr == null)
325                                                         repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
326                                                 else
327                                                         repetition.Expression = expr;
328
329                                                 expr = repetition;
330                                         }
331                                 }
332
333                                 // (3) Append Expression and/or Literal
334
335                                 if (expr == null) {
336                                         if (literal == null)
337                                                 literal = "";
338                                         literal += ch;
339                                 }
340                                 else {
341                                         if (literal != null) {
342                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
343                                                 literal = null;
344                                         }
345
346                                         current.AppendExpression (expr);
347                                         expr = null;
348                                 }
349
350                                 if (is_top_level && ptr >= pattern.Length)
351                                         goto EndOfGroup;
352                         }
353
354                 EndOfGroup:
355                         if (is_top_level && closed)
356                                 throw NewParseException ("Too many )'s.");
357                         if (!is_top_level && !closed)
358                                 throw NewParseException ("Not enough )'s.");
359                                 
360                 
361                         // clean up literals and alternations
362
363                         if (literal != null)
364                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
365
366                         if (assertion != null) {
367                                 if (assertion.TrueExpression == null)
368                                         assertion.TrueExpression = current;
369                                 else
370                                         assertion.FalseExpression = current;
371                                 
372                                 group.AppendExpression (assertion);
373                         }
374                         else if (alternation != null) {
375                                 alternation.AddAlternative (current);
376                                 group.AppendExpression (alternation);
377                         }
378                         else
379                                 group.AppendExpression (current);
380                 }
381
382                 private Expression ParseGroupingConstruct (ref RegexOptions options) {
383                         if (pattern[ptr] != '?') {
384                                 Group group;
385
386                                 if (IsExplicitCapture (options))
387                                         group = new Group ();
388                                 else {
389                                         group = new CapturingGroup ();
390                                         caps.Add (group);
391                                 }
392
393                                 ParseGroup (group, options, null);
394                                 return group;
395                         }
396                         else
397                                 ++ ptr;
398
399                         switch (pattern[ptr]) {
400                         case ':': {                                             // non-capturing group
401                                 ++ ptr;
402                                 Group group = new Group ();
403                                 ParseGroup (group, options, null);
404
405                                 return group;
406                         }
407
408                         case '>': {                                             // non-backtracking group
409                                 ++ ptr;
410                                 Group group = new NonBacktrackingGroup ();
411                                 ParseGroup (group, options, null);
412                                 
413                                 return group;
414                         }
415
416                         case 'i': case 'm': case 'n':
417                         case 's': case 'x': case '-': {                         // options
418                                 RegexOptions o = options;
419                                 ParseOptions (ref o, false);
420                                 if (pattern[ptr] == '-') {
421                                         ++ ptr;
422                                         ParseOptions (ref o, true);
423                                 }
424
425                                 if (pattern[ptr] == ':') {                      // pass options to child group
426                                         ++ ptr;
427                                         Group group = new Group ();
428                                         ParseGroup (group, o, null);
429                                         return group;
430                                 }
431                                 else if (pattern[ptr] == ')') {                 // change options of enclosing group
432                                         ++ ptr;
433                                         options = o;
434                                         return null;
435                                 }
436                                 else
437                                         throw NewParseException ("Bad options");
438                         }
439
440                         case '<': case '=': case '!': {                         // lookahead/lookbehind
441                                 ExpressionAssertion asn = new ExpressionAssertion ();
442                                 if (!ParseAssertionType (asn))
443                                         goto case '\'';                         // it's a (?<name> ) construct
444
445                                 Group test = new Group ();
446                                 ParseGroup (test, options, null);
447
448                                 asn.TestExpression = test;
449                                 return asn;
450                         }
451
452                         case '\'': {                                            // named/balancing group
453                                 char delim;
454                                 if (pattern[ptr] == '<')
455                                         delim = '>';
456                                 else
457                                         delim = '\'';
458
459                                 ++ ptr;
460                                 string name = ParseName ();
461
462                                 if (pattern[ptr] == delim) {
463                                         // capturing group
464
465                                         if (name == null)
466                                                 throw NewParseException ("Bad group name.");
467
468                                         ++ ptr;
469                                         CapturingGroup cap = new CapturingGroup ();
470                                         cap.Name = name;
471                                         caps.Add (cap);
472                                         ParseGroup (cap, options, null);
473
474                                         return cap;
475                                 }
476                                 else if (pattern[ptr] == '-') {
477                                         // balancing group
478
479                                         ++ ptr;
480                                         string balance_name = ParseName ();
481                                         if (balance_name == null || pattern[ptr] != delim)
482                                                 throw NewParseException ("Bad balancing group name.");
483
484                                         ++ ptr;
485                                         BalancingGroup bal = new BalancingGroup ();
486                                         bal.Name = name;
487                                         
488                                         if(bal.IsNamed) {
489                                                 caps.Add (bal);
490                                         }
491
492                                         refs.Add (bal, balance_name);
493
494                                         ParseGroup (bal, options, null);
495
496                                         return bal;
497                                 }
498                                 else
499                                         throw NewParseException ("Bad group name.");
500                         }
501
502                         case '(': {                                             // expression/capture test
503                                 Assertion asn;
504                         
505                                 ++ ptr;
506                                 int p = ptr;
507                                 string name = ParseName ();
508                                 if (name == null || pattern[ptr] != ')') {      // expression test
509                                         // FIXME MS implementation doesn't seem to
510                                         // implement this version of (?(x) ...)
511
512                                         ptr = p;
513                                         ExpressionAssertion expr_asn = new ExpressionAssertion ();
514
515                                         if (pattern[ptr] == '?') {
516                                                 ++ ptr;
517                                                 if (!ParseAssertionType (expr_asn))
518                                                         throw NewParseException ("Bad conditional.");
519                                         }
520                                         else {
521                                                 expr_asn.Negate = false;
522                                                 expr_asn.Reverse = false;
523                                         }
524
525                                         Group test = new Group ();
526                                         ParseGroup (test, options, null);
527                                         expr_asn.TestExpression = test;
528                                         asn = expr_asn;
529                                 }
530                                 else {                                          // capture test
531                                         ++ ptr;
532                                         asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
533                                         refs.Add (asn, name);
534                                 }
535
536                                 Group group = new Group ();
537                                 ParseGroup (group, options, asn);
538                                 return group;
539                         }
540
541                         case '#': {                                             // comment
542                                 ++ ptr;
543                                 while (pattern[ptr ++] != ')') {
544                                         if (ptr >= pattern.Length)
545                                                 throw NewParseException ("Unterminated (?#...) comment.");
546                                 }
547                                 return null;
548                         }
549
550                         default:                                                // error
551                                 throw NewParseException ("Bad grouping construct.");
552                         }
553                 }
554
555                 private bool ParseAssertionType (ExpressionAssertion assertion) {
556                         if (pattern[ptr] == '<') {
557                                 switch (pattern[ptr + 1]) {
558                                 case '=':
559                                         assertion.Negate = false;
560                                         break;
561                                 case '!':
562                                         assertion.Negate = true;
563                                         break;
564                                 default:
565                                         return false;
566                                 }
567
568                                 assertion.Reverse = true;
569                                 ptr += 2;
570                         }
571                         else {
572                                 switch (pattern[ptr]) {
573                                 case '=':
574                                         assertion.Negate = false;
575                                         break;
576                                 case '!':
577                                         assertion.Negate = true;
578                                         break;
579                                 default:
580                                         return false;
581                                 }
582
583                                 assertion.Reverse = false;
584                                 ptr += 1;
585                         }
586
587                         return true;
588                 }
589
590                 private void ParseOptions (ref RegexOptions options, bool negate) {
591                         for (;;) {
592                                 switch (pattern[ptr]) {
593                                 case 'i':
594                                         if (negate)
595                                                 options &= ~RegexOptions.IgnoreCase;
596                                         else
597                                                 options |= RegexOptions.IgnoreCase;
598                                         break;
599
600                                 case 'm':
601                                         if (negate)
602                                                 options &= ~RegexOptions.Multiline;
603                                         else
604                                                 options |= RegexOptions.Multiline;
605                                         break;
606                                         
607                                 case 'n':
608                                         if (negate)
609                                                 options &= ~RegexOptions.ExplicitCapture;
610                                         else
611                                                 options |= RegexOptions.ExplicitCapture;
612                                         break;
613                                         
614                                 case 's':
615                                         if (negate)
616                                                 options &= ~RegexOptions.Singleline;
617                                         else
618                                                 options |= RegexOptions.Singleline;
619                                         break;
620                                         
621                                 case 'x':
622                                         if (negate)
623                                                 options &= ~RegexOptions.IgnorePatternWhitespace;
624                                         else
625                                                 options |= RegexOptions.IgnorePatternWhitespace;
626                                         break;
627
628                                 default:
629                                         return;
630                                 }
631
632                                 ++ ptr;
633                         }
634                 }
635
636                 private Expression ParseCharacterClass (RegexOptions options) {
637                         bool negate = false;
638                         if (pattern[ptr] == '^') {
639                                 negate = true;
640                                 ++ ptr;
641                         }
642                         
643                         bool ecma = IsECMAScript (options);
644                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
645
646                         if (pattern[ptr] == ']') {
647                                 cls.AddCharacter (']');
648                                 ++ ptr;
649                         }
650
651                         int c = -1;
652                         int last = -1;
653                         bool range = false;
654                         bool closed = false;
655                         while (ptr < pattern.Length) {
656                                 c = pattern[ptr ++];
657
658                                 if (c == ']') {
659                                         closed = true;
660                                         break;
661                                 }
662
663                                 if (c == '-' && last >= 0 && !range) {
664                                         range = true;
665                                         continue;
666                                 }
667
668                                 if (c == '\\') {
669                                         c = ParseEscape ();
670                                         if (c >= 0)
671                                                 goto char_recognized;
672
673                                         // didn't recognize escape
674                                         c = pattern [ptr ++];
675                                         switch (c) {
676                                         case 'b':
677                                                 c = '\b';
678                                                 goto char_recognized;
679
680                                         case 'd': case 'D':
681                                                 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
682                                                 break;
683                                                 
684                                         case 'w': case 'W':
685                                                 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
686                                                 break;
687                                                 
688                                         case 's': case 'S':
689                                                 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
690                                                 break;
691                                                 
692                                         case 'p': case 'P':
693                                                 cls.AddCategory (ParseUnicodeCategory (), c == 'P');    // ignore ecma
694                                                 break;
695
696                                         default:                // add escaped character
697                                                 goto char_recognized;
698                                         }
699
700                                         // if the pattern looks like [a-\s] ...
701                                         if (range)
702                                                 throw NewParseException ("character range cannot have category \\" + c);
703
704                                         last = -1;
705                                         continue;
706                                 }
707
708                         char_recognized:
709                                 if (range) {
710                                         // if 'range' is true, we know that 'last >= 0'
711                                         if (c < last)
712                                                 throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
713                                         cls.AddRange ((char)last, (char)c);
714                                         last = -1;
715                                         range = false;
716                                         continue;
717                                 }
718
719                                 cls.AddCharacter ((char)c);
720                                 last = c;
721                         }
722
723                         if (!closed)
724                                 throw NewParseException ("Unterminated [] set.");
725
726                         if (range)
727                                 cls.AddCharacter ('-');
728
729                         return cls;
730                 }
731
732                 private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
733                         int n, m;
734                         min = max = 0;
735
736                         /* check syntax */
737
738                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));
739                     
740                         if (pattern[ptr] == ',') {
741                                 n = -1;
742                         } else {
743                                 n = ParseNumber (10, 1, 0);
744                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
745                         }
746                         
747                         switch (pattern[ptr ++]) {
748                         case '}':
749                                 m = n;
750                                 break;
751                         case ',':
752                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
753                                 m = ParseNumber (10, 1, 0);
754                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
755                                 if (pattern[ptr ++] != '}')
756                                         return false;
757                                 break;
758                         default:
759                                 return false;
760                         }
761
762                         /* check bounds and ordering */
763
764                         if (n > 0x7fffffff || m > 0x7fffffff)
765                                 throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
766                         if (m >= 0 && m < n)
767                                 throw NewParseException ("Illegal {x, y} with x > y.");
768
769                         /* assign min and max */
770                         
771                         min = n;
772                         if (m > 0)
773                                 max = m;
774                         else
775                                 max = 0x7fffffff;
776
777                         return true;
778                 }
779
780                 private Category ParseUnicodeCategory () {
781                         if (pattern[ptr ++] != '{')
782                                 throw NewParseException ("Incomplete \\p{X} character escape.");
783
784                         string name = ParseName (pattern, ref ptr);
785                         if (name == null)
786                                 throw NewParseException ("Incomplete \\p{X} character escape.");
787
788                         Category cat = CategoryUtils.CategoryFromName (name);
789                         if (cat == Category.None)
790                                 throw NewParseException ("Unknown property '" + name + "'.");
791
792                         if (pattern[ptr ++] != '}')
793                                 throw NewParseException ("Incomplete \\p{X} character escape.");
794
795                         return cat;
796                 }
797
798                 private Expression ParseSpecial (RegexOptions options) {
799                         int p = ptr;
800                         bool ecma = IsECMAScript (options);
801                         Expression expr = null;
802                         
803                         switch (pattern[ptr ++]) {
804
805                         // categories
806
807                         case 'd':
808                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
809                                 break;
810                                 
811                         case 'w':
812                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
813                                 break;
814                                 
815                         case 's':
816                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
817                                 break;
818                                 
819                         case 'p':
820                                 // this is odd - ECMAScript isn't supposed to support Unicode,
821                                 // yet \p{..} compiles and runs under the MS implementation
822                                 // identically to canonical mode. That's why I'm ignoring the
823                                 // value of ecma here.
824                         
825                                 expr = new CharacterClass (ParseUnicodeCategory (), false);
826                                 break;
827                                 
828                         case 'D':
829                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
830                                 break;
831                                 
832                         case 'W':
833                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
834                                 break;
835                                 
836                         case 'S':
837                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
838                                 break;
839                                 
840                         case 'P':
841                                 expr = new CharacterClass (ParseUnicodeCategory (), true);
842                                 break;
843
844                         // positions
845
846                         case 'A': expr = new PositionAssertion (Position.StartOfString); break;
847                         case 'Z': expr = new PositionAssertion (Position.End); break;
848                         case 'z': expr = new PositionAssertion (Position.EndOfString); break;
849                         case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
850                         case 'b': expr = new PositionAssertion (Position.Boundary); break;
851                         case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
852                         
853                         // references
854
855                         case '1': case '2': case '3': case '4': case '5':
856                         case '6': case '7': case '8': case '9': {
857                                 ptr --;
858                                 int n = ParseNumber (10, 1, 0);
859                                 if (n < 0) {
860                                         ptr = p;
861                                         return null;
862                                 }
863
864                                 // FIXME test if number is within number of assigned groups
865                                 // this may present a problem for right-to-left matching
866
867                                 Reference reference = new Reference (IsIgnoreCase (options));
868                                 refs.Add (reference, n.ToString ());
869                                 expr = reference;
870                                 break;
871                         }
872
873                         case 'k': {
874                                 char delim = pattern[ptr ++];
875                                 if (delim == '<')
876                                         delim = '>';
877                                 else if (delim != '\'')
878                                         throw NewParseException ("Malformed \\k<...> named backreference.");
879
880                                 string name = ParseName ();
881                                 if (name == null || pattern[ptr] != delim)
882                                         throw NewParseException ("Malformed \\k<...> named backreference.");
883
884                                 ++ ptr;
885                                 Reference reference = new Reference (IsIgnoreCase (options));
886                                 refs.Add (reference, name);
887                                 expr = reference;
888                                 break;
889                         }
890
891                         default:
892                                 expr = null;
893                                 break;
894                         }
895
896                         if (expr == null)
897                                 ptr = p;
898
899                         return expr;
900                 }
901
902                 private int ParseEscape () {
903                         int p = ptr;
904                         int c;
905
906                         if (p >= pattern.Length)
907                                 throw new ArgumentException (
908                                                 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + 
909                                                                 "pattern.", pattern), pattern);
910                         
911                         switch (pattern[ptr ++]) {
912         
913                         // standard escapes (except \b)
914
915                         case 'a': return '\u0007';
916                         case 't': return '\u0009';
917                         case 'r': return '\u000d';
918                         case 'v': return '\u000b';
919                         case 'f': return '\u000c';
920                         case 'n': return '\u000a';
921                         case 'e': return '\u001b';
922                         case '\\': return '\\';
923
924                         // character codes
925
926                         case '0':
927                                 //
928                                 // Turns out that octal values can be specified
929                                 // without a leading zero.   But also the limit
930                                 // of three character should include this first
931                                 // one.  
932                                 //
933                                 ptr--;
934                                 int prevptr = ptr;
935                                 int result = ParseOctal (pattern, ref ptr);
936                                 if (result == -1 && prevptr == ptr)
937                                         return 0;
938
939                                 return result;
940
941                         case 'x':
942                                 c = ParseHex (pattern, ref ptr, 2);
943                                 if (c < 0)
944                                         throw NewParseException ("Insufficient hex digits");
945
946                                 return c;
947
948                         case 'u':
949                                 c = ParseHex (pattern, ref ptr, 4);
950                                 if (c < 0)
951                                         throw NewParseException ("Insufficient hex digits");
952                                 
953                                 return c;
954
955                         // control characters
956
957                         case 'c':
958                                 c = pattern[ptr ++];
959                                 if (c >= '@' && c <= '_')
960                                         return c - '@';
961                                 else
962                                         throw NewParseException ("Unrecognized control character.");
963
964                         // unknown escape
965
966                         default:
967                                 ptr = p;
968                                 return -1;
969                         }
970                 }
971
972                 private string ParseName () {
973                         return Parser.ParseName (pattern, ref ptr);
974                 }
975
976                 private static bool IsNameChar (char c) {
977                         UnicodeCategory cat = Char.GetUnicodeCategory (c);
978                         if (cat == UnicodeCategory.ModifierLetter)
979                                 return false;
980                         if (cat == UnicodeCategory.ConnectorPunctuation)
981                                 return true;
982                         return Char.IsLetterOrDigit (c);
983                 }
984         
985                 private int ParseNumber (int b, int min, int max) {
986                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);
987                 }
988
989                 private static int ParseDigit (char c, int b, int n) {
990                         switch (b) {
991                         case 8:
992                                 if (c >= '0' && c <= '7')
993                                         return c - '0';
994                                 else
995                                         return -1;
996                         case 10:
997                                 if (c >= '0' && c <= '9')
998                                         return c - '0';
999                                 else
1000                                         return -1;
1001                         case 16:
1002                                 if (c >= '0' && c <= '9')
1003                                         return c - '0';
1004                                 else if (c >= 'a' && c <= 'f')
1005                                         return 10 + c - 'a';
1006                                 else if (c >= 'A' && c <= 'F')
1007                                         return 10 + c - 'A';
1008                                 else
1009                                         return -1;
1010                         default:
1011                                 return -1;
1012                         }
1013                 }
1014
1015                 private void ConsumeWhitespace (bool ignore) {
1016                         while (ptr < pattern.Length) {
1017                                 if (pattern[ptr] == '(') {
1018                                         if (ptr + 3 >= pattern.Length)
1019                                                 return;
1020
1021                                         if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1022                                                 return;
1023
1024                                         ptr += 3;
1025                                         while (ptr < pattern.Length && pattern[ptr ++] != ')')
1026                                                 /* ignore */ ;
1027                                 }
1028                                 else if (ignore && pattern[ptr] == '#') {
1029                                         while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1030                                                 /* ignore */ ;
1031                                 }
1032                                 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1033                                         while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1034                                                 ++ ptr;
1035                                 }
1036                                 else
1037                                         return;
1038                         }
1039                 }
1040
1041                 private string ParseString (string pattern) {
1042                         this.pattern = pattern;
1043                         this.ptr = 0;
1044
1045                         StringBuilder result = new StringBuilder (pattern.Length);
1046                         while (ptr < pattern.Length) {
1047                                 int c = pattern[ptr ++];
1048                                 if (c == '\\') {
1049                                         c = ParseEscape ();
1050
1051                                         if(c < 0) {
1052                                                 c = pattern[ptr ++];
1053                                                 if(c == 'b')
1054                                                         c = '\b';
1055                                         }
1056                                 }
1057                                 result.Append ((char) c);
1058                         }
1059
1060                         return result.ToString ();
1061                 }
1062
1063                 private void ResolveReferences () {
1064                         int gid = 1;
1065                         Hashtable dict = new Hashtable ();
1066
1067                         // number unnamed groups
1068
1069                         foreach (CapturingGroup group in caps) {
1070                                 if (group.Name == null) {
1071                                         dict.Add (gid.ToString (), group);
1072                                         group.Number = gid ++;
1073
1074                                         ++ num_groups;
1075                                 }
1076                         }
1077
1078                         // number named groups
1079
1080                         foreach (CapturingGroup group in caps) {
1081                                 if (group.Name != null) {
1082                                         if (!dict.Contains (group.Name)) {
1083                                                 dict.Add (group.Name, group);
1084                                                 group.Number = gid ++;
1085
1086                                                 ++ num_groups;
1087                                         }
1088                                         else {
1089                                                 CapturingGroup prev = (CapturingGroup)dict[group.Name];
1090                                                 group.Number = prev.Number;
1091                                         }
1092                                 }
1093                         }
1094
1095                         // resolve references
1096
1097                         foreach (Expression expr in refs.Keys) {
1098                                 string name = (string)refs[expr];
1099                                 if (!dict.Contains (name)) {
1100                                         if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
1101                                                 continue;
1102                                         throw NewParseException ("Reference to undefined group " +
1103                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +
1104                                                 name);
1105                                 }
1106
1107                                 CapturingGroup group = (CapturingGroup)dict[name];
1108                                 if (expr is Reference)
1109                                         ((Reference)expr).CapturingGroup = group;
1110                                 else if (expr is CaptureAssertion)
1111                                         ((CaptureAssertion)expr).CapturingGroup = group;
1112                                 else if (expr is BalancingGroup)
1113                                         ((BalancingGroup)expr).Balance = group;
1114                         }
1115                 }
1116
1117                 // flag helper functions
1118
1119                 private static bool IsIgnoreCase (RegexOptions options) {
1120                         return (options & RegexOptions.IgnoreCase) != 0;
1121                 }
1122
1123                 private static bool IsMultiline (RegexOptions options) {
1124                         return (options & RegexOptions.Multiline) != 0;
1125                 }
1126
1127                 private static bool IsExplicitCapture (RegexOptions options) {
1128                         return (options & RegexOptions.ExplicitCapture) != 0;
1129                 }
1130         
1131                 private static bool IsSingleline (RegexOptions options) {
1132                         return (options & RegexOptions.Singleline) != 0;
1133                 }
1134
1135                 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1136                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1137                 }
1138
1139                 private static bool IsECMAScript (RegexOptions options) {
1140                         return (options & RegexOptions.ECMAScript) != 0;
1141                 }
1142
1143                 // exception creation
1144
1145                 private ArgumentException NewParseException (string msg) {
1146                         msg = "parsing \"" + pattern + "\" - " + msg;
1147                         return new ArgumentException (msg, pattern);
1148                 }
1149
1150                 private string pattern;
1151                 private int ptr;
1152
1153                 private ArrayList caps;
1154                 private Hashtable refs;
1155                 private int num_groups;
1156         }
1157 }