New test.
[mono.git] / mcs / class / System / System.Text.RegularExpressions / parser.cs
1 //
2 // assembly:    System
3 // namespace:   System.Text.RegularExpressions
4 // file:        parser.cs
5 //
6 // author:      Dan Lewis (dlewis@gmx.co.uk)
7 //              (c) 2002
8
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
17 // 
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
20 // 
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 //
29
30 using System;
31 using System.Collections;
32 using System.Globalization;
33
34 namespace System.Text.RegularExpressions.Syntax {
35
36         class Parser {
37                 public static int ParseDecimal (string str, ref int ptr) {
38                         return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
39                 }
40
41                 public static int ParseOctal (string str, ref int ptr) {
42                         return ParseNumber (str, ref ptr, 8, 1, 3);
43                 }
44
45                 public static int ParseHex (string str, ref int ptr, int digits) {
46                         return ParseNumber (str, ref ptr, 16, digits, digits);
47                 }
48
49                 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50                         int p = ptr, n = 0, digits = 0, d;
51                         if (max < min)
52                                 max = Int32.MaxValue;
53
54                         while (digits < max && p < str.Length) {
55                                 d = ParseDigit (str[p ++], b, digits);
56                                 if (d < 0) {
57                                         -- p;
58                                         break;
59                                 }
60
61                                 n = n * b + d;
62                                 ++ digits;
63                         }
64
65                         if (digits < min)
66                                 return -1;
67
68                         ptr = p;
69                         return n;
70                 }
71
72                 public static string ParseName (string str, ref int ptr) {
73                         if (Char.IsDigit (str[ptr])) {
74                                 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
75                                 if (gid > 0)
76                                         return gid.ToString ();
77                                 
78                                 return null;
79                         }
80
81                         int start = ptr;
82                         for (;;) {
83                                 if (!IsNameChar (str[ptr]))
84                                         break;
85                                 ++ ptr;
86                         }
87
88                         if (ptr - start > 0)
89                                 return str.Substring (start, ptr - start);
90
91                         return null;
92                 }
93
94                 public static string Escape (string str) {
95                         string result = "";
96                         for (int i = 0; i < str.Length; ++ i) {
97                                 char c = str[i];
98                                 switch (c) {
99                                 case '\\': case '*': case '+': case '?': case '|':
100                                 case '{': case '[': case '(': case ')': case '^':
101                                 case '$': case '.': case '#': case ' ':
102                                         result += "\\" + c;
103                                         break;
104
105                                 case '\t': result += "\\t"; break;
106                                 case '\n': result += "\\n"; break;
107                                 case '\r': result += "\\r"; break;
108                                 case '\f': result += "\\f"; break;
109
110                                 default: result += c; break;
111                                 }
112                         }
113
114                         return result;
115                 }
116
117                 public static string Unescape (string str) {
118                         if (str.IndexOf ('\\') == -1)
119                                 return str;
120                         return new Parser ().ParseString (str);
121                 }
122
123                 // public instance
124
125                 public Parser () {
126                         this.caps = new ArrayList ();
127                         this.refs = new Hashtable ();
128                 }
129
130                 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
131                         this.pattern = pattern;
132                         this.ptr = 0;
133
134                         caps.Clear ();
135                         refs.Clear ();
136                         this.num_groups = 0;
137
138                         try {
139                                 RegularExpression re = new RegularExpression ();
140                                 ParseGroup (re, options, null);
141                                 ResolveReferences ();
142
143                                 re.GroupCount = num_groups;
144                                 
145                                 return re;
146                         }
147                         catch (IndexOutOfRangeException) {
148                                 throw NewParseException ("Unexpected end of pattern.");
149                         }
150                 }
151
152                 public IDictionary GetMapping () {
153                         Hashtable mapping = new Hashtable ();
154                         Hashtable numbers = new Hashtable ();
155                         int end = caps.Count;
156                         mapping.Add ("0", 0);
157                         for (int i = 0; i < end; i++) {
158                                 CapturingGroup group = (CapturingGroup) caps [i];
159                                 if (group.Name != null && !mapping.Contains (group.Name)) {
160                                         mapping.Add (group.Name, group.Number);
161                                         numbers.Add (group.Number, group.Number);
162                                 }
163                         }
164
165                         for (int i = 1; i < end; i++) {
166                                 if (numbers [i] == null)
167                                         mapping.Add (i.ToString (), i);
168                         }
169
170                         return mapping;
171                 }
172
173                 // private methods
174
175                 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
176                         bool is_top_level = group is RegularExpression;
177                 
178                         Alternation alternation = null;
179                         string literal = null;
180
181                         Group current = new Group ();
182                         Expression expr = null;
183                         bool closed = false;
184
185                         while (true) {
186                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
187                                 if (ptr >= pattern.Length)
188                                         break;
189                                 
190                                 // (1) Parse for Expressions
191                         
192                                 char ch = pattern[ptr ++];
193                                 
194                                 switch (ch) {
195                                 case '^': {
196                                         Position pos =
197                                                 IsMultiline (options) ? Position.StartOfLine : Position.Start;
198                                         expr = new PositionAssertion (pos);
199                                         break;
200                                 }
201
202                                 case '$': {
203                                         Position pos =
204                                                 IsMultiline (options) ? Position.EndOfLine : Position.End;
205                                         expr = new PositionAssertion (pos);
206                                         break;
207                                 }
208
209                                 case '.': {
210                                         Category cat =
211                                                 IsSingleline (options) ? Category.AnySingleline : Category.Any;
212                                         expr = new CharacterClass (cat, false);
213                                         break;
214                                 }
215
216                                 case '\\': {
217                                         int c = ParseEscape ();
218                                         if (c >= 0)
219                                                 ch = (char)c;
220                                         else {
221                                                 expr = ParseSpecial (options);
222
223                                                 if (expr == null)
224                                                         ch = pattern[ptr ++];           // default escape
225                                         }
226                                         break;
227                                 }
228
229                                 case '[': {
230                                         expr = ParseCharacterClass (options);
231                                         break;
232                                 }
233
234                                 case '(': {
235                                         bool ignore = IsIgnoreCase (options);
236                                         expr = ParseGroupingConstruct (ref options);
237                                         if (expr == null) {
238                                                 if (literal != null && IsIgnoreCase (options) != ignore) {
239                                                         current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
240                                                         literal = null;
241                                                 }
242
243                                                 continue;
244                                         }
245                                         break;
246                                 }
247
248                                 case ')': {
249                                         closed = true;
250                                         goto EndOfGroup;
251                                 }
252
253                                 case '|': {
254                                         if (literal != null) {
255                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
256                                                 literal = null;
257                                         }
258
259                                         if (assertion != null) {
260                                                 if (assertion.TrueExpression == null)
261                                                         assertion.TrueExpression = current;
262                                                 else if (assertion.FalseExpression == null)
263                                                         assertion.FalseExpression = current;
264                                                 else
265                                                         throw NewParseException ("Too many | in (?()|).");
266                                         }
267                                         else {
268                                                 if (alternation == null)
269                                                         alternation = new Alternation ();
270
271                                                 alternation.AddAlternative (current);
272                                         }
273
274                                         current = new Group ();
275                                         continue;
276                                 }
277
278                                 case '*': case '+': case '?': {
279                                         throw NewParseException ("Bad quantifier.");
280                                 }
281
282                                 default: 
283                                         break;          // literal character
284                                 }
285
286                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
287                                 
288                                 // (2) Check for Repetitions
289                                 
290                                 if (ptr < pattern.Length) {
291                                         char k = pattern[ptr];
292                                         int min = 0, max = 0;
293                                         bool lazy = false;
294                                         bool haveRep = false;
295
296
297                                         if (k == '?' || k == '*' || k == '+') {
298                                                 ++ ptr;
299                                                 haveRep = true;
300
301                                                 switch (k) {
302                                                 case '?': min = 0; max = 1; break;
303                                                 case '*': min = 0; max = 0x7fffffff; break;
304                                                 case '+': min = 1; max = 0x7fffffff; break;
305                                                 }
306                                         } else if (k == '{' && ptr + 1 < pattern.Length) {
307                                                 int saved_ptr = ptr;
308                                                 ++ptr;
309                                                 haveRep = ParseRepetitionBounds (out min, out max, options);
310                                                 if (!haveRep)
311                                                         ptr = saved_ptr;
312                                         }
313
314                                         if (haveRep) {
315                                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
316                                                 if (ptr < pattern.Length && pattern[ptr] == '?') {
317                                                         ++ ptr;
318                                                         lazy = true;
319                                                 }
320
321                                                 Repetition repetition = new Repetition (min, max, lazy);
322
323                                                 if (expr == null)
324                                                         repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
325                                                 else
326                                                         repetition.Expression = expr;
327
328                                                 expr = repetition;
329                                         }
330                                 }
331
332                                 // (3) Append Expression and/or Literal
333
334                                 if (expr == null) {
335                                         if (literal == null)
336                                                 literal = "";
337                                         literal += ch;
338                                 }
339                                 else {
340                                         if (literal != null) {
341                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
342                                                 literal = null;
343                                         }
344
345                                         current.AppendExpression (expr);
346                                         expr = null;
347                                 }
348
349                                 if (is_top_level && ptr >= pattern.Length)
350                                         goto EndOfGroup;
351                         }
352
353                 EndOfGroup:
354                         if (is_top_level && closed)
355                                 throw NewParseException ("Too many )'s.");
356                         if (!is_top_level && !closed)
357                                 throw NewParseException ("Not enough )'s.");
358                                 
359                 
360                         // clean up literals and alternations
361
362                         if (literal != null)
363                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
364
365                         if (assertion != null) {
366                                 if (assertion.TrueExpression == null)
367                                         assertion.TrueExpression = current;
368                                 else
369                                         assertion.FalseExpression = current;
370                                 
371                                 group.AppendExpression (assertion);
372                         }
373                         else if (alternation != null) {
374                                 alternation.AddAlternative (current);
375                                 group.AppendExpression (alternation);
376                         }
377                         else
378                                 group.AppendExpression (current);
379                 }
380
381                 private Expression ParseGroupingConstruct (ref RegexOptions options) {
382                         if (pattern[ptr] != '?') {
383                                 Group group;
384
385                                 if (IsExplicitCapture (options))
386                                         group = new Group ();
387                                 else {
388                                         group = new CapturingGroup ();
389                                         caps.Add (group);
390                                 }
391
392                                 ParseGroup (group, options, null);
393                                 return group;
394                         }
395                         else
396                                 ++ ptr;
397
398                         switch (pattern[ptr]) {
399                         case ':': {                                             // non-capturing group
400                                 ++ ptr;
401                                 Group group = new Group ();
402                                 ParseGroup (group, options, null);
403
404                                 return group;
405                         }
406
407                         case '>': {                                             // non-backtracking group
408                                 ++ ptr;
409                                 Group group = new NonBacktrackingGroup ();
410                                 ParseGroup (group, options, null);
411                                 
412                                 return group;
413                         }
414
415                         case 'i': case 'm': case 'n':
416                         case 's': case 'x': case '-': {                         // options
417                                 RegexOptions o = options;
418                                 ParseOptions (ref o, false);
419                                 if (pattern[ptr] == '-') {
420                                         ++ ptr;
421                                         ParseOptions (ref o, true);
422                                 }
423
424                                 if (pattern[ptr] == ':') {                      // pass options to child group
425                                         ++ ptr;
426                                         Group group = new Group ();
427                                         ParseGroup (group, o, null);
428                                         return group;
429                                 }
430                                 else if (pattern[ptr] == ')') {                 // change options of enclosing group
431                                         ++ ptr;
432                                         options = o;
433                                         return null;
434                                 }
435                                 else
436                                         throw NewParseException ("Bad options");
437                         }
438
439                         case '<': case '=': case '!': {                         // lookahead/lookbehind
440                                 ExpressionAssertion asn = new ExpressionAssertion ();
441                                 if (!ParseAssertionType (asn))
442                                         goto case '\'';                         // it's a (?<name> ) construct
443
444                                 Group test = new Group ();
445                                 ParseGroup (test, options, null);
446
447                                 asn.TestExpression = test;
448                                 return asn;
449                         }
450
451                         case '\'': {                                            // named/balancing group
452                                 char delim;
453                                 if (pattern[ptr] == '<')
454                                         delim = '>';
455                                 else
456                                         delim = '\'';
457
458                                 ++ ptr;
459                                 string name = ParseName ();
460
461                                 if (pattern[ptr] == delim) {
462                                         // capturing group
463
464                                         if (name == null)
465                                                 throw NewParseException ("Bad group name.");
466
467                                         ++ ptr;
468                                         CapturingGroup cap = new CapturingGroup ();
469                                         cap.Name = name;
470                                         caps.Add (cap);
471                                         ParseGroup (cap, options, null);
472
473                                         return cap;
474                                 }
475                                 else if (pattern[ptr] == '-') {
476                                         // balancing group
477
478                                         ++ ptr;
479                                         string balance_name = ParseName ();
480                                         if (balance_name == null || pattern[ptr] != delim)
481                                                 throw NewParseException ("Bad balancing group name.");
482
483                                         ++ ptr;
484                                         BalancingGroup bal = new BalancingGroup ();
485                                         bal.Name = name;
486                                         
487                                         if(bal.IsNamed) {
488                                                 caps.Add (bal);
489                                         }
490
491                                         refs.Add (bal, balance_name);
492
493                                         ParseGroup (bal, options, null);
494
495                                         return bal;
496                                 }
497                                 else
498                                         throw NewParseException ("Bad group name.");
499                         }
500
501                         case '(': {                                             // expression/capture test
502                                 Assertion asn;
503                         
504                                 ++ ptr;
505                                 int p = ptr;
506                                 string name = ParseName ();
507                                 if (name == null || pattern[ptr] != ')') {      // expression test
508                                         // FIXME MS implementation doesn't seem to
509                                         // implement this version of (?(x) ...)
510
511                                         ptr = p;
512                                         ExpressionAssertion expr_asn = new ExpressionAssertion ();
513
514                                         if (pattern[ptr] == '?') {
515                                                 ++ ptr;
516                                                 if (!ParseAssertionType (expr_asn))
517                                                         throw NewParseException ("Bad conditional.");
518                                         }
519                                         else {
520                                                 expr_asn.Negate = false;
521                                                 expr_asn.Reverse = false;
522                                         }
523
524                                         Group test = new Group ();
525                                         ParseGroup (test, options, null);
526                                         expr_asn.TestExpression = test;
527                                         asn = expr_asn;
528                                 }
529                                 else {                                          // capture test
530                                         ++ ptr;
531                                         asn = new CaptureAssertion ();
532                                         refs.Add (asn, name);
533                                 }
534
535                                 Group group = new Group ();
536                                 ParseGroup (group, options, asn);
537                                 return group;
538                         }
539
540                         case '#': {                                             // comment
541                                 ++ ptr;
542                                 while (pattern[ptr ++] != ')') {
543                                         if (ptr >= pattern.Length)
544                                                 throw NewParseException ("Unterminated (?#...) comment.");
545                                 }
546                                 return null;
547                         }
548
549                         default:                                                // error
550                                 throw NewParseException ("Bad grouping construct.");
551                         }
552                 }
553
554                 private bool ParseAssertionType (ExpressionAssertion assertion) {
555                         if (pattern[ptr] == '<') {
556                                 switch (pattern[ptr + 1]) {
557                                 case '=':
558                                         assertion.Negate = false;
559                                         break;
560                                 case '!':
561                                         assertion.Negate = true;
562                                         break;
563                                 default:
564                                         return false;
565                                 }
566
567                                 assertion.Reverse = true;
568                                 ptr += 2;
569                         }
570                         else {
571                                 switch (pattern[ptr]) {
572                                 case '=':
573                                         assertion.Negate = false;
574                                         break;
575                                 case '!':
576                                         assertion.Negate = true;
577                                         break;
578                                 default:
579                                         return false;
580                                 }
581
582                                 assertion.Reverse = false;
583                                 ptr += 1;
584                         }
585
586                         return true;
587                 }
588
589                 private void ParseOptions (ref RegexOptions options, bool negate) {
590                         for (;;) {
591                                 switch (pattern[ptr]) {
592                                 case 'i':
593                                         if (negate)
594                                                 options &= ~RegexOptions.IgnoreCase;
595                                         else
596                                                 options |= RegexOptions.IgnoreCase;
597                                         break;
598
599                                 case 'm':
600                                         if (negate)
601                                                 options &= ~RegexOptions.Multiline;
602                                         else
603                                                 options |= RegexOptions.Multiline;
604                                         break;
605                                         
606                                 case 'n':
607                                         if (negate)
608                                                 options &= ~RegexOptions.ExplicitCapture;
609                                         else
610                                                 options |= RegexOptions.ExplicitCapture;
611                                         break;
612                                         
613                                 case 's':
614                                         if (negate)
615                                                 options &= ~RegexOptions.Singleline;
616                                         else
617                                                 options |= RegexOptions.Singleline;
618                                         break;
619                                         
620                                 case 'x':
621                                         if (negate)
622                                                 options &= ~RegexOptions.IgnorePatternWhitespace;
623                                         else
624                                                 options |= RegexOptions.IgnorePatternWhitespace;
625                                         break;
626
627                                 default:
628                                         return;
629                                 }
630
631                                 ++ ptr;
632                         }
633                 }
634
635                 private Expression ParseCharacterClass (RegexOptions options) {
636                         bool negate = false;
637                         if (pattern[ptr] == '^') {
638                                 negate = true;
639                                 ++ ptr;
640                         }
641                         
642                         bool ecma = IsECMAScript (options);
643                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
644
645                         if (pattern[ptr] == ']') {
646                                 cls.AddCharacter (']');
647                                 ++ ptr;
648                         }
649
650                         int c = -1;
651                         int last = -1;
652                         bool range = false;
653                         bool closed = false;
654                         while (ptr < pattern.Length) {
655                                 c = pattern[ptr ++];
656
657                                 if (c == ']') {
658                                         closed = true;
659                                         break;
660                                 }
661
662                                 if (c == '-' && last >= 0 && !range) {
663                                         range = true;
664                                         continue;
665                                 }
666
667                                 if (c == '\\') {
668                                         c = ParseEscape ();
669                                         if (c >= 0)
670                                                 goto char_recognized;
671
672                                         // didn't recognize escape
673                                         c = pattern [ptr ++];
674                                         switch (c) {
675                                         case 'b':
676                                                 c = '\b';
677                                                 goto char_recognized;
678
679                                         case 'd': case 'D':
680                                                 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
681                                                 break;
682                                                 
683                                         case 'w': case 'W':
684                                                 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
685                                                 break;
686                                                 
687                                         case 's': case 'S':
688                                                 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
689                                                 break;
690                                                 
691                                         case 'p': case 'P':
692                                                 cls.AddCategory (ParseUnicodeCategory (), c == 'P');    // ignore ecma
693                                                 break;
694
695                                         default:                // add escaped character
696                                                 goto char_recognized;
697                                         }
698
699                                         // if the pattern looks like [a-\s] ...
700                                         if (range)
701                                                 throw NewParseException ("character range cannot have category \\" + c);
702
703                                         last = -1;
704                                         continue;
705                                 }
706
707                         char_recognized:
708                                 if (range) {
709                                         // if 'range' is true, we know that 'last >= 0'
710                                         if (c < last)
711                                                 throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
712                                         cls.AddRange ((char)last, (char)c);
713                                         last = -1;
714                                         range = false;
715                                         continue;
716                                 }
717
718                                 cls.AddCharacter ((char)c);
719                                 last = c;
720                         }
721
722                         if (!closed)
723                                 throw NewParseException ("Unterminated [] set.");
724
725                         if (range)
726                                 cls.AddCharacter ('-');
727
728                         return cls;
729                 }
730
731                 private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
732                         int n, m;
733                         min = max = 0;
734
735                         /* check syntax */
736
737                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));
738                     
739                         if (pattern[ptr] == ',') {
740                                 n = -1;
741                         } else {
742                                 n = ParseNumber (10, 1, 0);
743                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
744                         }
745                         
746                         switch (pattern[ptr ++]) {
747                         case '}':
748                                 m = n;
749                                 break;
750                         case ',':
751                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
752                                 m = ParseNumber (10, 1, 0);
753                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
754                                 if (pattern[ptr ++] != '}')
755                                         return false;
756                                 break;
757                         default:
758                                 return false;
759                         }
760
761                         /* check bounds and ordering */
762
763                         if (n >= 0xffff || m >= 0xffff)
764                                 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
765                         if (m >= 0 && m < n)
766                                 throw NewParseException ("Illegal {x, y} with x > y.");
767
768                         /* assign min and max */
769                         
770                         min = n;
771                         if (m > 0)
772                                 max = m;
773                         else
774                                 max = 0xffff;
775
776                         return true;
777                 }
778
779                 private Category ParseUnicodeCategory () {
780                         if (pattern[ptr ++] != '{')
781                                 throw NewParseException ("Incomplete \\p{X} character escape.");
782
783                         string name = ParseName (pattern, ref ptr);
784                         if (name == null)
785                                 throw NewParseException ("Incomplete \\p{X} character escape.");
786
787                         Category cat = CategoryUtils.CategoryFromName (name);
788                         if (cat == Category.None)
789                                 throw NewParseException ("Unknown property '" + name + "'.");
790
791                         if (pattern[ptr ++] != '}')
792                                 throw NewParseException ("Incomplete \\p{X} character escape.");
793
794                         return cat;
795                 }
796
797                 private Expression ParseSpecial (RegexOptions options) {
798                         int p = ptr;
799                         bool ecma = IsECMAScript (options);
800                         Expression expr = null;
801                         
802                         switch (pattern[ptr ++]) {
803
804                         // categories
805
806                         case 'd':
807                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
808                                 break;
809                                 
810                         case 'w':
811                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
812                                 break;
813                                 
814                         case 's':
815                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
816                                 break;
817                                 
818                         case 'p':
819                                 // this is odd - ECMAScript isn't supposed to support Unicode,
820                                 // yet \p{..} compiles and runs under the MS implementation
821                                 // identically to canonical mode. That's why I'm ignoring the
822                                 // value of ecma here.
823                         
824                                 expr = new CharacterClass (ParseUnicodeCategory (), false);
825                                 break;
826                                 
827                         case 'D':
828                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
829                                 break;
830                                 
831                         case 'W':
832                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
833                                 break;
834                                 
835                         case 'S':
836                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
837                                 break;
838                                 
839                         case 'P':
840                                 expr = new CharacterClass (ParseUnicodeCategory (), true);
841                                 break;
842
843                         // positions
844
845                         case 'A': expr = new PositionAssertion (Position.StartOfString); break;
846                         case 'Z': expr = new PositionAssertion (Position.End); break;
847                         case 'z': expr = new PositionAssertion (Position.EndOfString); break;
848                         case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
849                         case 'b': expr = new PositionAssertion (Position.Boundary); break;
850                         case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
851                         
852                         // references
853
854                         case '1': case '2': case '3': case '4': case '5':
855                         case '6': case '7': case '8': case '9': {
856                                 ptr --;
857                                 int n = ParseNumber (10, 1, 0);
858                                 if (n < 0) {
859                                         ptr = p;
860                                         return null;
861                                 }
862
863                                 // FIXME test if number is within number of assigned groups
864                                 // this may present a problem for right-to-left matching
865
866                                 Reference reference = new Reference (IsIgnoreCase (options));
867                                 refs.Add (reference, n.ToString ());
868                                 expr = reference;
869                                 break;
870                         }
871
872                         case 'k': {
873                                 char delim = pattern[ptr ++];
874                                 if (delim == '<')
875                                         delim = '>';
876                                 else if (delim != '\'')
877                                         throw NewParseException ("Malformed \\k<...> named backreference.");
878
879                                 string name = ParseName ();
880                                 if (name == null || pattern[ptr] != delim)
881                                         throw NewParseException ("Malformed \\k<...> named backreference.");
882
883                                 ++ ptr;
884                                 Reference reference = new Reference (IsIgnoreCase (options));
885                                 refs.Add (reference, name);
886                                 expr = reference;
887                                 break;
888                         }
889
890                         default:
891                                 expr = null;
892                                 break;
893                         }
894
895                         if (expr == null)
896                                 ptr = p;
897
898                         return expr;
899                 }
900
901                 private int ParseEscape () {
902                         int p = ptr;
903                         int c;
904
905                         if (p >= pattern.Length)
906                                 throw new ArgumentException (
907                                                 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + 
908                                                                 "pattern.", pattern), pattern);
909                         
910                         switch (pattern[ptr ++]) {
911         
912                         // standard escapes (except \b)
913
914                         case 'a': return '\u0007';
915                         case 't': return '\u0009';
916                         case 'r': return '\u000d';
917                         case 'v': return '\u000b';
918                         case 'f': return '\u000c';
919                         case 'n': return '\u000a';
920                         case 'e': return '\u001b';
921                         case '\\': return '\\';
922
923                         // character codes
924
925                         case '0':
926                                 //
927                                 // Turns out that octal values can be specified
928                                 // without a leading zero.   But also the limit
929                                 // of three character should include this first
930                                 // one.  
931                                 //
932                                 ptr--;
933                                 int prevptr = ptr;
934                                 int result = ParseOctal (pattern, ref ptr);
935                                 if (result == -1 && prevptr == ptr)
936                                         return 0;
937
938                                 return result;
939
940                         case 'x':
941                                 c = ParseHex (pattern, ref ptr, 2);
942                                 if (c < 0)
943                                         throw NewParseException ("Insufficient hex digits");
944
945                                 return c;
946
947                         case 'u':
948                                 c = ParseHex (pattern, ref ptr, 4);
949                                 if (c < 0)
950                                         throw NewParseException ("Insufficient hex digits");
951                                 
952                                 return c;
953
954                         // control characters
955
956                         case 'c':
957                                 c = pattern[ptr ++];
958                                 if (c >= '@' && c <= '_')
959                                         return c - '@';
960                                 else
961                                         throw NewParseException ("Unrecognized control character.");
962
963                         // unknown escape
964
965                         default:
966                                 ptr = p;
967                                 return -1;
968                         }
969                 }
970
971                 private string ParseName () {
972                         return Parser.ParseName (pattern, ref ptr);
973                 }
974
975                 private static bool IsNameChar (char c) {
976                         UnicodeCategory cat = Char.GetUnicodeCategory (c);
977                         if (cat == UnicodeCategory.ModifierLetter)
978                                 return false;
979                         if (cat == UnicodeCategory.ConnectorPunctuation)
980                                 return true;
981                         return Char.IsLetterOrDigit (c);
982                 }
983         
984                 private int ParseNumber (int b, int min, int max) {
985                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);
986                 }
987
988                 private static int ParseDigit (char c, int b, int n) {
989                         switch (b) {
990                         case 8:
991                                 if (c >= '0' && c <= '7')
992                                         return c - '0';
993                                 else
994                                         return -1;
995                         case 10:
996                                 if (c >= '0' && c <= '9')
997                                         return c - '0';
998                                 else
999                                         return -1;
1000                         case 16:
1001                                 if (c >= '0' && c <= '9')
1002                                         return c - '0';
1003                                 else if (c >= 'a' && c <= 'f')
1004                                         return 10 + c - 'a';
1005                                 else if (c >= 'A' && c <= 'F')
1006                                         return 10 + c - 'A';
1007                                 else
1008                                         return -1;
1009                         default:
1010                                 return -1;
1011                         }
1012                 }
1013
1014                 private void ConsumeWhitespace (bool ignore) {
1015                         while (ptr < pattern.Length) {
1016                                 if (pattern[ptr] == '(') {
1017                                         if (ptr + 3 >= pattern.Length)
1018                                                 return;
1019
1020                                         if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1021                                                 return;
1022
1023                                         ptr += 3;
1024                                         while (ptr < pattern.Length && pattern[ptr ++] != ')')
1025                                                 /* ignore */ ;
1026                                 }
1027                                 else if (ignore && pattern[ptr] == '#') {
1028                                         while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1029                                                 /* ignore */ ;
1030                                 }
1031                                 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1032                                         while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1033                                                 ++ ptr;
1034                                 }
1035                                 else
1036                                         return;
1037                         }
1038                 }
1039
1040                 private string ParseString (string pattern) {
1041                         this.pattern = pattern;
1042                         this.ptr = 0;
1043
1044                         StringBuilder result = new StringBuilder (pattern.Length);
1045                         while (ptr < pattern.Length) {
1046                                 int c = pattern[ptr ++];
1047                                 if (c == '\\') {
1048                                         c = ParseEscape ();
1049
1050                                         if(c < 0) {
1051                                                 c = pattern[ptr ++];
1052                                                 if(c == 'b')
1053                                                         c = '\b';
1054                                         }
1055                                 }
1056                                 result.Append ((char) c);
1057                         }
1058
1059                         return result.ToString ();
1060                 }
1061
1062                 private void ResolveReferences () {
1063                         int gid = 1;
1064                         Hashtable dict = new Hashtable ();
1065
1066                         // number unnamed groups
1067
1068                         foreach (CapturingGroup group in caps) {
1069                                 if (group.Name == null) {
1070                                         dict.Add (gid.ToString (), group);
1071                                         group.Number = gid ++;
1072
1073                                         ++ num_groups;
1074                                 }
1075                         }
1076
1077                         // number named groups
1078
1079                         foreach (CapturingGroup group in caps) {
1080                                 if (group.Name != null) {
1081                                         if (!dict.Contains (group.Name)) {
1082                                                 dict.Add (group.Name, group);
1083                                                 group.Number = gid ++;
1084
1085                                                 ++ num_groups;
1086                                         }
1087                                         else {
1088                                                 CapturingGroup prev = (CapturingGroup)dict[group.Name];
1089                                                 group.Number = prev.Number;
1090                                         }
1091                                 }
1092                         }
1093
1094                         // resolve references
1095
1096                         foreach (Expression expr in refs.Keys) {
1097                                 string name = (string)refs[expr];
1098                                 if (!dict.Contains (name)) {
1099                                         throw NewParseException ("Reference to undefined group " +
1100                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +
1101                                                 name);
1102                                 }
1103
1104                                 CapturingGroup group = (CapturingGroup)dict[name];
1105                                 if (expr is Reference)
1106                                         ((Reference)expr).CapturingGroup = group;
1107                                 else if (expr is CaptureAssertion)
1108                                         ((CaptureAssertion)expr).CapturingGroup = group;
1109                                 else if (expr is BalancingGroup)
1110                                         ((BalancingGroup)expr).Balance = group;
1111                         }
1112                 }
1113
1114                 // flag helper functions
1115
1116                 private static bool IsIgnoreCase (RegexOptions options) {
1117                         return (options & RegexOptions.IgnoreCase) != 0;
1118                 }
1119
1120                 private static bool IsMultiline (RegexOptions options) {
1121                         return (options & RegexOptions.Multiline) != 0;
1122                 }
1123
1124                 private static bool IsExplicitCapture (RegexOptions options) {
1125                         return (options & RegexOptions.ExplicitCapture) != 0;
1126                 }
1127         
1128                 private static bool IsSingleline (RegexOptions options) {
1129                         return (options & RegexOptions.Singleline) != 0;
1130                 }
1131
1132                 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1133                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1134                 }
1135
1136                 private static bool IsECMAScript (RegexOptions options) {
1137                         return (options & RegexOptions.ECMAScript) != 0;
1138                 }
1139
1140                 // exception creation
1141
1142                 private ArgumentException NewParseException (string msg) {
1143                         msg = "parsing \"" + pattern + "\" - " + msg;
1144                         return new ArgumentException (msg, pattern);
1145                 }
1146
1147                 private string pattern;
1148                 private int ptr;
1149
1150                 private ArrayList caps;
1151                 private Hashtable refs;
1152                 private int num_groups;
1153         }
1154 }