* parser.cs: Allow creating a regular expression using {,n} as the
[mono.git] / mcs / class / System / System.Text.RegularExpressions / parser.cs
1 //
2 // assembly:    System
3 // namespace:   System.Text.RegularExpressions
4 // file:        parser.cs
5 //
6 // author:      Dan Lewis (dlewis@gmx.co.uk)
7 //              (c) 2002
8
9 using System;
10 using System.Collections;
11 using System.Globalization;
12
13 namespace System.Text.RegularExpressions.Syntax {
14
15         class Parser {
16                 public static int ParseDecimal (string str, ref int ptr) {
17                         return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
18                 }
19
20                 public static int ParseOctal (string str, ref int ptr) {
21                         return ParseNumber (str, ref ptr, 8, 1, 3);
22                 }
23
24                 public static int ParseHex (string str, ref int ptr, int digits) {
25                         return ParseNumber (str, ref ptr, 16, digits, digits);
26                 }
27
28                 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
29                         int p = ptr, n = 0, digits = 0, d;
30                         if (max < min)
31                                 max = Int32.MaxValue;
32
33                         while (digits < max && p < str.Length) {
34                                 d = ParseDigit (str[p ++], b, digits);
35                                 if (d < 0) {
36                                         -- p;
37                                         break;
38                                 }
39
40                                 n = n * b + d;
41                                 ++ digits;
42                         }
43
44                         if (digits < min)
45                                 return -1;
46
47                         ptr = p;
48                         return n;
49                 }
50
51                 public static string ParseName (string str, ref int ptr) {
52                         if (Char.IsDigit (str[ptr])) {
53                                 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
54                                 if (gid > 0)
55                                         return gid.ToString ();
56                                 
57                                 return null;
58                         }
59
60                         int start = ptr;
61                         for (;;) {
62                                 if (!IsNameChar (str[ptr]))
63                                         break;
64                                 ++ ptr;
65                         }
66
67                         if (ptr - start > 0)
68                                 return str.Substring (start, ptr - start);
69
70                         return null;
71                 }
72
73                 public static string Escape (string str) {
74                         string result = "";
75                         for (int i = 0; i < str.Length; ++ i) {
76                                 char c = str[i];
77                                 switch (c) {
78                                 case '\\': case '*': case '+': case '?': case '|':
79                                 case '{': case '[': case '(': case ')': case '^':
80                                 case '$': case '.': case '#': case ' ':
81                                         result += "\\" + c;
82                                         break;
83
84                                 case '\t': result += "\\t"; break;
85                                 case '\n': result += "\\n"; break;
86                                 case '\r': result += "\\r"; break;
87                                 case '\f': result += "\\f"; break;
88
89                                 default: result += c; break;
90                                 }
91                         }
92
93                         return result;
94                 }
95
96                 public static string Unescape (string str) {
97                         return new Parser ().ParseString (str);
98                 }
99
100                 // public instance
101
102                 public Parser () {
103                         this.caps = new ArrayList ();
104                         this.refs = new Hashtable ();
105                 }
106
107                 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
108                         this.pattern = pattern;
109                         this.ptr = 0;
110
111                         caps.Clear ();
112                         refs.Clear ();
113                         this.num_groups = 0;
114
115                         try {
116                                 RegularExpression re = new RegularExpression ();
117                                 ParseGroup (re, options, null);
118                                 ResolveReferences ();
119
120                                 re.GroupCount = num_groups;
121                                 
122                                 return re;
123                         }
124                         catch (IndexOutOfRangeException) {
125                                 throw NewParseException ("Unexpected end of pattern.");
126                         }
127                 }
128
129                 public IDictionary GetMapping () {
130                         Hashtable mapping = new Hashtable ();
131                         Hashtable numbers = new Hashtable ();
132                         int end = caps.Count;
133                         mapping.Add ("0", 0);
134                         for (int i = 0; i < end; i++) {
135                                 CapturingGroup group = (CapturingGroup) caps [i];
136                                 if (group.Name != null && !mapping.Contains (group.Name)) {
137                                         mapping.Add (group.Name, group.Number);
138                                         numbers.Add (group.Number, group.Number);
139                                 }
140                         }
141
142                         for (int i = 1; i < end; i++) {
143                                 if (numbers [i] == null)
144                                         mapping.Add (i.ToString (), i);
145                         }
146
147                         return mapping;
148                 }
149
150                 // private methods
151
152                 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
153                         bool is_top_level = group is RegularExpression;
154                 
155                         Alternation alternation = null;
156                         string literal = null;
157
158                         Group current = new Group ();
159                         Expression expr = null;
160                         bool closed = false;
161
162                         while (true) {
163                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
164                                 if (ptr >= pattern.Length)
165                                         break;
166                                 
167                                 // (1) Parse for Expressions
168                         
169                                 char ch = pattern[ptr ++];
170                                 
171                                 switch (ch) {
172                                 case '^': {
173                                         Position pos =
174                                                 IsMultiline (options) ? Position.StartOfLine : Position.Start;
175                                         expr = new PositionAssertion (pos);
176                                         break;
177                                 }
178
179                                 case '$': {
180                                         Position pos =
181                                                 IsMultiline (options) ? Position.EndOfLine : Position.End;
182                                         expr = new PositionAssertion (pos);
183                                         break;
184                                 }
185
186                                 case '.': {
187                                         Category cat =
188                                                 IsSingleline (options) ? Category.AnySingleline : Category.Any;
189                                         expr = new CharacterClass (cat, false);
190                                         break;
191                                 }
192
193                                 case '\\': {
194                                         int c = ParseEscape ();
195                                         if (c >= 0)
196                                                 ch = (char)c;
197                                         else {
198                                                 expr = ParseSpecial (options);
199
200                                                 if (expr == null)
201                                                         ch = pattern[ptr ++];           // default escape
202                                         }
203                                         break;
204                                 }
205
206                                 case '[': {
207                                         expr = ParseCharacterClass (options);
208                                         break;
209                                 }
210
211                                 case '(': {
212                                         bool ignore = IsIgnoreCase (options);
213                                         expr = ParseGroupingConstruct (ref options);
214                                         if (expr == null) {
215                                                 if (literal != null && IsIgnoreCase (options) != ignore) {
216                                                         current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
217                                                         literal = null;
218                                                 }
219
220                                                 continue;
221                                         }
222                                         break;
223                                 }
224
225                                 case ')': {
226                                         closed = true;
227                                         goto EndOfGroup;
228                                 }
229
230                                 case '|': {
231                                         if (literal != null) {
232                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
233                                                 literal = null;
234                                         }
235
236                                         if (assertion != null) {
237                                                 if (assertion.TrueExpression == null)
238                                                         assertion.TrueExpression = current;
239                                                 else if (assertion.FalseExpression == null)
240                                                         assertion.FalseExpression = current;
241                                                 else
242                                                         throw NewParseException ("Too many | in (?()|).");
243                                         }
244                                         else {
245                                                 if (alternation == null)
246                                                         alternation = new Alternation ();
247
248                                                 alternation.AddAlternative (current);
249                                         }
250
251                                         current = new Group ();
252                                         continue;
253                                 }
254
255                                 case '*': case '+': case '?': {
256                                         throw NewParseException ("Bad quantifier.");
257                                 }
258
259                                 default: 
260                                         break;          // literal character
261                                 }
262
263                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
264                                 
265                                 // (2) Check for Repetitions
266                                 
267                                 if (ptr < pattern.Length) {
268                                         char k = pattern[ptr];
269
270                                         if (k == '?' || k == '*' || k == '+' || k == '{') {
271                                                 ++ ptr;
272
273                                                 int min = 0, max = 0;
274                                                 bool lazy = false;
275
276                                                 switch (k) {
277                                                 case '?': min = 0; max = 1; break;
278                                                 case '*': min = 0; max = 0xffff; break;
279                                                 case '+': min = 1; max = 0xffff; break;
280                                                 case '{': ParseRepetitionBounds (out min, out max, options); break;
281                                                 }
282
283                                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
284                                                 if (ptr < pattern.Length && pattern[ptr] == '?') {
285                                                         ++ ptr;
286                                                         lazy = true;
287                                                 }
288
289                                                 Repetition repetition = new Repetition (min, max, lazy);
290
291                                                 if (expr == null)
292                                                         repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
293                                                 else
294                                                         repetition.Expression = expr;
295
296                                                 expr = repetition;
297                                         }
298                                 }
299
300                                 // (3) Append Expression and/or Literal
301
302                                 if (expr == null) {
303                                         if (literal == null)
304                                                 literal = "";
305                                         literal += ch;
306                                 }
307                                 else {
308                                         if (literal != null) {
309                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
310                                                 literal = null;
311                                         }
312
313                                         current.AppendExpression (expr);
314                                         expr = null;
315                                 }
316
317                                 if (is_top_level && ptr >= pattern.Length)
318                                         goto EndOfGroup;
319                         }
320
321                 EndOfGroup:
322                         if (is_top_level && closed)
323                                 throw NewParseException ("Too many )'s.");
324                         if (!is_top_level && !closed)
325                                 throw NewParseException ("Not enough )'s.");
326                                 
327                 
328                         // clean up literals and alternations
329
330                         if (literal != null)
331                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
332
333                         if (assertion != null) {
334                                 if (assertion.TrueExpression == null)
335                                         assertion.TrueExpression = current;
336                                 else
337                                         assertion.FalseExpression = current;
338                                 
339                                 group.AppendExpression (assertion);
340                         }
341                         else if (alternation != null) {
342                                 alternation.AddAlternative (current);
343                                 group.AppendExpression (alternation);
344                         }
345                         else
346                                 group.AppendExpression (current);
347                 }
348
349                 private Expression ParseGroupingConstruct (ref RegexOptions options) {
350                         if (pattern[ptr] != '?') {
351                                 Group group;
352
353                                 if (IsExplicitCapture (options))
354                                         group = new Group ();
355                                 else {
356                                         group = new CapturingGroup ();
357                                         caps.Add (group);
358                                 }
359
360                                 ParseGroup (group, options, null);
361                                 return group;
362                         }
363                         else
364                                 ++ ptr;
365
366                         switch (pattern[ptr]) {
367                         case ':': {                                             // non-capturing group
368                                 ++ ptr;
369                                 Group group = new Group ();
370                                 ParseGroup (group, options, null);
371
372                                 return group;
373                         }
374
375                         case '>': {                                             // non-backtracking group
376                                 ++ ptr;
377                                 Group group = new NonBacktrackingGroup ();
378                                 ParseGroup (group, options, null);
379                                 
380                                 return group;
381                         }
382
383                         case 'i': case 'm': case 'n':
384                         case 's': case 'x': case '-': {                         // options
385                                 RegexOptions o = options;
386                                 ParseOptions (ref o, false);
387                                 if (pattern[ptr] == '-') {
388                                         ++ ptr;
389                                         ParseOptions (ref o, true);
390                                 }
391
392                                 if (pattern[ptr] == ':') {                      // pass options to child group
393                                         ++ ptr;
394                                         Group group = new Group ();
395                                         ParseGroup (group, o, null);
396                                         return group;
397                                 }
398                                 else if (pattern[ptr] == ')') {                 // change options of enclosing group
399                                         ++ ptr;
400                                         options = o;
401                                         return null;
402                                 }
403                                 else
404                                         throw NewParseException ("Bad options");
405                         }
406
407                         case '<': case '=': case '!': {                         // lookahead/lookbehind
408                                 ExpressionAssertion asn = new ExpressionAssertion ();
409                                 if (!ParseAssertionType (asn))
410                                         goto case '\'';                         // it's a (?<name> ) construct
411
412                                 Group test = new Group ();
413                                 ParseGroup (test, options, null);
414
415                                 asn.TestExpression = test;
416                                 return asn;
417                         }
418
419                         case '\'': {                                            // named/balancing group
420                                 char delim;
421                                 if (pattern[ptr] == '<')
422                                         delim = '>';
423                                 else
424                                         delim = '\'';
425
426                                 ++ ptr;
427                                 string name = ParseName ();
428
429                                 if (pattern[ptr] == delim) {
430                                         // capturing group
431
432                                         if (name == null)
433                                                 throw NewParseException ("Bad group name.");
434
435                                         ++ ptr;
436                                         CapturingGroup cap = new CapturingGroup ();
437                                         cap.Name = name;
438                                         caps.Add (cap);
439                                         ParseGroup (cap, options, null);
440
441                                         return cap;
442                                 }
443                                 else if (pattern[ptr] == '-') {
444                                         // balancing group
445
446                                         ++ ptr;
447                                         string balance_name = ParseName ();
448                                         if (balance_name == null || pattern[ptr] != delim)
449                                                 throw NewParseException ("Bad balancing group name.");
450
451                                         ++ ptr;
452                                         BalancingGroup bal = new BalancingGroup ();
453                                         bal.Name = name;
454                                         
455                                         if(bal.IsNamed) {
456                                                 caps.Add (bal);
457                                         }
458
459                                         refs.Add (bal, balance_name);
460
461                                         ParseGroup (bal, options, null);
462
463                                         return bal;
464                                 }
465                                 else
466                                         throw NewParseException ("Bad group name.");
467                         }
468
469                         case '(': {                                             // expression/capture test
470                                 Assertion asn;
471                         
472                                 ++ ptr;
473                                 int p = ptr;
474                                 string name = ParseName ();
475                                 if (name == null || pattern[ptr] != ')') {      // expression test
476                                         // FIXME MS implementation doesn't seem to
477                                         // implement this version of (?(x) ...)
478
479                                         ptr = p;
480                                         ExpressionAssertion expr_asn = new ExpressionAssertion ();
481
482                                         if (pattern[ptr] == '?') {
483                                                 ++ ptr;
484                                                 if (!ParseAssertionType (expr_asn))
485                                                         throw NewParseException ("Bad conditional.");
486                                         }
487                                         else {
488                                                 expr_asn.Negate = false;
489                                                 expr_asn.Reverse = false;
490                                         }
491
492                                         Group test = new Group ();
493                                         ParseGroup (test, options, null);
494                                         expr_asn.TestExpression = test;
495                                         asn = expr_asn;
496                                 }
497                                 else {                                          // capture test
498                                         ++ ptr;
499                                         asn = new CaptureAssertion ();
500                                         refs.Add (asn, name);
501                                 }
502
503                                 Group group = new Group ();
504                                 ParseGroup (group, options, asn);
505                                 return group;
506                         }
507
508                         case '#': {                                             // comment
509                                 ++ ptr;
510                                 while (pattern[ptr ++] != ')') {
511                                         if (ptr >= pattern.Length)
512                                                 throw NewParseException ("Unterminated (?#...) comment.");
513                                 }
514                                 return null;
515                         }
516
517                         default:                                                // error
518                                 throw NewParseException ("Bad grouping construct.");
519                         }
520                 }
521
522                 private bool ParseAssertionType (ExpressionAssertion assertion) {
523                         if (pattern[ptr] == '<') {
524                                 switch (pattern[ptr + 1]) {
525                                 case '=':
526                                         assertion.Negate = false;
527                                         break;
528                                 case '!':
529                                         assertion.Negate = true;
530                                         break;
531                                 default:
532                                         return false;
533                                 }
534
535                                 assertion.Reverse = true;
536                                 ptr += 2;
537                         }
538                         else {
539                                 switch (pattern[ptr]) {
540                                 case '=':
541                                         assertion.Negate = false;
542                                         break;
543                                 case '!':
544                                         assertion.Negate = true;
545                                         break;
546                                 default:
547                                         return false;
548                                 }
549
550                                 assertion.Reverse = false;
551                                 ptr += 1;
552                         }
553
554                         return true;
555                 }
556
557                 private void ParseOptions (ref RegexOptions options, bool negate) {
558                         for (;;) {
559                                 switch (pattern[ptr]) {
560                                 case 'i':
561                                         if (negate)
562                                                 options &= ~RegexOptions.IgnoreCase;
563                                         else
564                                                 options |= RegexOptions.IgnoreCase;
565                                         break;
566
567                                 case 'm':
568                                         if (negate)
569                                                 options &= ~RegexOptions.Multiline;
570                                         else
571                                                 options |= RegexOptions.Multiline;
572                                         break;
573                                         
574                                 case 'n':
575                                         if (negate)
576                                                 options &= ~RegexOptions.ExplicitCapture;
577                                         else
578                                                 options |= RegexOptions.ExplicitCapture;
579                                         break;
580                                         
581                                 case 's':
582                                         if (negate)
583                                                 options &= ~RegexOptions.Singleline;
584                                         else
585                                                 options |= RegexOptions.Singleline;
586                                         break;
587                                         
588                                 case 'x':
589                                         if (negate)
590                                                 options &= ~RegexOptions.IgnorePatternWhitespace;
591                                         else
592                                                 options |= RegexOptions.IgnorePatternWhitespace;
593                                         break;
594
595                                 default:
596                                         return;
597                                 }
598
599                                 ++ ptr;
600                         }
601                 }
602
603                 private Expression ParseCharacterClass (RegexOptions options) {
604                         bool negate, ecma;
605                         if (pattern[ptr] == '^') {
606                                 negate = true;
607                                 ++ ptr;
608                         }
609                         else
610                                 negate = false;
611                         
612                         ecma = IsECMAScript (options);
613                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
614
615                         if (pattern[ptr] == ']') {
616                                 cls.AddCharacter (']');
617                                 ++ ptr;
618                         }
619
620                         int c = -1;
621                         int last = -1;
622                         bool range = false;
623                         bool closed = false;
624                         while (ptr < pattern.Length) {
625                                 c = pattern[ptr ++];
626
627                                 if (c == ']') {
628                                         closed = true;
629                                         break;
630                                 }
631                                 
632                                 if (c == '-') {
633                                         range = true;
634                                         continue;
635                                 }
636
637                                 if (c == '\\') {
638                                         c = ParseEscape ();
639                                         if (c < 0) {
640                                                 // didn't recognize escape
641
642                                                 c = pattern[ptr ++];
643                                                 switch (c) {
644                                                 case 'b': c = '\b'; break;
645
646                                                 case 'd':
647                                                         cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
648                                                         last = -1;
649                                                         continue;
650                                                         
651                                                 case 'w':
652                                                         cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
653                                                         last = -1;
654                                                         continue;
655                                                         
656                                                 case 's':
657                                                         cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
658                                                         last = -1;
659                                                         continue;
660                                                         
661                                                 case 'p':
662                                                         cls.AddCategory (ParseUnicodeCategory (), false);       // ignore ecma
663                                                         last = -1;
664                                                         continue;
665                                                         
666                                                 case 'D':
667                                                         cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
668                                                         last = -1;
669                                                         continue;
670                                                         
671                                                 case 'W':
672                                                         cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
673                                                         last = -1;
674                                                         continue;
675                                                         
676                                                 case 'S':
677                                                         cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
678                                                         last = -1;
679                                                         continue;
680                                                         
681                                                 case 'P':
682                                                         cls.AddCategory (ParseUnicodeCategory (), true);
683                                                         last = -1;
684                                                         continue;
685
686                                                 default: break;         // add escaped character
687                                                 }
688                                         }
689                                 }
690
691                                 if (range) {
692                                         if (c < last)
693                                                 throw NewParseException ("[x-y] range in reverse order.");
694
695                                         if (last >=0 )
696                                                 cls.AddRange ((char)last, (char)c);
697                                         else {
698                                                 cls.AddCharacter ((char)c);
699                                                 cls.AddCharacter ('-');
700                                         }
701
702                                         range = false;
703                                         last = -1;
704                                 }
705                                 else {
706                                         cls.AddCharacter ((char)c);
707                                         last = c;
708                                 }
709                         }
710
711                         if (!closed)
712                                 throw NewParseException ("Unterminated [] set.");
713
714                         if (range)
715                                 cls.AddCharacter ('-');
716
717                         return cls;
718                 }
719
720                 private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
721                         int n, m;
722
723                         /* check syntax */
724
725                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));
726                     
727                         if (pattern[ptr] == ',') {
728                                 n = -1;
729                         } else {
730                                 n = ParseNumber (10, 1, 0);
731                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
732                         }
733                         
734                         switch (pattern[ptr ++]) {
735                         case '}':
736                                 m = n;
737                                 break;
738                         case ',':
739                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
740                                 m = ParseNumber (10, 1, 0);
741                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
742                                 if (pattern[ptr ++] != '}')
743                                         throw NewParseException ("Illegal {x,y} - bad value of y.");
744                                 break;
745                         default:
746                                 throw NewParseException ("Illegal {x,y}");
747                         }
748
749                         /* check bounds and ordering */
750
751                         if (n >= 0xffff || m >= 0xffff)
752                                 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
753                         if (m >= 0 && m < n)
754                                 throw NewParseException ("Illegal {x, y} with x > y.");
755
756                         /* assign min and max */
757                         
758                         min = n;
759                         if (m > 0)
760                                 max = m;
761                         else
762                                 max = 0xffff;
763                 }
764
765                 private Category ParseUnicodeCategory () {
766                         if (pattern[ptr ++] != '{')
767                                 throw NewParseException ("Incomplete \\p{X} character escape.");
768
769                         string name = ParseName (pattern, ref ptr);
770                         if (name == null)
771                                 throw NewParseException ("Incomplete \\p{X} character escape.");
772
773                         Category cat = CategoryUtils.CategoryFromName (name);
774                         if (cat == Category.None)
775                                 throw NewParseException ("Unknown property '" + name + "'.");
776
777                         if (pattern[ptr ++] != '}')
778                                 throw NewParseException ("Incomplete \\p{X} character escape.");
779
780                         return cat;
781                 }
782
783                 private Expression ParseSpecial (RegexOptions options) {
784                         int p = ptr;
785                         bool ecma = IsECMAScript (options);
786                         Expression expr = null;
787                         
788                         switch (pattern[ptr ++]) {
789
790                         // categories
791
792                         case 'd':
793                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
794                                 break;
795                                 
796                         case 'w':
797                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
798                                 break;
799                                 
800                         case 's':
801                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
802                                 break;
803                                 
804                         case 'p':
805                                 // this is odd - ECMAScript isn't supposed to support Unicode,
806                                 // yet \p{..} compiles and runs under the MS implementation
807                                 // identically to canonical mode. That's why I'm ignoring the
808                                 // value of ecma here.
809                         
810                                 expr = new CharacterClass (ParseUnicodeCategory (), false);
811                                 break;
812                                 
813                         case 'D':
814                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
815                                 break;
816                                 
817                         case 'W':
818                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
819                                 break;
820                                 
821                         case 'S':
822                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
823                                 break;
824                                 
825                         case 'P':
826                                 expr = new CharacterClass (ParseUnicodeCategory (), true);
827                                 break;
828
829                         // positions
830
831                         case 'A': expr = new PositionAssertion (Position.StartOfString); break;
832                         case 'Z': expr = new PositionAssertion (Position.End); break;
833                         case 'z': expr = new PositionAssertion (Position.EndOfString); break;
834                         case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
835                         case 'b': expr = new PositionAssertion (Position.Boundary); break;
836                         case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
837                         
838                         // references
839
840                         case '1': case '2': case '3': case '4': case '5':
841                         case '6': case '7': case '8': case '9': {
842                                 ptr --;
843                                 int n = ParseNumber (10, 1, 0);
844                                 if (n < 0) {
845                                         ptr = p;
846                                         return null;
847                                 }
848
849                                 // FIXME test if number is within number of assigned groups
850                                 // this may present a problem for right-to-left matching
851
852                                 Reference reference = new Reference (IsIgnoreCase (options));
853                                 refs.Add (reference, n.ToString ());
854                                 expr = reference;
855                                 break;
856                         }
857
858                         case 'k': {
859                                 char delim = pattern[ptr ++];
860                                 if (delim == '<')
861                                         delim = '>';
862                                 else if (delim != '\'')
863                                         throw NewParseException ("Malformed \\k<...> named backreference.");
864
865                                 string name = ParseName ();
866                                 if (name == null || pattern[ptr] != delim)
867                                         throw NewParseException ("Malformed \\k<...> named backreference.");
868
869                                 ++ ptr;
870                                 Reference reference = new Reference (IsIgnoreCase (options));
871                                 refs.Add (reference, name);
872                                 expr = reference;
873                                 break;
874                         }
875
876                         default:
877                                 expr = null;
878                                 break;
879                         }
880
881                         if (expr == null)
882                                 ptr = p;
883
884                         return expr;
885                 }
886
887                 private int ParseEscape () {
888                         int p = ptr;
889                         int c;
890
891                         if (p >= pattern.Length)
892                                 throw new ArgumentException (
893                                                 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + 
894                                                                 "pattern.", pattern), pattern);
895                         
896                         switch (pattern[ptr ++]) {
897         
898                         // standard escapes (except \b)
899
900                         case 'a': return '\u0007';
901                         case 't': return '\u0009';
902                         case 'r': return '\u000d';
903                         case 'v': return '\u000b';
904                         case 'f': return '\u000c';
905                         case 'n': return '\u000a';
906                         case 'e': return '\u001b';
907                         case '\\': return '\\';
908
909                         // character codes
910
911                         case '0':
912                                 int prevptr = ptr;
913                                 int result = ParseOctal (pattern, ref ptr);
914                                 if (result == -1 && prevptr == ptr)
915                                         return 0;
916
917                                 return result;
918
919                         case 'x':
920                                 c = ParseHex (pattern, ref ptr, 2);
921                                 if (c < 0)
922                                         throw NewParseException ("Insufficient hex digits");
923
924                                 return c;
925
926                         case 'u':
927                                 c = ParseHex (pattern, ref ptr, 4);
928                                 if (c < 0)
929                                         throw NewParseException ("Insufficient hex digits");
930                                 
931                                 return c;
932
933                         // control characters
934
935                         case 'c':
936                                 c = pattern[p ++];
937                                 if (c >= 'A' && c <= 'Z')
938                                         return c - 'A';
939                                 else if (c >= '@' && c <= '_')
940                                         return c - '@';
941                                 else
942                                         throw NewParseException ("Unrecognized control character.");
943
944                         // unknown escape
945
946                         default:
947                                 ptr = p;
948                                 return -1;
949                         }
950                 }
951
952                 private string ParseName () {
953                         return Parser.ParseName (pattern, ref ptr);
954                 }
955
956                 private static bool IsNameChar (char c) {
957                         UnicodeCategory cat = Char.GetUnicodeCategory (c);
958                         if (cat == UnicodeCategory.ModifierLetter)
959                                 return false;
960                         if (cat == UnicodeCategory.ConnectorPunctuation)
961                                 return true;
962                         return Char.IsLetterOrDigit (c);
963                 }
964         
965                 private int ParseNumber (int b, int min, int max) {
966                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);
967                 }
968
969                 private int ParseDecimal () {
970                         return Parser.ParseDecimal (pattern, ref ptr);
971                 }
972
973                 private static int ParseDigit (char c, int b, int n) {
974                         switch (b) {
975                         case 8:
976                                 if (c >= '0' && c <= '7')
977                                         return c - '0';
978                                 else
979                                         return -1;
980                         case 10:
981                                 if (c >= '0' && c <= '9')
982                                         return c - '0';
983                                 else
984                                         return -1;
985                         case 16:
986                                 if (c >= '0' && c <= '9')
987                                         return c - '0';
988                                 else if (c >= 'a' && c <= 'f')
989                                         return 10 + c - 'a';
990                                 else if (c >= 'A' && c <= 'F')
991                                         return 10 + c - 'A';
992                                 else
993                                         return -1;
994                         default:
995                                 return -1;
996                         }
997                 }
998
999                 private void ConsumeWhitespace (bool ignore) {
1000                         while (true) {
1001                                 if (ptr >= pattern.Length)
1002                                         break;
1003                         
1004                                 if (pattern[ptr] == '(') {
1005                                         if (ptr + 3 >= pattern.Length)
1006                                                 return;
1007
1008                                         if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1009                                                 return;
1010
1011                                         ptr += 3;
1012                                         while (pattern[ptr ++] != ')')
1013                                                 /* ignore */ ;
1014                                 }
1015                                 else if (ignore && pattern[ptr] == '#') {
1016                                         while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1017                                                 /* ignore */ ;
1018                                 }
1019                                 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1020                                         while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1021                                                 ++ ptr;
1022                                 }
1023                                 else
1024                                         return;
1025                         }
1026                 }
1027
1028                 private string ParseString (string pattern) {
1029                         this.pattern = pattern;
1030                         this.ptr = 0;
1031
1032                         string result = "";
1033                         while (ptr < pattern.Length) {
1034                                 int c = pattern[ptr];
1035                                 if (c == '\\')
1036                                         c = ParseEscape ();
1037                                 ptr ++; 
1038                                 result += (char)c;
1039                         }
1040
1041                         return result;
1042                 }
1043
1044                 private void ResolveReferences () {
1045                         int gid = 1;
1046                         Hashtable dict = new Hashtable ();
1047
1048                         // number unnamed groups
1049
1050                         foreach (CapturingGroup group in caps) {
1051                                 if (group.Name == null) {
1052                                         dict.Add (gid.ToString (), group);
1053                                         group.Number = gid ++;
1054
1055                                         ++ num_groups;
1056                                 }
1057                         }
1058
1059                         // number named groups
1060
1061                         foreach (CapturingGroup group in caps) {
1062                                 if (group.Name != null) {
1063                                         if (!dict.Contains (group.Name)) {
1064                                                 dict.Add (group.Name, group);
1065                                                 group.Number = gid ++;
1066
1067                                                 ++ num_groups;
1068                                         }
1069                                         else {
1070                                                 CapturingGroup prev = (CapturingGroup)dict[group.Name];
1071                                                 group.Number = prev.Number;
1072                                         }
1073                                 }
1074                         }
1075
1076                         // resolve references
1077
1078                         foreach (Expression expr in refs.Keys) {
1079                                 string name = (string)refs[expr];
1080                                 if (!dict.Contains (name)) {
1081                                         throw NewParseException ("Reference to undefined group " +
1082                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +
1083                                                 name);
1084                                 }
1085
1086                                 CapturingGroup group = (CapturingGroup)dict[name];
1087                                 if (expr is Reference)
1088                                         ((Reference)expr).CapturingGroup = group;
1089                                 else if (expr is CaptureAssertion)
1090                                         ((CaptureAssertion)expr).CapturingGroup = group;
1091                                 else if (expr is BalancingGroup)
1092                                         ((BalancingGroup)expr).Balance = group;
1093                         }
1094                 }
1095
1096                 // flag helper functions
1097
1098                 private static bool IsIgnoreCase (RegexOptions options) {
1099                         return (options & RegexOptions.IgnoreCase) != 0;
1100                 }
1101
1102                 private static bool IsMultiline (RegexOptions options) {
1103                         return (options & RegexOptions.Multiline) != 0;
1104                 }
1105
1106                 private static bool IsExplicitCapture (RegexOptions options) {
1107                         return (options & RegexOptions.ExplicitCapture) != 0;
1108                 }
1109         
1110                 private static bool IsSingleline (RegexOptions options) {
1111                         return (options & RegexOptions.Singleline) != 0;
1112                 }
1113
1114                 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1115                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1116                 }
1117
1118                 private static bool IsRightToLeft (RegexOptions options) {
1119                         return (options & RegexOptions.RightToLeft) != 0;
1120                 }
1121
1122                 private static bool IsECMAScript (RegexOptions options) {
1123                         return (options & RegexOptions.ECMAScript) != 0;
1124                 }
1125
1126                 // exception creation
1127
1128                 private ArgumentException NewParseException (string msg) {
1129                         msg = "parsing \"" + pattern + "\" - " + msg;
1130                         return new ArgumentException (msg, pattern);
1131                 }
1132
1133                 private string pattern;
1134                 private int ptr;
1135
1136                 private ArrayList caps;
1137                 private Hashtable refs;
1138                 private int num_groups;
1139         }
1140 }