2005-01-31 Zoltan Varga <vargaz@freemail.hu>
[mono.git] / mcs / class / System / System.Text.RegularExpressions / parser.cs
1 //
2 // assembly:    System
3 // namespace:   System.Text.RegularExpressions
4 // file:        parser.cs
5 //
6 // author:      Dan Lewis (dlewis@gmx.co.uk)
7 //              (c) 2002
8
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
17 // 
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
20 // 
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 //
29
30 using System;
31 using System.Collections;
32 using System.Globalization;
33
34 namespace System.Text.RegularExpressions.Syntax {
35
36         class Parser {
37                 public static int ParseDecimal (string str, ref int ptr) {
38                         return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
39                 }
40
41                 public static int ParseOctal (string str, ref int ptr) {
42                         return ParseNumber (str, ref ptr, 8, 1, 3);
43                 }
44
45                 public static int ParseHex (string str, ref int ptr, int digits) {
46                         return ParseNumber (str, ref ptr, 16, digits, digits);
47                 }
48
49                 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50                         int p = ptr, n = 0, digits = 0, d;
51                         if (max < min)
52                                 max = Int32.MaxValue;
53
54                         while (digits < max && p < str.Length) {
55                                 d = ParseDigit (str[p ++], b, digits);
56                                 if (d < 0) {
57                                         -- p;
58                                         break;
59                                 }
60
61                                 n = n * b + d;
62                                 ++ digits;
63                         }
64
65                         if (digits < min)
66                                 return -1;
67
68                         ptr = p;
69                         return n;
70                 }
71
72                 public static string ParseName (string str, ref int ptr) {
73                         if (Char.IsDigit (str[ptr])) {
74                                 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
75                                 if (gid > 0)
76                                         return gid.ToString ();
77                                 
78                                 return null;
79                         }
80
81                         int start = ptr;
82                         for (;;) {
83                                 if (!IsNameChar (str[ptr]))
84                                         break;
85                                 ++ ptr;
86                         }
87
88                         if (ptr - start > 0)
89                                 return str.Substring (start, ptr - start);
90
91                         return null;
92                 }
93
94                 public static string Escape (string str) {
95                         string result = "";
96                         for (int i = 0; i < str.Length; ++ i) {
97                                 char c = str[i];
98                                 switch (c) {
99                                 case '\\': case '*': case '+': case '?': case '|':
100                                 case '{': case '[': case '(': case ')': case '^':
101                                 case '$': case '.': case '#': case ' ':
102                                         result += "\\" + c;
103                                         break;
104
105                                 case '\t': result += "\\t"; break;
106                                 case '\n': result += "\\n"; break;
107                                 case '\r': result += "\\r"; break;
108                                 case '\f': result += "\\f"; break;
109
110                                 default: result += c; break;
111                                 }
112                         }
113
114                         return result;
115                 }
116
117                 public static string Unescape (string str) {
118                         return new Parser ().ParseString (str);
119                 }
120
121                 // public instance
122
123                 public Parser () {
124                         this.caps = new ArrayList ();
125                         this.refs = new Hashtable ();
126                 }
127
128                 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
129                         this.pattern = pattern;
130                         this.ptr = 0;
131
132                         caps.Clear ();
133                         refs.Clear ();
134                         this.num_groups = 0;
135
136                         try {
137                                 RegularExpression re = new RegularExpression ();
138                                 ParseGroup (re, options, null);
139                                 ResolveReferences ();
140
141                                 re.GroupCount = num_groups;
142                                 
143                                 return re;
144                         }
145                         catch (IndexOutOfRangeException) {
146                                 throw NewParseException ("Unexpected end of pattern.");
147                         }
148                 }
149
150                 public IDictionary GetMapping () {
151                         Hashtable mapping = new Hashtable ();
152                         Hashtable numbers = new Hashtable ();
153                         int end = caps.Count;
154                         mapping.Add ("0", 0);
155                         for (int i = 0; i < end; i++) {
156                                 CapturingGroup group = (CapturingGroup) caps [i];
157                                 if (group.Name != null && !mapping.Contains (group.Name)) {
158                                         mapping.Add (group.Name, group.Number);
159                                         numbers.Add (group.Number, group.Number);
160                                 }
161                         }
162
163                         for (int i = 1; i < end; i++) {
164                                 if (numbers [i] == null)
165                                         mapping.Add (i.ToString (), i);
166                         }
167
168                         return mapping;
169                 }
170
171                 // private methods
172
173                 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
174                         bool is_top_level = group is RegularExpression;
175                 
176                         Alternation alternation = null;
177                         string literal = null;
178
179                         Group current = new Group ();
180                         Expression expr = null;
181                         bool closed = false;
182
183                         while (true) {
184                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
185                                 if (ptr >= pattern.Length)
186                                         break;
187                                 
188                                 // (1) Parse for Expressions
189                         
190                                 char ch = pattern[ptr ++];
191                                 
192                                 switch (ch) {
193                                 case '^': {
194                                         Position pos =
195                                                 IsMultiline (options) ? Position.StartOfLine : Position.Start;
196                                         expr = new PositionAssertion (pos);
197                                         break;
198                                 }
199
200                                 case '$': {
201                                         Position pos =
202                                                 IsMultiline (options) ? Position.EndOfLine : Position.End;
203                                         expr = new PositionAssertion (pos);
204                                         break;
205                                 }
206
207                                 case '.': {
208                                         Category cat =
209                                                 IsSingleline (options) ? Category.AnySingleline : Category.Any;
210                                         expr = new CharacterClass (cat, false);
211                                         break;
212                                 }
213
214                                 case '\\': {
215                                         int c = ParseEscape ();
216                                         if (c >= 0)
217                                                 ch = (char)c;
218                                         else {
219                                                 expr = ParseSpecial (options);
220
221                                                 if (expr == null)
222                                                         ch = pattern[ptr ++];           // default escape
223                                         }
224                                         break;
225                                 }
226
227                                 case '[': {
228                                         expr = ParseCharacterClass (options);
229                                         break;
230                                 }
231
232                                 case '(': {
233                                         bool ignore = IsIgnoreCase (options);
234                                         expr = ParseGroupingConstruct (ref options);
235                                         if (expr == null) {
236                                                 if (literal != null && IsIgnoreCase (options) != ignore) {
237                                                         current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
238                                                         literal = null;
239                                                 }
240
241                                                 continue;
242                                         }
243                                         break;
244                                 }
245
246                                 case ')': {
247                                         closed = true;
248                                         goto EndOfGroup;
249                                 }
250
251                                 case '|': {
252                                         if (literal != null) {
253                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
254                                                 literal = null;
255                                         }
256
257                                         if (assertion != null) {
258                                                 if (assertion.TrueExpression == null)
259                                                         assertion.TrueExpression = current;
260                                                 else if (assertion.FalseExpression == null)
261                                                         assertion.FalseExpression = current;
262                                                 else
263                                                         throw NewParseException ("Too many | in (?()|).");
264                                         }
265                                         else {
266                                                 if (alternation == null)
267                                                         alternation = new Alternation ();
268
269                                                 alternation.AddAlternative (current);
270                                         }
271
272                                         current = new Group ();
273                                         continue;
274                                 }
275
276                                 case '*': case '+': case '?': {
277                                         throw NewParseException ("Bad quantifier.");
278                                 }
279
280                                 default: 
281                                         break;          // literal character
282                                 }
283
284                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
285                                 
286                                 // (2) Check for Repetitions
287                                 
288                                 if (ptr < pattern.Length) {
289                                         char k = pattern[ptr];
290
291                                         if (k == '?' || k == '*' || k == '+' || k == '{') {
292                                                 ++ ptr;
293
294                                                 int min = 0, max = 0;
295                                                 bool lazy = false;
296
297                                                 switch (k) {
298                                                 case '?': min = 0; max = 1; break;
299                                                 case '*': min = 0; max = 0xffff; break;
300                                                 case '+': min = 1; max = 0xffff; break;
301                                                 case '{': ParseRepetitionBounds (out min, out max, options); break;
302                                                 }
303
304                                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
305                                                 if (ptr < pattern.Length && pattern[ptr] == '?') {
306                                                         ++ ptr;
307                                                         lazy = true;
308                                                 }
309
310                                                 Repetition repetition = new Repetition (min, max, lazy);
311
312                                                 if (expr == null)
313                                                         repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
314                                                 else
315                                                         repetition.Expression = expr;
316
317                                                 expr = repetition;
318                                         }
319                                 }
320
321                                 // (3) Append Expression and/or Literal
322
323                                 if (expr == null) {
324                                         if (literal == null)
325                                                 literal = "";
326                                         literal += ch;
327                                 }
328                                 else {
329                                         if (literal != null) {
330                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
331                                                 literal = null;
332                                         }
333
334                                         current.AppendExpression (expr);
335                                         expr = null;
336                                 }
337
338                                 if (is_top_level && ptr >= pattern.Length)
339                                         goto EndOfGroup;
340                         }
341
342                 EndOfGroup:
343                         if (is_top_level && closed)
344                                 throw NewParseException ("Too many )'s.");
345                         if (!is_top_level && !closed)
346                                 throw NewParseException ("Not enough )'s.");
347                                 
348                 
349                         // clean up literals and alternations
350
351                         if (literal != null)
352                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
353
354                         if (assertion != null) {
355                                 if (assertion.TrueExpression == null)
356                                         assertion.TrueExpression = current;
357                                 else
358                                         assertion.FalseExpression = current;
359                                 
360                                 group.AppendExpression (assertion);
361                         }
362                         else if (alternation != null) {
363                                 alternation.AddAlternative (current);
364                                 group.AppendExpression (alternation);
365                         }
366                         else
367                                 group.AppendExpression (current);
368                 }
369
370                 private Expression ParseGroupingConstruct (ref RegexOptions options) {
371                         if (pattern[ptr] != '?') {
372                                 Group group;
373
374                                 if (IsExplicitCapture (options))
375                                         group = new Group ();
376                                 else {
377                                         group = new CapturingGroup ();
378                                         caps.Add (group);
379                                 }
380
381                                 ParseGroup (group, options, null);
382                                 return group;
383                         }
384                         else
385                                 ++ ptr;
386
387                         switch (pattern[ptr]) {
388                         case ':': {                                             // non-capturing group
389                                 ++ ptr;
390                                 Group group = new Group ();
391                                 ParseGroup (group, options, null);
392
393                                 return group;
394                         }
395
396                         case '>': {                                             // non-backtracking group
397                                 ++ ptr;
398                                 Group group = new NonBacktrackingGroup ();
399                                 ParseGroup (group, options, null);
400                                 
401                                 return group;
402                         }
403
404                         case 'i': case 'm': case 'n':
405                         case 's': case 'x': case '-': {                         // options
406                                 RegexOptions o = options;
407                                 ParseOptions (ref o, false);
408                                 if (pattern[ptr] == '-') {
409                                         ++ ptr;
410                                         ParseOptions (ref o, true);
411                                 }
412
413                                 if (pattern[ptr] == ':') {                      // pass options to child group
414                                         ++ ptr;
415                                         Group group = new Group ();
416                                         ParseGroup (group, o, null);
417                                         return group;
418                                 }
419                                 else if (pattern[ptr] == ')') {                 // change options of enclosing group
420                                         ++ ptr;
421                                         options = o;
422                                         return null;
423                                 }
424                                 else
425                                         throw NewParseException ("Bad options");
426                         }
427
428                         case '<': case '=': case '!': {                         // lookahead/lookbehind
429                                 ExpressionAssertion asn = new ExpressionAssertion ();
430                                 if (!ParseAssertionType (asn))
431                                         goto case '\'';                         // it's a (?<name> ) construct
432
433                                 Group test = new Group ();
434                                 ParseGroup (test, options, null);
435
436                                 asn.TestExpression = test;
437                                 return asn;
438                         }
439
440                         case '\'': {                                            // named/balancing group
441                                 char delim;
442                                 if (pattern[ptr] == '<')
443                                         delim = '>';
444                                 else
445                                         delim = '\'';
446
447                                 ++ ptr;
448                                 string name = ParseName ();
449
450                                 if (pattern[ptr] == delim) {
451                                         // capturing group
452
453                                         if (name == null)
454                                                 throw NewParseException ("Bad group name.");
455
456                                         ++ ptr;
457                                         CapturingGroup cap = new CapturingGroup ();
458                                         cap.Name = name;
459                                         caps.Add (cap);
460                                         ParseGroup (cap, options, null);
461
462                                         return cap;
463                                 }
464                                 else if (pattern[ptr] == '-') {
465                                         // balancing group
466
467                                         ++ ptr;
468                                         string balance_name = ParseName ();
469                                         if (balance_name == null || pattern[ptr] != delim)
470                                                 throw NewParseException ("Bad balancing group name.");
471
472                                         ++ ptr;
473                                         BalancingGroup bal = new BalancingGroup ();
474                                         bal.Name = name;
475                                         
476                                         if(bal.IsNamed) {
477                                                 caps.Add (bal);
478                                         }
479
480                                         refs.Add (bal, balance_name);
481
482                                         ParseGroup (bal, options, null);
483
484                                         return bal;
485                                 }
486                                 else
487                                         throw NewParseException ("Bad group name.");
488                         }
489
490                         case '(': {                                             // expression/capture test
491                                 Assertion asn;
492                         
493                                 ++ ptr;
494                                 int p = ptr;
495                                 string name = ParseName ();
496                                 if (name == null || pattern[ptr] != ')') {      // expression test
497                                         // FIXME MS implementation doesn't seem to
498                                         // implement this version of (?(x) ...)
499
500                                         ptr = p;
501                                         ExpressionAssertion expr_asn = new ExpressionAssertion ();
502
503                                         if (pattern[ptr] == '?') {
504                                                 ++ ptr;
505                                                 if (!ParseAssertionType (expr_asn))
506                                                         throw NewParseException ("Bad conditional.");
507                                         }
508                                         else {
509                                                 expr_asn.Negate = false;
510                                                 expr_asn.Reverse = false;
511                                         }
512
513                                         Group test = new Group ();
514                                         ParseGroup (test, options, null);
515                                         expr_asn.TestExpression = test;
516                                         asn = expr_asn;
517                                 }
518                                 else {                                          // capture test
519                                         ++ ptr;
520                                         asn = new CaptureAssertion ();
521                                         refs.Add (asn, name);
522                                 }
523
524                                 Group group = new Group ();
525                                 ParseGroup (group, options, asn);
526                                 return group;
527                         }
528
529                         case '#': {                                             // comment
530                                 ++ ptr;
531                                 while (pattern[ptr ++] != ')') {
532                                         if (ptr >= pattern.Length)
533                                                 throw NewParseException ("Unterminated (?#...) comment.");
534                                 }
535                                 return null;
536                         }
537
538                         default:                                                // error
539                                 throw NewParseException ("Bad grouping construct.");
540                         }
541                 }
542
543                 private bool ParseAssertionType (ExpressionAssertion assertion) {
544                         if (pattern[ptr] == '<') {
545                                 switch (pattern[ptr + 1]) {
546                                 case '=':
547                                         assertion.Negate = false;
548                                         break;
549                                 case '!':
550                                         assertion.Negate = true;
551                                         break;
552                                 default:
553                                         return false;
554                                 }
555
556                                 assertion.Reverse = true;
557                                 ptr += 2;
558                         }
559                         else {
560                                 switch (pattern[ptr]) {
561                                 case '=':
562                                         assertion.Negate = false;
563                                         break;
564                                 case '!':
565                                         assertion.Negate = true;
566                                         break;
567                                 default:
568                                         return false;
569                                 }
570
571                                 assertion.Reverse = false;
572                                 ptr += 1;
573                         }
574
575                         return true;
576                 }
577
578                 private void ParseOptions (ref RegexOptions options, bool negate) {
579                         for (;;) {
580                                 switch (pattern[ptr]) {
581                                 case 'i':
582                                         if (negate)
583                                                 options &= ~RegexOptions.IgnoreCase;
584                                         else
585                                                 options |= RegexOptions.IgnoreCase;
586                                         break;
587
588                                 case 'm':
589                                         if (negate)
590                                                 options &= ~RegexOptions.Multiline;
591                                         else
592                                                 options |= RegexOptions.Multiline;
593                                         break;
594                                         
595                                 case 'n':
596                                         if (negate)
597                                                 options &= ~RegexOptions.ExplicitCapture;
598                                         else
599                                                 options |= RegexOptions.ExplicitCapture;
600                                         break;
601                                         
602                                 case 's':
603                                         if (negate)
604                                                 options &= ~RegexOptions.Singleline;
605                                         else
606                                                 options |= RegexOptions.Singleline;
607                                         break;
608                                         
609                                 case 'x':
610                                         if (negate)
611                                                 options &= ~RegexOptions.IgnorePatternWhitespace;
612                                         else
613                                                 options |= RegexOptions.IgnorePatternWhitespace;
614                                         break;
615
616                                 default:
617                                         return;
618                                 }
619
620                                 ++ ptr;
621                         }
622                 }
623
624                 private Expression ParseCharacterClass (RegexOptions options) {
625                         bool negate, ecma;
626                         if (pattern[ptr] == '^') {
627                                 negate = true;
628                                 ++ ptr;
629                         }
630                         else
631                                 negate = false;
632                         
633                         ecma = IsECMAScript (options);
634                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
635
636                         if (pattern[ptr] == ']') {
637                                 cls.AddCharacter (']');
638                                 ++ ptr;
639                         }
640
641                         int c = -1;
642                         int last = -1;
643                         bool range = false;
644                         bool closed = false;
645                         while (ptr < pattern.Length) {
646                                 c = pattern[ptr ++];
647
648                                 if (c == ']') {
649                                         closed = true;
650                                         break;
651                                 }
652                                 
653                                 if (c == '-') {
654                                         range = true;
655                                         continue;
656                                 }
657
658                                 if (c == '\\') {
659                                         c = ParseEscape ();
660                                         if (c < 0) {
661                                                 // didn't recognize escape
662
663                                                 c = pattern[ptr ++];
664                                                 switch (c) {
665                                                 case 'b': c = '\b'; break;
666
667                                                 case 'd':
668                                                         cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
669                                                         last = -1;
670                                                         continue;
671                                                         
672                                                 case 'w':
673                                                         cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
674                                                         last = -1;
675                                                         continue;
676                                                         
677                                                 case 's':
678                                                         cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
679                                                         last = -1;
680                                                         continue;
681                                                         
682                                                 case 'p':
683                                                         cls.AddCategory (ParseUnicodeCategory (), false);       // ignore ecma
684                                                         last = -1;
685                                                         continue;
686                                                         
687                                                 case 'D':
688                                                         cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
689                                                         last = -1;
690                                                         continue;
691                                                         
692                                                 case 'W':
693                                                         cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
694                                                         last = -1;
695                                                         continue;
696                                                         
697                                                 case 'S':
698                                                         cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
699                                                         last = -1;
700                                                         continue;
701                                                         
702                                                 case 'P':
703                                                         cls.AddCategory (ParseUnicodeCategory (), true);
704                                                         last = -1;
705                                                         continue;
706
707                                                 default: break;         // add escaped character
708                                                 }
709                                         }
710                                 }
711
712                                 if (range) {
713                                         if (c < last)
714                                                 throw NewParseException ("[x-y] range in reverse order.");
715
716                                         if (last >=0 )
717                                                 cls.AddRange ((char)last, (char)c);
718                                         else {
719                                                 cls.AddCharacter ((char)c);
720                                                 cls.AddCharacter ('-');
721                                         }
722
723                                         range = false;
724                                         last = -1;
725                                 }
726                                 else {
727                                         cls.AddCharacter ((char)c);
728                                         last = c;
729                                 }
730                         }
731
732                         if (!closed)
733                                 throw NewParseException ("Unterminated [] set.");
734
735                         if (range)
736                                 cls.AddCharacter ('-');
737
738                         return cls;
739                 }
740
741                 private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
742                         int n, m;
743
744                         /* check syntax */
745
746                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));
747                     
748                         if (pattern[ptr] == ',') {
749                                 n = -1;
750                         } else {
751                                 n = ParseNumber (10, 1, 0);
752                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
753                         }
754                         
755                         switch (pattern[ptr ++]) {
756                         case '}':
757                                 m = n;
758                                 break;
759                         case ',':
760                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
761                                 m = ParseNumber (10, 1, 0);
762                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
763                                 if (pattern[ptr ++] != '}')
764                                         throw NewParseException ("Illegal {x,y} - bad value of y.");
765                                 break;
766                         default:
767                                 throw NewParseException ("Illegal {x,y}");
768                         }
769
770                         /* check bounds and ordering */
771
772                         if (n >= 0xffff || m >= 0xffff)
773                                 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
774                         if (m >= 0 && m < n)
775                                 throw NewParseException ("Illegal {x, y} with x > y.");
776
777                         /* assign min and max */
778                         
779                         min = n;
780                         if (m > 0)
781                                 max = m;
782                         else
783                                 max = 0xffff;
784                 }
785
786                 private Category ParseUnicodeCategory () {
787                         if (pattern[ptr ++] != '{')
788                                 throw NewParseException ("Incomplete \\p{X} character escape.");
789
790                         string name = ParseName (pattern, ref ptr);
791                         if (name == null)
792                                 throw NewParseException ("Incomplete \\p{X} character escape.");
793
794                         Category cat = CategoryUtils.CategoryFromName (name);
795                         if (cat == Category.None)
796                                 throw NewParseException ("Unknown property '" + name + "'.");
797
798                         if (pattern[ptr ++] != '}')
799                                 throw NewParseException ("Incomplete \\p{X} character escape.");
800
801                         return cat;
802                 }
803
804                 private Expression ParseSpecial (RegexOptions options) {
805                         int p = ptr;
806                         bool ecma = IsECMAScript (options);
807                         Expression expr = null;
808                         
809                         switch (pattern[ptr ++]) {
810
811                         // categories
812
813                         case 'd':
814                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
815                                 break;
816                                 
817                         case 'w':
818                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
819                                 break;
820                                 
821                         case 's':
822                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
823                                 break;
824                                 
825                         case 'p':
826                                 // this is odd - ECMAScript isn't supposed to support Unicode,
827                                 // yet \p{..} compiles and runs under the MS implementation
828                                 // identically to canonical mode. That's why I'm ignoring the
829                                 // value of ecma here.
830                         
831                                 expr = new CharacterClass (ParseUnicodeCategory (), false);
832                                 break;
833                                 
834                         case 'D':
835                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
836                                 break;
837                                 
838                         case 'W':
839                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
840                                 break;
841                                 
842                         case 'S':
843                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
844                                 break;
845                                 
846                         case 'P':
847                                 expr = new CharacterClass (ParseUnicodeCategory (), true);
848                                 break;
849
850                         // positions
851
852                         case 'A': expr = new PositionAssertion (Position.StartOfString); break;
853                         case 'Z': expr = new PositionAssertion (Position.End); break;
854                         case 'z': expr = new PositionAssertion (Position.EndOfString); break;
855                         case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
856                         case 'b': expr = new PositionAssertion (Position.Boundary); break;
857                         case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
858                         
859                         // references
860
861                         case '1': case '2': case '3': case '4': case '5':
862                         case '6': case '7': case '8': case '9': {
863                                 ptr --;
864                                 int n = ParseNumber (10, 1, 0);
865                                 if (n < 0) {
866                                         ptr = p;
867                                         return null;
868                                 }
869
870                                 // FIXME test if number is within number of assigned groups
871                                 // this may present a problem for right-to-left matching
872
873                                 Reference reference = new Reference (IsIgnoreCase (options));
874                                 refs.Add (reference, n.ToString ());
875                                 expr = reference;
876                                 break;
877                         }
878
879                         case 'k': {
880                                 char delim = pattern[ptr ++];
881                                 if (delim == '<')
882                                         delim = '>';
883                                 else if (delim != '\'')
884                                         throw NewParseException ("Malformed \\k<...> named backreference.");
885
886                                 string name = ParseName ();
887                                 if (name == null || pattern[ptr] != delim)
888                                         throw NewParseException ("Malformed \\k<...> named backreference.");
889
890                                 ++ ptr;
891                                 Reference reference = new Reference (IsIgnoreCase (options));
892                                 refs.Add (reference, name);
893                                 expr = reference;
894                                 break;
895                         }
896
897                         default:
898                                 expr = null;
899                                 break;
900                         }
901
902                         if (expr == null)
903                                 ptr = p;
904
905                         return expr;
906                 }
907
908                 private int ParseEscape () {
909                         int p = ptr;
910                         int c;
911
912                         if (p >= pattern.Length)
913                                 throw new ArgumentException (
914                                                 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + 
915                                                                 "pattern.", pattern), pattern);
916                         
917                         switch (pattern[ptr ++]) {
918         
919                         // standard escapes (except \b)
920
921                         case 'a': return '\u0007';
922                         case 't': return '\u0009';
923                         case 'r': return '\u000d';
924                         case 'v': return '\u000b';
925                         case 'f': return '\u000c';
926                         case 'n': return '\u000a';
927                         case 'e': return '\u001b';
928                         case '\\': return '\\';
929
930                         // character codes
931
932                         case '0':
933                                 //
934                                 // Turns out that octal values can be specified
935                                 // without a leading zero.   But also the limit
936                                 // of three character should include this first
937                                 // one.  
938                                 //
939                                 ptr--;
940                                 int prevptr = ptr;
941                                 int result = ParseOctal (pattern, ref ptr);
942                                 if (result == -1 && prevptr == ptr)
943                                         return 0;
944
945                                 return result;
946
947                         case 'x':
948                                 c = ParseHex (pattern, ref ptr, 2);
949                                 if (c < 0)
950                                         throw NewParseException ("Insufficient hex digits");
951
952                                 return c;
953
954                         case 'u':
955                                 c = ParseHex (pattern, ref ptr, 4);
956                                 if (c < 0)
957                                         throw NewParseException ("Insufficient hex digits");
958                                 
959                                 return c;
960
961                         // control characters
962
963                         case 'c':
964                                 c = pattern[ptr ++];
965                                 if (c >= '@' && c <= '_')
966                                         return c - '@';
967                                 else
968                                         throw NewParseException ("Unrecognized control character.");
969
970                         // unknown escape
971
972                         default:
973                                 ptr = p;
974                                 return -1;
975                         }
976                 }
977
978                 private string ParseName () {
979                         return Parser.ParseName (pattern, ref ptr);
980                 }
981
982                 private static bool IsNameChar (char c) {
983                         UnicodeCategory cat = Char.GetUnicodeCategory (c);
984                         if (cat == UnicodeCategory.ModifierLetter)
985                                 return false;
986                         if (cat == UnicodeCategory.ConnectorPunctuation)
987                                 return true;
988                         return Char.IsLetterOrDigit (c);
989                 }
990         
991                 private int ParseNumber (int b, int min, int max) {
992                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);
993                 }
994
995                 private int ParseDecimal () {
996                         return Parser.ParseDecimal (pattern, ref ptr);
997                 }
998
999                 private static int ParseDigit (char c, int b, int n) {
1000                         switch (b) {
1001                         case 8:
1002                                 if (c >= '0' && c <= '7')
1003                                         return c - '0';
1004                                 else
1005                                         return -1;
1006                         case 10:
1007                                 if (c >= '0' && c <= '9')
1008                                         return c - '0';
1009                                 else
1010                                         return -1;
1011                         case 16:
1012                                 if (c >= '0' && c <= '9')
1013                                         return c - '0';
1014                                 else if (c >= 'a' && c <= 'f')
1015                                         return 10 + c - 'a';
1016                                 else if (c >= 'A' && c <= 'F')
1017                                         return 10 + c - 'A';
1018                                 else
1019                                         return -1;
1020                         default:
1021                                 return -1;
1022                         }
1023                 }
1024
1025                 private void ConsumeWhitespace (bool ignore) {
1026                         while (true) {
1027                                 if (ptr >= pattern.Length)
1028                                         break;
1029                         
1030                                 if (pattern[ptr] == '(') {
1031                                         if (ptr + 3 >= pattern.Length)
1032                                                 return;
1033
1034                                         if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1035                                                 return;
1036
1037                                         ptr += 3;
1038                                         while (pattern[ptr ++] != ')')
1039                                                 /* ignore */ ;
1040                                 }
1041                                 else if (ignore && pattern[ptr] == '#') {
1042                                         while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1043                                                 /* ignore */ ;
1044                                 }
1045                                 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1046                                         while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1047                                                 ++ ptr;
1048                                 }
1049                                 else
1050                                         return;
1051                         }
1052                 }
1053
1054                 private string ParseString (string pattern) {
1055                         this.pattern = pattern;
1056                         this.ptr = 0;
1057
1058                         StringBuilder result = new StringBuilder (pattern.Length);
1059                         while (ptr < pattern.Length) {
1060                                 int c = pattern[ptr ++];
1061                                 if (c == '\\') {
1062                                         c = ParseEscape ();
1063
1064                                         if(c < 0) {
1065                                                 c = pattern[ptr ++];
1066                                                 if(c == 'b')
1067                                                         c = '\b';
1068                                         }
1069                                 }
1070                                 result.Append ((char) c);
1071                         }
1072
1073                         return result.ToString ();
1074                 }
1075
1076                 private void ResolveReferences () {
1077                         int gid = 1;
1078                         Hashtable dict = new Hashtable ();
1079
1080                         // number unnamed groups
1081
1082                         foreach (CapturingGroup group in caps) {
1083                                 if (group.Name == null) {
1084                                         dict.Add (gid.ToString (), group);
1085                                         group.Number = gid ++;
1086
1087                                         ++ num_groups;
1088                                 }
1089                         }
1090
1091                         // number named groups
1092
1093                         foreach (CapturingGroup group in caps) {
1094                                 if (group.Name != null) {
1095                                         if (!dict.Contains (group.Name)) {
1096                                                 dict.Add (group.Name, group);
1097                                                 group.Number = gid ++;
1098
1099                                                 ++ num_groups;
1100                                         }
1101                                         else {
1102                                                 CapturingGroup prev = (CapturingGroup)dict[group.Name];
1103                                                 group.Number = prev.Number;
1104                                         }
1105                                 }
1106                         }
1107
1108                         // resolve references
1109
1110                         foreach (Expression expr in refs.Keys) {
1111                                 string name = (string)refs[expr];
1112                                 if (!dict.Contains (name)) {
1113                                         throw NewParseException ("Reference to undefined group " +
1114                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +
1115                                                 name);
1116                                 }
1117
1118                                 CapturingGroup group = (CapturingGroup)dict[name];
1119                                 if (expr is Reference)
1120                                         ((Reference)expr).CapturingGroup = group;
1121                                 else if (expr is CaptureAssertion)
1122                                         ((CaptureAssertion)expr).CapturingGroup = group;
1123                                 else if (expr is BalancingGroup)
1124                                         ((BalancingGroup)expr).Balance = group;
1125                         }
1126                 }
1127
1128                 // flag helper functions
1129
1130                 private static bool IsIgnoreCase (RegexOptions options) {
1131                         return (options & RegexOptions.IgnoreCase) != 0;
1132                 }
1133
1134                 private static bool IsMultiline (RegexOptions options) {
1135                         return (options & RegexOptions.Multiline) != 0;
1136                 }
1137
1138                 private static bool IsExplicitCapture (RegexOptions options) {
1139                         return (options & RegexOptions.ExplicitCapture) != 0;
1140                 }
1141         
1142                 private static bool IsSingleline (RegexOptions options) {
1143                         return (options & RegexOptions.Singleline) != 0;
1144                 }
1145
1146                 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1147                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1148                 }
1149
1150                 private static bool IsRightToLeft (RegexOptions options) {
1151                         return (options & RegexOptions.RightToLeft) != 0;
1152                 }
1153
1154                 private static bool IsECMAScript (RegexOptions options) {
1155                         return (options & RegexOptions.ECMAScript) != 0;
1156                 }
1157
1158                 // exception creation
1159
1160                 private ArgumentException NewParseException (string msg) {
1161                         msg = "parsing \"" + pattern + "\" - " + msg;
1162                         return new ArgumentException (msg, pattern);
1163                 }
1164
1165                 private string pattern;
1166                 private int ptr;
1167
1168                 private ArrayList caps;
1169                 private Hashtable refs;
1170                 private int num_groups;
1171         }
1172 }