Invoke the callback in separate threads
[mono.git] / mcs / class / System / System.Text.RegularExpressions / parser.cs
1 //
2 // assembly:    System
3 // namespace:   System.Text.RegularExpressions
4 // file:        parser.cs
5 //
6 // author:      Dan Lewis (dlewis@gmx.co.uk)
7 //              (c) 2002
8
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
17 // 
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
20 // 
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 //
29
30 using System;
31 using System.Collections;
32 using System.Globalization;
33
34 namespace System.Text.RegularExpressions.Syntax {
35
36         class Parser {
37                 public static int ParseDecimal (string str, ref int ptr) {
38                         return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
39                 }
40
41                 public static int ParseOctal (string str, ref int ptr) {
42                         return ParseNumber (str, ref ptr, 8, 1, 3);
43                 }
44
45                 public static int ParseHex (string str, ref int ptr, int digits) {
46                         return ParseNumber (str, ref ptr, 16, digits, digits);
47                 }
48
49                 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50                         int p = ptr, n = 0, digits = 0, d;
51                         if (max < min)
52                                 max = Int32.MaxValue;
53
54                         while (digits < max && p < str.Length) {
55                                 d = ParseDigit (str[p ++], b, digits);
56                                 if (d < 0) {
57                                         -- p;
58                                         break;
59                                 }
60
61                                 n = n * b + d;
62                                 ++ digits;
63                         }
64
65                         if (digits < min)
66                                 return -1;
67
68                         ptr = p;
69                         return n;
70                 }
71
72                 public static string ParseName (string str, ref int ptr) {
73                         if (Char.IsDigit (str[ptr])) {
74                                 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
75                                 if (gid > 0)
76                                         return gid.ToString ();
77                                 
78                                 return null;
79                         }
80
81                         int start = ptr;
82                         for (;;) {
83                                 if (!IsNameChar (str[ptr]))
84                                         break;
85                                 ++ ptr;
86                         }
87
88                         if (ptr - start > 0)
89                                 return str.Substring (start, ptr - start);
90
91                         return null;
92                 }
93
94                 public static string Escape (string str) {
95                         string result = "";
96                         for (int i = 0; i < str.Length; ++ i) {
97                                 char c = str[i];
98                                 switch (c) {
99                                 case '\\': case '*': case '+': case '?': case '|':
100                                 case '{': case '[': case '(': case ')': case '^':
101                                 case '$': case '.': case '#': case ' ':
102                                         result += "\\" + c;
103                                         break;
104
105                                 case '\t': result += "\\t"; break;
106                                 case '\n': result += "\\n"; break;
107                                 case '\r': result += "\\r"; break;
108                                 case '\f': result += "\\f"; break;
109
110                                 default: result += c; break;
111                                 }
112                         }
113
114                         return result;
115                 }
116
117                 public static string Unescape (string str) {
118                         if (str.IndexOf ('\\') == -1)
119                                 return str;
120                         return new Parser ().ParseString (str);
121                 }
122
123                 // public instance
124
125                 public Parser () {
126                         this.caps = new ArrayList ();
127                         this.refs = new Hashtable ();
128                 }
129
130                 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
131                         this.pattern = pattern;
132                         this.ptr = 0;
133
134                         caps.Clear ();
135                         refs.Clear ();
136                         this.num_groups = 0;
137
138                         try {
139                                 RegularExpression re = new RegularExpression ();
140                                 ParseGroup (re, options, null);
141                                 ResolveReferences ();
142
143                                 re.GroupCount = num_groups;
144                                 
145                                 return re;
146                         }
147                         catch (IndexOutOfRangeException) {
148                                 throw NewParseException ("Unexpected end of pattern.");
149                         }
150                 }
151
152                 public int GetMapping (Hashtable mapping)
153                 {
154                         int end = caps.Count;
155                         mapping.Add ("0", 0);
156                         for (int i = 0; i < end; i++) {
157                                 CapturingGroup group = (CapturingGroup) caps [i];
158                                 string name = group.Name != null ? group.Name : group.Index.ToString ();
159                                 if (mapping.Contains (name)) {
160                                         if ((int) mapping [name] != group.Index)
161                                                 throw new SystemException ("invalid state");
162                                         continue;
163                                 }
164                                 mapping.Add (name, group.Index);
165                         }
166
167                         return gap;
168                 }
169
170                 // private methods
171
172                 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
173                         bool is_top_level = group is RegularExpression;
174                 
175                         Alternation alternation = null;
176                         string literal = null;
177
178                         Group current = new Group ();
179                         Expression expr = null;
180                         bool closed = false;
181
182                         while (true) {
183                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
184                                 if (ptr >= pattern.Length)
185                                         break;
186                                 
187                                 // (1) Parse for Expressions
188                         
189                                 char ch = pattern[ptr ++];
190                                 
191                                 switch (ch) {
192                                 case '^': {
193                                         Position pos =
194                                                 IsMultiline (options) ? Position.StartOfLine : Position.Start;
195                                         expr = new PositionAssertion (pos);
196                                         break;
197                                 }
198
199                                 case '$': {
200                                         Position pos =
201                                                 IsMultiline (options) ? Position.EndOfLine : Position.End;
202                                         expr = new PositionAssertion (pos);
203                                         break;
204                                 }
205
206                                 case '.': {
207                                         Category cat =
208                                                 IsSingleline (options) ? Category.AnySingleline : Category.Any;
209                                         expr = new CharacterClass (cat, false);
210                                         break;
211                                 }
212
213                                 case '\\': {
214                                         int c = ParseEscape (false);
215                                         if (c >= 0)
216                                                 ch = (char)c;
217                                         else {
218                                                 expr = ParseSpecial (options);
219
220                                                 if (expr == null)
221                                                         ch = pattern[ptr ++];           // default escape
222                                         }
223                                         break;
224                                 }
225
226                                 case '[': {
227                                         expr = ParseCharacterClass (options);
228                                         break;
229                                 }
230
231                                 case '(': {
232                                         bool ignore = IsIgnoreCase (options);
233                                         expr = ParseGroupingConstruct (ref options);
234                                         if (expr == null) {
235                                                 if (literal != null && IsIgnoreCase (options) != ignore) {
236                                                         current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
237                                                         literal = null;
238                                                 }
239
240                                                 continue;
241                                         }
242                                         break;
243                                 }
244
245                                 case ')': {
246                                         closed = true;
247                                         goto EndOfGroup;
248                                 }
249
250                                 case '|': {
251                                         if (literal != null) {
252                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
253                                                 literal = null;
254                                         }
255
256                                         if (assertion != null) {
257                                                 if (assertion.TrueExpression == null)
258                                                         assertion.TrueExpression = current;
259                                                 else if (assertion.FalseExpression == null)
260                                                         assertion.FalseExpression = current;
261                                                 else
262                                                         throw NewParseException ("Too many | in (?()|).");
263                                         }
264                                         else {
265                                                 if (alternation == null)
266                                                         alternation = new Alternation ();
267
268                                                 alternation.AddAlternative (current);
269                                         }
270
271                                         current = new Group ();
272                                         continue;
273                                 }
274
275                                 case '*': case '+': case '?': {
276                                         throw NewParseException ("Bad quantifier.");
277                                 }
278
279                                 default: 
280                                         break;          // literal character
281                                 }
282
283                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
284                                 
285                                 // (2) Check for Repetitions
286                                 
287                                 if (ptr < pattern.Length) {
288                                         char k = pattern[ptr];
289                                         int min = 0, max = 0;
290                                         bool lazy = false;
291                                         bool haveRep = false;
292
293
294                                         if (k == '?' || k == '*' || k == '+') {
295                                                 ++ ptr;
296                                                 haveRep = true;
297
298                                                 switch (k) {
299                                                 case '?': min = 0; max = 1; break;
300                                                 case '*': min = 0; max = 0x7fffffff; break;
301                                                 case '+': min = 1; max = 0x7fffffff; break;
302                                                 }
303                                         } else if (k == '{' && ptr + 1 < pattern.Length) {
304                                                 int saved_ptr = ptr;
305                                                 ++ptr;
306                                                 haveRep = ParseRepetitionBounds (out min, out max, options);
307                                                 if (!haveRep)
308                                                         ptr = saved_ptr;
309                                         }
310
311                                         if (haveRep) {
312                                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
313                                                 if (ptr < pattern.Length && pattern[ptr] == '?') {
314                                                         ++ ptr;
315                                                         lazy = true;
316                                                 }
317
318                                                 //It doesn't make sense to assert a given position more than once.
319                                                 bool ignore_repetition = false;
320                                                 if (expr is PositionAssertion) {
321                                                         ignore_repetition = min > 0 && !lazy;
322                                                         max = 1;
323                                                 }
324
325                                                 if (!ignore_repetition) {
326                                                         Repetition repetition = new Repetition (min, max, lazy);
327         
328                                                         if (expr == null)
329                                                                 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
330                                                         else
331                                                                 repetition.Expression = expr;
332         
333                                                         expr = repetition;
334                                                 }
335                                         }
336                                 }
337
338                                 // (3) Append Expression and/or Literal
339
340                                 if (expr == null) {
341                                         if (literal == null)
342                                                 literal = "";
343                                         literal += ch;
344                                 }
345                                 else {
346                                         if (literal != null) {
347                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
348                                                 literal = null;
349                                         }
350
351                                         current.AppendExpression (expr);
352                                         expr = null;
353                                 }
354
355                                 if (is_top_level && ptr >= pattern.Length)
356                                         goto EndOfGroup;
357                         }
358
359                 EndOfGroup:
360                         if (is_top_level && closed)
361                                 throw NewParseException ("Too many )'s.");
362                         if (!is_top_level && !closed)
363                                 throw NewParseException ("Not enough )'s.");
364                                 
365                 
366                         // clean up literals and alternations
367
368                         if (literal != null)
369                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
370
371                         if (assertion != null) {
372                                 if (assertion.TrueExpression == null)
373                                         assertion.TrueExpression = current;
374                                 else
375                                         assertion.FalseExpression = current;
376                                 
377                                 group.AppendExpression (assertion);
378                         }
379                         else if (alternation != null) {
380                                 alternation.AddAlternative (current);
381                                 group.AppendExpression (alternation);
382                         }
383                         else
384                                 group.AppendExpression (current);
385                 }
386
387                 private Expression ParseGroupingConstruct (ref RegexOptions options) {
388                         if (pattern[ptr] != '?') {
389                                 Group group;
390
391                                 if (IsExplicitCapture (options))
392                                         group = new Group ();
393                                 else {
394                                         group = new CapturingGroup ();
395                                         caps.Add (group);
396                                 }
397
398                                 ParseGroup (group, options, null);
399                                 return group;
400                         }
401                         else
402                                 ++ ptr;
403
404                         switch (pattern[ptr]) {
405                         case ':': {                                             // non-capturing group
406                                 ++ ptr;
407                                 Group group = new Group ();
408                                 ParseGroup (group, options, null);
409
410                                 return group;
411                         }
412
413                         case '>': {                                             // non-backtracking group
414                                 ++ ptr;
415                                 Group group = new NonBacktrackingGroup ();
416                                 ParseGroup (group, options, null);
417                                 
418                                 return group;
419                         }
420
421                         case 'i': case 'm': case 'n':
422                         case 's': case 'x': case '-': {                         // options
423                                 RegexOptions o = options;
424                                 ParseOptions (ref o, false);
425                                 if (pattern[ptr] == '-') {
426                                         ++ ptr;
427                                         ParseOptions (ref o, true);
428                                 }
429
430                                 if (pattern[ptr] == ':') {                      // pass options to child group
431                                         ++ ptr;
432                                         Group group = new Group ();
433                                         ParseGroup (group, o, null);
434                                         return group;
435                                 }
436                                 else if (pattern[ptr] == ')') {                 // change options of enclosing group
437                                         ++ ptr;
438                                         options = o;
439                                         return null;
440                                 }
441                                 else
442                                         throw NewParseException ("Bad options");
443                         }
444
445                         case '<': case '=': case '!': {                         // lookahead/lookbehind
446                                 ExpressionAssertion asn = new ExpressionAssertion ();
447                                 if (!ParseAssertionType (asn))
448                                         goto case '\'';                         // it's a (?<name> ) construct
449
450                                 Group test = new Group ();
451                                 ParseGroup (test, options, null);
452
453                                 asn.TestExpression = test;
454                                 return asn;
455                         }
456
457                         case '\'': {                                            // named/balancing group
458                                 char delim;
459                                 if (pattern[ptr] == '<')
460                                         delim = '>';
461                                 else
462                                         delim = '\'';
463
464                                 ++ ptr;
465                                 string name = ParseName ();
466
467                                 if (pattern[ptr] == delim) {
468                                         // capturing group
469
470                                         if (name == null)
471                                                 throw NewParseException ("Bad group name.");
472
473                                         ++ ptr;
474                                         CapturingGroup cap = new CapturingGroup ();
475                                         cap.Name = name;
476                                         caps.Add (cap);
477                                         ParseGroup (cap, options, null);
478
479                                         return cap;
480                                 }
481                                 else if (pattern[ptr] == '-') {
482                                         // balancing group
483
484                                         ++ ptr;
485                                         string balance_name = ParseName ();
486                                         if (balance_name == null || pattern[ptr] != delim)
487                                                 throw NewParseException ("Bad balancing group name.");
488
489                                         ++ ptr;
490                                         BalancingGroup bal = new BalancingGroup ();
491                                         bal.Name = name;
492                                         
493                                         if(bal.IsNamed) {
494                                                 caps.Add (bal);
495                                         }
496
497                                         refs.Add (bal, balance_name);
498
499                                         ParseGroup (bal, options, null);
500
501                                         return bal;
502                                 }
503                                 else
504                                         throw NewParseException ("Bad group name.");
505                         }
506
507                         case '(': {                                             // expression/capture test
508                                 Assertion asn;
509                         
510                                 ++ ptr;
511                                 int p = ptr;
512                                 string name = ParseName ();
513                                 if (name == null || pattern[ptr] != ')') {      // expression test
514                                         // FIXME MS implementation doesn't seem to
515                                         // implement this version of (?(x) ...)
516
517                                         ptr = p;
518                                         ExpressionAssertion expr_asn = new ExpressionAssertion ();
519
520                                         if (pattern[ptr] == '?') {
521                                                 ++ ptr;
522                                                 if (!ParseAssertionType (expr_asn))
523                                                         throw NewParseException ("Bad conditional.");
524                                         }
525                                         else {
526                                                 expr_asn.Negate = false;
527                                                 expr_asn.Reverse = false;
528                                         }
529
530                                         Group test = new Group ();
531                                         ParseGroup (test, options, null);
532                                         expr_asn.TestExpression = test;
533                                         asn = expr_asn;
534                                 }
535                                 else {                                          // capture test
536                                         ++ ptr;
537                                         asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
538                                         refs.Add (asn, name);
539                                 }
540
541                                 Group group = new Group ();
542                                 ParseGroup (group, options, asn);
543                                 return group;
544                         }
545
546                         case '#': {                                             // comment
547                                 ++ ptr;
548                                 while (pattern[ptr ++] != ')') {
549                                         if (ptr >= pattern.Length)
550                                                 throw NewParseException ("Unterminated (?#...) comment.");
551                                 }
552                                 return null;
553                         }
554
555                         default:                                                // error
556                                 throw NewParseException ("Bad grouping construct.");
557                         }
558                 }
559
560                 private bool ParseAssertionType (ExpressionAssertion assertion) {
561                         if (pattern[ptr] == '<') {
562                                 switch (pattern[ptr + 1]) {
563                                 case '=':
564                                         assertion.Negate = false;
565                                         break;
566                                 case '!':
567                                         assertion.Negate = true;
568                                         break;
569                                 default:
570                                         return false;
571                                 }
572
573                                 assertion.Reverse = true;
574                                 ptr += 2;
575                         }
576                         else {
577                                 switch (pattern[ptr]) {
578                                 case '=':
579                                         assertion.Negate = false;
580                                         break;
581                                 case '!':
582                                         assertion.Negate = true;
583                                         break;
584                                 default:
585                                         return false;
586                                 }
587
588                                 assertion.Reverse = false;
589                                 ptr += 1;
590                         }
591
592                         return true;
593                 }
594
595                 private void ParseOptions (ref RegexOptions options, bool negate) {
596                         for (;;) {
597                                 switch (pattern[ptr]) {
598                                 case 'i':
599                                         if (negate)
600                                                 options &= ~RegexOptions.IgnoreCase;
601                                         else
602                                                 options |= RegexOptions.IgnoreCase;
603                                         break;
604
605                                 case 'm':
606                                         if (negate)
607                                                 options &= ~RegexOptions.Multiline;
608                                         else
609                                                 options |= RegexOptions.Multiline;
610                                         break;
611                                         
612                                 case 'n':
613                                         if (negate)
614                                                 options &= ~RegexOptions.ExplicitCapture;
615                                         else
616                                                 options |= RegexOptions.ExplicitCapture;
617                                         break;
618                                         
619                                 case 's':
620                                         if (negate)
621                                                 options &= ~RegexOptions.Singleline;
622                                         else
623                                                 options |= RegexOptions.Singleline;
624                                         break;
625                                         
626                                 case 'x':
627                                         if (negate)
628                                                 options &= ~RegexOptions.IgnorePatternWhitespace;
629                                         else
630                                                 options |= RegexOptions.IgnorePatternWhitespace;
631                                         break;
632
633                                 default:
634                                         return;
635                                 }
636
637                                 ++ ptr;
638                         }
639                 }
640
641                 private Expression ParseCharacterClass (RegexOptions options) {
642                         bool negate = false;
643                         if (pattern[ptr] == '^') {
644                                 negate = true;
645                                 ++ ptr;
646                         }
647                         
648                         bool ecma = IsECMAScript (options);
649                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
650
651                         if (pattern[ptr] == ']') {
652                                 cls.AddCharacter (']');
653                                 ++ ptr;
654                         }
655
656                         int c = -1;
657                         int last = -1;
658                         bool range = false;
659                         bool closed = false;
660                         while (ptr < pattern.Length) {
661                                 c = pattern[ptr ++];
662
663                                 if (c == ']') {
664                                         closed = true;
665                                         break;
666                                 }
667
668                                 if (c == '-' && last >= 0 && !range) {
669                                         range = true;
670                                         continue;
671                                 }
672
673                                 if (c == '\\') {
674                                         c = ParseEscape (true);
675                                         if (c >= 0)
676                                                 goto char_recognized;
677
678                                         // didn't recognize escape
679                                         c = pattern [ptr ++];
680                                         switch (c) {
681                                         case 'b':
682                                                 c = '\b';
683                                                 goto char_recognized;
684
685                                         case 'd': case 'D':
686                                                 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
687                                                 break;
688                                                 
689                                         case 'w': case 'W':
690                                                 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
691                                                 break;
692                                                 
693                                         case 's': case 'S':
694                                                 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
695                                                 break;
696                                                 
697                                         case 'p': case 'P':
698                                                 cls.AddCategory (ParseUnicodeCategory (), c == 'P');    // ignore ecma
699                                                 break;
700
701                                         default:                // add escaped character
702                                                 goto char_recognized;
703                                         }
704
705                                         // if the pattern looks like [a-\s] ...
706                                         if (range)
707                                                 throw NewParseException ("character range cannot have category \\" + c);
708
709                                         last = -1;
710                                         continue;
711                                 }
712
713                         char_recognized:
714                                 if (range) {
715                                         // if 'range' is true, we know that 'last >= 0'
716                                         if (c < last)
717                                                 throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
718                                         cls.AddRange ((char)last, (char)c);
719                                         last = -1;
720                                         range = false;
721                                         continue;
722                                 }
723
724                                 cls.AddCharacter ((char)c);
725                                 last = c;
726                         }
727
728                         if (!closed)
729                                 throw NewParseException ("Unterminated [] set.");
730
731                         if (range)
732                                 cls.AddCharacter ('-');
733
734                         return cls;
735                 }
736
737                 private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
738                         int n, m;
739                         min = max = 0;
740
741                         /* check syntax */
742
743                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));
744                     
745                         if (pattern[ptr] == ',') {
746                                 n = -1;
747                         } else {
748                                 n = ParseNumber (10, 1, 0);
749                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
750                         }
751                         
752                         switch (pattern[ptr ++]) {
753                         case '}':
754                                 m = n;
755                                 break;
756                         case ',':
757                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
758                                 m = ParseNumber (10, 1, 0);
759                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
760                                 if (pattern[ptr ++] != '}')
761                                         return false;
762                                 break;
763                         default:
764                                 return false;
765                         }
766
767                         /* check bounds and ordering */
768
769                         if (n > 0x7fffffff || m > 0x7fffffff)
770                                 throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
771                         if (m >= 0 && m < n)
772                                 throw NewParseException ("Illegal {x, y} with x > y.");
773
774                         /* assign min and max */
775                         
776                         min = n;
777                         if (m > 0)
778                                 max = m;
779                         else
780                                 max = 0x7fffffff;
781
782                         return true;
783                 }
784
785                 private Category ParseUnicodeCategory () {
786                         if (pattern[ptr ++] != '{')
787                                 throw NewParseException ("Incomplete \\p{X} character escape.");
788
789                         string name = ParseName (pattern, ref ptr);
790                         if (name == null)
791                                 throw NewParseException ("Incomplete \\p{X} character escape.");
792
793                         Category cat = CategoryUtils.CategoryFromName (name);
794                         if (cat == Category.None)
795                                 throw NewParseException ("Unknown property '" + name + "'.");
796
797                         if (pattern[ptr ++] != '}')
798                                 throw NewParseException ("Incomplete \\p{X} character escape.");
799
800                         return cat;
801                 }
802
803                 private Expression ParseSpecial (RegexOptions options) {
804                         int p = ptr;
805                         bool ecma = IsECMAScript (options);
806                         Expression expr = null;
807                         
808                         switch (pattern[ptr ++]) {
809
810                         // categories
811
812                         case 'd':
813                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
814                                 break;
815                                 
816                         case 'w':
817                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
818                                 break;
819                                 
820                         case 's':
821                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
822                                 break;
823                                 
824                         case 'p':
825                                 // this is odd - ECMAScript isn't supposed to support Unicode,
826                                 // yet \p{..} compiles and runs under the MS implementation
827                                 // identically to canonical mode. That's why I'm ignoring the
828                                 // value of ecma here.
829                         
830                                 expr = new CharacterClass (ParseUnicodeCategory (), false);
831                                 break;
832                                 
833                         case 'D':
834                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
835                                 break;
836                                 
837                         case 'W':
838                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
839                                 break;
840                                 
841                         case 'S':
842                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
843                                 break;
844                                 
845                         case 'P':
846                                 expr = new CharacterClass (ParseUnicodeCategory (), true);
847                                 break;
848
849                         // positions
850
851                         case 'A': expr = new PositionAssertion (Position.StartOfString); break;
852                         case 'Z': expr = new PositionAssertion (Position.End); break;
853                         case 'z': expr = new PositionAssertion (Position.EndOfString); break;
854                         case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
855                         case 'b': expr = new PositionAssertion (Position.Boundary); break;
856                         case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
857                         
858                         // references
859
860                         case '1': case '2': case '3': case '4': case '5':
861                         case '6': case '7': case '8': case '9': {
862                                 ptr --;
863                                 int oldptr = ptr;
864                                 int n = ParseNumber (10, 1, 0);
865                                 if (n < 0) {
866                                         ptr = p;
867                                         return null;
868                                 }
869
870                                 // FIXME test if number is within number of assigned groups
871                                 // this may present a problem for right-to-left matching
872
873                                 Reference reference = new BackslashNumber (IsIgnoreCase (options), ecma);
874                                 refs.Add (reference, n.ToString ());
875                                 expr = reference;
876                                 break;
877                         }
878
879                         case 'k': {
880                                 char delim = pattern[ptr ++];
881                                 if (delim == '<')
882                                         delim = '>';
883                                 else if (delim != '\'')
884                                         throw NewParseException ("Malformed \\k<...> named backreference.");
885
886                                 string name = ParseName ();
887                                 if (name == null || pattern[ptr] != delim)
888                                         throw NewParseException ("Malformed \\k<...> named backreference.");
889
890                                 ++ ptr;
891                                 Reference reference = new Reference (IsIgnoreCase (options));
892                                 refs.Add (reference, name);
893                                 expr = reference;
894                                 break;
895                         }
896
897                         default:
898                                 expr = null;
899                                 break;
900                         }
901
902                         if (expr == null)
903                                 ptr = p;
904
905                         return expr;
906                 }
907
908                 private int ParseEscape (bool inCharacterClass) {
909                         int p = ptr;
910                         int c;
911
912                         if (p >= pattern.Length)
913                                 throw new ArgumentException (
914                                                 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + 
915                                                                 "pattern.", pattern), pattern);
916                         
917                         switch (pattern[ptr ++]) {
918         
919                         // standard escapes (except \b)
920
921                         case 'a': return '\u0007';
922                         case 't': return '\u0009';
923                         case 'r': return '\u000d';
924                         case 'v': return '\u000b';
925                         case 'f': return '\u000c';
926                         case 'n': return '\u000a';
927                         case 'e': return '\u001b';
928                         case '\\': return '\\';
929
930                         // character codes
931
932                         case '0':
933                                 
934                                 //
935                                 // Turns out that octal values can be specified
936                                 // without a leading zero.   But also the limit
937                                 // of three character should include this first
938                                 // one.  
939                                 //
940                                 ptr--;
941                                 int prevptr = ptr;
942                                 int result = ParseOctal (pattern, ref ptr);
943                                 if (result == -1 && prevptr == ptr)
944                                         return 0;
945
946                                 return result;
947
948                         case '1': case '2': case '3': case '4': case '5':
949                         case '6': case '7':
950                                 if (inCharacterClass){
951                                         ptr--;
952                                         return ParseOctal (pattern, ref ptr);
953                                 }
954                                 break;
955                                 
956                         case 'x':
957                                 c = ParseHex (pattern, ref ptr, 2);
958                                 if (c < 0)
959                                         throw NewParseException ("Insufficient hex digits");
960
961                                 return c;
962
963                         case 'u':
964                                 c = ParseHex (pattern, ref ptr, 4);
965                                 if (c < 0)
966                                         throw NewParseException ("Insufficient hex digits");
967                                 
968                                 return c;
969
970                         // control characters
971
972                         case 'c':
973                                 c = pattern[ptr ++];
974                                 if (c >= '@' && c <= '_')
975                                         return c - '@';
976                                 else
977                                         throw NewParseException ("Unrecognized control character.");
978                         }
979
980                         // unknown escape
981                         ptr = p;
982                         return -1;
983                 }
984
985                 private string ParseName () {
986                         return Parser.ParseName (pattern, ref ptr);
987                 }
988
989                 private static bool IsNameChar (char c) {
990                         UnicodeCategory cat = Char.GetUnicodeCategory (c);
991                         if (cat == UnicodeCategory.ModifierLetter)
992                                 return false;
993                         if (cat == UnicodeCategory.ConnectorPunctuation)
994                                 return true;
995                         return Char.IsLetterOrDigit (c);
996                 }
997         
998                 private int ParseNumber (int b, int min, int max) {
999                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);
1000                 }
1001
1002                 private static int ParseDigit (char c, int b, int n) {
1003                         switch (b) {
1004                         case 8:
1005                                 if (c >= '0' && c <= '7')
1006                                         return c - '0';
1007                                 else
1008                                         return -1;
1009                         case 10:
1010                                 if (c >= '0' && c <= '9')
1011                                         return c - '0';
1012                                 else
1013                                         return -1;
1014                         case 16:
1015                                 if (c >= '0' && c <= '9')
1016                                         return c - '0';
1017                                 else if (c >= 'a' && c <= 'f')
1018                                         return 10 + c - 'a';
1019                                 else if (c >= 'A' && c <= 'F')
1020                                         return 10 + c - 'A';
1021                                 else
1022                                         return -1;
1023                         default:
1024                                 return -1;
1025                         }
1026                 }
1027
1028                 private void ConsumeWhitespace (bool ignore) {
1029                         while (ptr < pattern.Length) {
1030                                 if (pattern[ptr] == '(') {
1031                                         if (ptr + 3 >= pattern.Length)
1032                                                 return;
1033
1034                                         if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1035                                                 return;
1036
1037                                         ptr += 3;
1038                                         while (ptr < pattern.Length && pattern[ptr ++] != ')')
1039                                                 /* ignore */ ;
1040                                 }
1041                                 else if (ignore && pattern[ptr] == '#') {
1042                                         while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1043                                                 /* ignore */ ;
1044                                 }
1045                                 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1046                                         while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1047                                                 ++ ptr;
1048                                 }
1049                                 else
1050                                         return;
1051                         }
1052                 }
1053
1054                 private string ParseString (string pattern) {
1055                         this.pattern = pattern;
1056                         this.ptr = 0;
1057
1058                         StringBuilder result = new StringBuilder (pattern.Length);
1059                         while (ptr < pattern.Length) {
1060                                 int c = pattern[ptr ++];
1061                                 if (c == '\\') {
1062                                         c = ParseEscape (false);
1063
1064                                         if(c < 0) {
1065                                                 c = pattern[ptr ++];
1066                                                 if(c == 'b')
1067                                                         c = '\b';
1068                                         }
1069                                 }
1070                                 result.Append ((char) c);
1071                         }
1072
1073                         return result.ToString ();
1074                 }
1075
1076                 private void ResolveReferences ()
1077                 {
1078                         int gid = 1;
1079                         Hashtable dict = new Hashtable ();
1080                         ArrayList explicit_numeric_groups = null;
1081
1082                         // number unnamed groups
1083
1084                         foreach (CapturingGroup group in caps) {
1085                                 if (group.Name != null)
1086                                         continue;
1087
1088                                 dict.Add (gid.ToString (), group);
1089                                 group.Index = gid ++;
1090                                 ++ num_groups;
1091                         }
1092
1093                         // number named groups
1094
1095                         foreach (CapturingGroup group in caps) {
1096                                 if (group.Name == null)
1097                                         continue;
1098
1099                                 if (dict.Contains (group.Name)) {
1100                                         CapturingGroup prev = (CapturingGroup) dict [group.Name];
1101                                         group.Index = prev.Index;
1102
1103                                         if (group.Index == gid)
1104                                                 gid ++;
1105                                         else if (group.Index > gid)
1106                                                 explicit_numeric_groups.Add (group);
1107                                         continue;
1108                                 }
1109
1110                                 if (Char.IsDigit (group.Name [0])) {
1111                                         int ptr = 0;
1112                                         int group_gid = ParseDecimal (group.Name, ref ptr);
1113                                         if (ptr == group.Name.Length) {
1114                                                 group.Index = group_gid;
1115                                                 dict.Add (group.Name, group);
1116                                                 ++ num_groups;
1117
1118                                                 if (group_gid == gid) {
1119                                                         gid ++;
1120                                                 } else {
1121                                                         // all numbers before 'gid' are already in the dictionary.  So, we know group_gid > gid
1122                                                         if (explicit_numeric_groups == null)
1123                                                                 explicit_numeric_groups = new ArrayList (4);
1124                                                         explicit_numeric_groups.Add (group);
1125                                                 }
1126
1127                                                 continue;
1128                                         }
1129                                 }
1130
1131                                 string gid_s = gid.ToString ();
1132                                 while (dict.Contains (gid_s))
1133                                         gid_s = (++gid).ToString ();
1134
1135                                 dict.Add (gid_s, group);
1136                                 dict.Add (group.Name, group);
1137                                 group.Index = gid ++;
1138                                 ++ num_groups;
1139                         }
1140
1141                         gap = gid; // == 1 + num_groups, if explicit_numeric_groups == null
1142
1143                         if (explicit_numeric_groups != null)
1144                                 HandleExplicitNumericGroups (explicit_numeric_groups);
1145
1146                         // resolve references
1147
1148                         foreach (Expression expr in refs.Keys) {
1149                                 string name = (string) refs [expr];
1150                                 if (!dict.Contains (name)) {
1151                                         if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
1152                                                 continue;
1153                                         BackslashNumber bn = expr as BackslashNumber;
1154                                         if (bn != null && bn.ResolveReference (name, dict))
1155                                                 continue;
1156                                         throw NewParseException ("Reference to undefined group " +
1157                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +
1158                                                 name);
1159                                 }
1160
1161                                 CapturingGroup group = (CapturingGroup)dict[name];
1162                                 if (expr is Reference)
1163                                         ((Reference)expr).CapturingGroup = group;
1164                                 else if (expr is CaptureAssertion)
1165                                         ((CaptureAssertion)expr).CapturingGroup = group;
1166                                 else if (expr is BalancingGroup)
1167                                         ((BalancingGroup)expr).Balance = group;
1168                         }
1169                 }
1170
1171                 private void HandleExplicitNumericGroups (ArrayList explicit_numeric_groups)
1172                 {
1173                         int gid = gap;
1174                         int i = 0;
1175                         int n_explicit = explicit_numeric_groups.Count;
1176
1177                         explicit_numeric_groups.Sort ();
1178
1179                         // move 'gap' forward to skip over all explicit groups that
1180                         // turn out to match their index
1181                         for (; i < n_explicit; ++i) {
1182                                 CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
1183                                 if (g.Index > gid)
1184                                         break;
1185                                 if (g.Index == gid)
1186                                         gid ++;
1187                         }
1188
1189                         gap = gid;
1190
1191                         // re-index all further groups so that the indexes are contiguous
1192                         int prev = gid;
1193                         for (; i < n_explicit; ++i) {
1194                                 CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
1195                                 if (g.Index == prev) {
1196                                         g.Index = gid - 1;
1197                                 } else {
1198                                         prev = g.Index;
1199                                         g.Index = gid ++;
1200                                 }
1201                         }
1202                 }
1203
1204                 // flag helper functions
1205
1206                 private static bool IsIgnoreCase (RegexOptions options) {
1207                         return (options & RegexOptions.IgnoreCase) != 0;
1208                 }
1209
1210                 private static bool IsMultiline (RegexOptions options) {
1211                         return (options & RegexOptions.Multiline) != 0;
1212                 }
1213
1214                 private static bool IsExplicitCapture (RegexOptions options) {
1215                         return (options & RegexOptions.ExplicitCapture) != 0;
1216                 }
1217         
1218                 private static bool IsSingleline (RegexOptions options) {
1219                         return (options & RegexOptions.Singleline) != 0;
1220                 }
1221
1222                 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1223                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1224                 }
1225
1226                 private static bool IsECMAScript (RegexOptions options) {
1227                         return (options & RegexOptions.ECMAScript) != 0;
1228                 }
1229
1230                 // exception creation
1231
1232                 private ArgumentException NewParseException (string msg) {
1233                         msg = "parsing \"" + pattern + "\" - " + msg;
1234                         return new ArgumentException (msg, pattern);
1235                 }
1236
1237                 private string pattern;
1238                 private int ptr;
1239
1240                 private ArrayList caps;
1241                 private Hashtable refs;
1242                 private int num_groups;
1243                 private int gap;
1244         }
1245 }