Fix bugs in sizing TableLayoutPanel (Xamarin bug 18638)
[mono.git] / mcs / class / System / System.Text.RegularExpressions / parser.cs
1 //
2 // assembly:    System
3 // namespace:   System.Text.RegularExpressions
4 // file:        parser.cs
5 //
6 // author:      Dan Lewis (dlewis@gmx.co.uk)
7 //              (c) 2002
8
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
17 // 
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
20 // 
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 //
29
30 using System;
31 using System.Collections;
32 using System.Globalization;
33
34 namespace System.Text.RegularExpressions.Syntax {
35
36         class Parser {
37                 public static int ParseDecimal (string str, ref int ptr) {
38                         return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
39                 }
40
41                 public static int ParseOctal (string str, ref int ptr) {
42                         return ParseNumber (str, ref ptr, 8, 1, 3);
43                 }
44
45                 public static int ParseHex (string str, ref int ptr, int digits) {
46                         return ParseNumber (str, ref ptr, 16, digits, digits);
47                 }
48
49                 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
50                         int p = ptr, n = 0, digits = 0, d;
51                         if (max < min)
52                                 max = Int32.MaxValue;
53
54                         while (digits < max && p < str.Length) {
55                                 d = ParseDigit (str[p ++], b, digits);
56                                 if (d < 0) {
57                                         -- p;
58                                         break;
59                                 }
60
61                                 n = n * b + d;
62                                 ++ digits;
63                         }
64
65                         if (digits < min)
66                                 return -1;
67
68                         ptr = p;
69                         return n;
70                 }
71
72                 public static string ParseName (string str, ref int ptr) {
73                         if (Char.IsDigit (str[ptr])) {
74                                 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
75                                 if (gid > 0)
76                                         return gid.ToString ();
77                                 
78                                 return null;
79                         }
80
81                         int start = ptr;
82                         for (;;) {
83                                 if (!IsNameChar (str[ptr]))
84                                         break;
85                                 ++ ptr;
86                         }
87
88                         if (ptr - start > 0)
89                                 return str.Substring (start, ptr - start);
90
91                         return null;
92                 }
93
94                 public static string Escape (string str) {
95                         string result = "";
96                         for (int i = 0; i < str.Length; ++ i) {
97                                 char c = str[i];
98                                 switch (c) {
99                                 case '\\': case '*': case '+': case '?': case '|':
100                                 case '{': case '[': case '(': case ')': case '^':
101                                 case '$': case '.': case '#': case ' ':
102                                         result += "\\" + c;
103                                         break;
104
105                                 case '\t': result += "\\t"; break;
106                                 case '\n': result += "\\n"; break;
107                                 case '\r': result += "\\r"; break;
108                                 case '\f': result += "\\f"; break;
109
110                                 default: result += c; break;
111                                 }
112                         }
113
114                         return result;
115                 }
116
117                 public static string Unescape (string str) {
118                         if (str.IndexOf ('\\') == -1)
119                                 return str;
120                         return new Parser ().ParseString (str);
121                 }
122
123                 // public instance
124
125                 public Parser () {
126                         this.caps = new ArrayList ();
127                         this.refs = new Hashtable ();
128                 }
129
130                 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
131                         this.pattern = pattern;
132                         this.ptr = 0;
133
134                         caps.Clear ();
135                         refs.Clear ();
136                         this.num_groups = 0;
137
138                         try {
139                                 RegularExpression re = new RegularExpression ();
140                                 ParseGroup (re, options, null);
141                                 ResolveReferences ();
142
143                                 re.GroupCount = num_groups;
144                                 
145                                 return re;
146                         }
147                         catch (IndexOutOfRangeException) {
148                                 throw NewParseException ("Unexpected end of pattern.");
149                         }
150                 }
151
152                 public int GetMapping (Hashtable mapping)
153                 {
154                         int end = caps.Count;
155                         mapping.Add ("0", 0);
156                         for (int i = 0; i < end; i++) {
157                                 CapturingGroup group = (CapturingGroup) caps [i];
158                                 string name = group.Name != null ? group.Name : group.Index.ToString ();
159                                 if (mapping.Contains (name)) {
160                                         if ((int) mapping [name] != group.Index)
161                                                 throw new SystemException ("invalid state");
162                                         continue;
163                                 }
164                                 mapping.Add (name, group.Index);
165                         }
166
167                         return gap;
168                 }
169
170                 // private methods
171
172                 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
173                         bool is_top_level = group is RegularExpression;
174                 
175                         Alternation alternation = null;
176                         string literal = null;
177
178                         Group current = new Group ();
179                         Expression expr = null;
180                         bool closed = false;
181
182                         while (true) {
183                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
184                                 if (ptr >= pattern.Length)
185                                         break;
186                                 
187                                 // (1) Parse for Expressions
188                         
189                                 char ch = pattern[ptr ++];
190                                 
191                                 switch (ch) {
192                                 case '^': {
193                                         Position pos =
194                                                 IsMultiline (options) ? Position.StartOfLine : Position.Start;
195                                         expr = new PositionAssertion (pos);
196                                         break;
197                                 }
198
199                                 case '$': {
200                                         Position pos =
201                                                 IsMultiline (options) ? Position.EndOfLine : Position.End;
202                                         expr = new PositionAssertion (pos);
203                                         break;
204                                 }
205
206                                 case '.': {
207                                         Category cat =
208                                                 IsSingleline (options) ? Category.AnySingleline : Category.Any;
209                                         expr = new CharacterClass (cat, false);
210                                         break;
211                                 }
212
213                                 case '\\': {
214                                         int c = ParseEscape (false);
215                                         if (c >= 0)
216                                                 ch = (char)c;
217                                         else {
218                                                 expr = ParseSpecial (options);
219
220                                                 if (expr == null)
221                                                         ch = pattern[ptr ++];           // default escape
222                                         }
223                                         break;
224                                 }
225
226                                 case '[': {
227                                         expr = ParseCharacterClass (options);
228                                         break;
229                                 }
230
231                                 case '(': {
232                                         bool ignore = IsIgnoreCase (options);
233                                         expr = ParseGroupingConstruct (ref options);
234                                         if (expr == null) {
235                                                 if (literal != null && IsIgnoreCase (options) != ignore) {
236                                                         current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
237                                                         literal = null;
238                                                 }
239
240                                                 continue;
241                                         }
242                                         break;
243                                 }
244
245                                 case ')': {
246                                         closed = true;
247                                         goto EndOfGroup;
248                                 }
249
250                                 case '|': {
251                                         if (literal != null) {
252                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
253                                                 literal = null;
254                                         }
255
256                                         if (assertion != null) {
257                                                 if (assertion.TrueExpression == null)
258                                                         assertion.TrueExpression = current;
259                                                 else if (assertion.FalseExpression == null)
260                                                         assertion.FalseExpression = current;
261                                                 else
262                                                         throw NewParseException ("Too many | in (?()|).");
263                                         }
264                                         else {
265                                                 if (alternation == null)
266                                                         alternation = new Alternation ();
267
268                                                 alternation.AddAlternative (current);
269                                         }
270
271                                         current = new Group ();
272                                         continue;
273                                 }
274
275                                 case '*': case '+': case '?': {
276                                         throw NewParseException ("Bad quantifier.");
277                                 }
278
279                                 default: 
280                                         break;          // literal character
281                                 }
282
283                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
284                                 
285                                 // (2) Check for Repetitions
286                                 
287                                 if (ptr < pattern.Length) {
288                                         char k = pattern[ptr];
289                                         int min = 0, max = 0;
290                                         bool lazy = false;
291                                         bool haveRep = false;
292
293
294                                         if (k == '?' || k == '*' || k == '+') {
295                                                 ++ ptr;
296                                                 haveRep = true;
297
298                                                 switch (k) {
299                                                 case '?': min = 0; max = 1; break;
300                                                 case '*': min = 0; max = 0x7fffffff; break;
301                                                 case '+': min = 1; max = 0x7fffffff; break;
302                                                 }
303                                         } else if (k == '{' && ptr + 1 < pattern.Length) {
304                                                 int saved_ptr = ptr;
305                                                 ++ptr;
306                                                 haveRep = ParseRepetitionBounds (out min, out max, options);
307                                                 if (!haveRep)
308                                                         ptr = saved_ptr;
309                                         }
310
311                                         if (haveRep) {
312                                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
313                                                 if (ptr < pattern.Length && pattern[ptr] == '?') {
314                                                         ++ ptr;
315                                                         lazy = true;
316                                                 }
317
318                                                 //It doesn't make sense to assert a given position more than once.
319                                                 bool ignore_repetition = false;
320                                                 if (expr is PositionAssertion) {
321                                                         ignore_repetition = min > 0 && !lazy;
322                                                         max = 1;
323                                                 }
324
325                                                 if (!ignore_repetition) {
326                                                         Repetition repetition = new Repetition (min, max, lazy);
327         
328                                                         if (expr == null)
329                                                                 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
330                                                         else
331                                                                 repetition.Expression = expr;
332         
333                                                         expr = repetition;
334                                                 }
335                                         }
336                                 }
337
338                                 // (3) Append Expression and/or Literal
339
340                                 if (expr == null) {
341                                         if (literal == null)
342                                                 literal = "";
343                                         literal += ch;
344                                 }
345                                 else {
346                                         if (literal != null) {
347                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
348                                                 literal = null;
349                                         }
350
351                                         current.AppendExpression (expr);
352                                         expr = null;
353                                 }
354
355                                 if (is_top_level && ptr >= pattern.Length)
356                                         goto EndOfGroup;
357                         }
358
359                 EndOfGroup:
360                         if (is_top_level && closed)
361                                 throw NewParseException ("Too many )'s.");
362                         if (!is_top_level && !closed)
363                                 throw NewParseException ("Not enough )'s.");
364                                 
365                 
366                         // clean up literals and alternations
367
368                         if (literal != null)
369                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
370
371                         if (assertion != null) {
372                                 if (assertion.TrueExpression == null)
373                                         assertion.TrueExpression = current;
374                                 else
375                                         assertion.FalseExpression = current;
376                                 
377                                 group.AppendExpression (assertion);
378                         }
379                         else if (alternation != null) {
380                                 alternation.AddAlternative (current);
381                                 group.AppendExpression (alternation);
382                         }
383                         else
384                                 group.AppendExpression (current);
385                 }
386
387                 private Expression ParseGroupingConstruct (ref RegexOptions options) {
388                         if (pattern[ptr] != '?') {
389                                 Group group;
390
391                                 if (IsExplicitCapture (options))
392                                         group = new Group ();
393                                 else {
394                                         group = new CapturingGroup ();
395                                         caps.Add (group);
396                                 }
397
398                                 ParseGroup (group, options, null);
399                                 return group;
400                         }
401                         else
402                                 ++ ptr;
403
404                         switch (pattern[ptr]) {
405                         case ':': {                                             // non-capturing group
406                                 ++ ptr;
407                                 Group group = new Group ();
408                                 ParseGroup (group, options, null);
409
410                                 return group;
411                         }
412
413                         case '>': {                                             // non-backtracking group
414                                 ++ ptr;
415                                 Group group = new NonBacktrackingGroup ();
416                                 ParseGroup (group, options, null);
417                                 
418                                 return group;
419                         }
420
421                         case 'i': case 'm': case 'n':
422                         case 's': case 'x': case '-': {                         // options
423                                 RegexOptions o = options;
424                                 ParseOptions (ref o, false);
425                                 if (pattern[ptr] == '-') {
426                                         ++ ptr;
427                                         ParseOptions (ref o, true);
428                                 }
429
430                                 if (pattern[ptr] == ':') {                      // pass options to child group
431                                         ++ ptr;
432                                         Group group = new Group ();
433                                         ParseGroup (group, o, null);
434                                         return group;
435                                 }
436                                 else if (pattern[ptr] == ')') {                 // change options of enclosing group
437                                         ++ ptr;
438                                         options = o;
439                                         return null;
440                                 }
441                                 else
442                                         throw NewParseException ("Bad options");
443                         }
444
445                         case '<': case '=': case '!': {                         // lookahead/lookbehind
446                                 ExpressionAssertion asn = new ExpressionAssertion ();
447                                 if (!ParseAssertionType (asn))
448                                         goto case '\'';                         // it's a (?<name> ) construct
449
450                                 Group test = new Group ();
451                                 ParseGroup (test, options, null);
452
453                                 asn.TestExpression = test;
454                                 return asn;
455                         }
456
457                         case '\'': {                                            // named/balancing group
458                                 char delim;
459                                 if (pattern[ptr] == '<')
460                                         delim = '>';
461                                 else
462                                         delim = '\'';
463
464                                 ++ ptr;
465                                 string name = ParseName ();
466
467                                 if (pattern[ptr] == delim) {
468                                         // capturing group
469
470                                         if (name == null)
471                                                 throw NewParseException ("Bad group name.");
472
473                                         ++ ptr;
474                                         CapturingGroup cap = new CapturingGroup ();
475                                         cap.Name = name;
476                                         caps.Add (cap);
477                                         ParseGroup (cap, options, null);
478
479                                         return cap;
480                                 }
481                                 else if (pattern[ptr] == '-') {
482                                         // balancing group
483
484                                         ++ ptr;
485                                         string balance_name = ParseName ();
486                                         if (balance_name == null || pattern[ptr] != delim)
487                                                 throw NewParseException ("Bad balancing group name.");
488
489                                         ++ ptr;
490                                         BalancingGroup bal = new BalancingGroup ();
491                                         bal.Name = name;
492                                         
493                                         if(bal.IsNamed) {
494                                                 caps.Add (bal);
495                                         }
496
497                                         refs.Add (bal, balance_name);
498
499                                         ParseGroup (bal, options, null);
500
501                                         return bal;
502                                 }
503                                 else
504                                         throw NewParseException ("Bad group name.");
505                         }
506
507                         case '(': {                                             // expression/capture test
508                                 Assertion asn;
509                         
510                                 ++ ptr;
511                                 int p = ptr;
512                                 string name = ParseName ();
513                                 if (name == null || pattern[ptr] != ')') {      // expression test
514                                         // FIXME MS implementation doesn't seem to
515                                         // implement this version of (?(x) ...)
516
517                                         ptr = p;
518                                         ExpressionAssertion expr_asn = new ExpressionAssertion ();
519
520                                         if (pattern[ptr] == '?') {
521                                                 ++ ptr;
522                                                 if (!ParseAssertionType (expr_asn))
523                                                         throw NewParseException ("Bad conditional.");
524                                         }
525                                         else {
526                                                 expr_asn.Negate = false;
527                                                 expr_asn.Reverse = false;
528                                         }
529
530                                         Group test = new Group ();
531                                         ParseGroup (test, options, null);
532                                         expr_asn.TestExpression = test;
533                                         asn = expr_asn;
534                                 }
535                                 else {                                          // capture test
536                                         ++ ptr;
537                                         asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
538                                         refs.Add (asn, name);
539                                 }
540
541                                 Group group = new Group ();
542                                 ParseGroup (group, options, asn);
543                                 return group;
544                         }
545
546                         case '#': {                                             // comment
547                                 ++ ptr;
548                                 while (pattern[ptr ++] != ')') {
549                                         if (ptr >= pattern.Length)
550                                                 throw NewParseException ("Unterminated (?#...) comment.");
551                                 }
552                                 return null;
553                         }
554
555                         default:                                                // error
556                                 throw NewParseException ("Bad grouping construct.");
557                         }
558                 }
559
560                 private bool ParseAssertionType (ExpressionAssertion assertion) {
561                         if (pattern[ptr] == '<') {
562                                 switch (pattern[ptr + 1]) {
563                                 case '=':
564                                         assertion.Negate = false;
565                                         break;
566                                 case '!':
567                                         assertion.Negate = true;
568                                         break;
569                                 default:
570                                         return false;
571                                 }
572
573                                 assertion.Reverse = true;
574                                 ptr += 2;
575                         }
576                         else {
577                                 switch (pattern[ptr]) {
578                                 case '=':
579                                         assertion.Negate = false;
580                                         break;
581                                 case '!':
582                                         assertion.Negate = true;
583                                         break;
584                                 default:
585                                         return false;
586                                 }
587
588                                 assertion.Reverse = false;
589                                 ptr += 1;
590                         }
591
592                         return true;
593                 }
594
595                 private void ParseOptions (ref RegexOptions options, bool negate) {
596                         for (;;) {
597                                 switch (pattern[ptr]) {
598                                 case 'i':
599                                         if (negate)
600                                                 options &= ~RegexOptions.IgnoreCase;
601                                         else
602                                                 options |= RegexOptions.IgnoreCase;
603                                         break;
604
605                                 case 'm':
606                                         if (negate)
607                                                 options &= ~RegexOptions.Multiline;
608                                         else
609                                                 options |= RegexOptions.Multiline;
610                                         break;
611                                         
612                                 case 'n':
613                                         if (negate)
614                                                 options &= ~RegexOptions.ExplicitCapture;
615                                         else
616                                                 options |= RegexOptions.ExplicitCapture;
617                                         break;
618                                         
619                                 case 's':
620                                         if (negate)
621                                                 options &= ~RegexOptions.Singleline;
622                                         else
623                                                 options |= RegexOptions.Singleline;
624                                         break;
625                                         
626                                 case 'x':
627                                         if (negate)
628                                                 options &= ~RegexOptions.IgnorePatternWhitespace;
629                                         else
630                                                 options |= RegexOptions.IgnorePatternWhitespace;
631                                         break;
632
633                                 default:
634                                         return;
635                                 }
636
637                                 ++ ptr;
638                         }
639                 }
640
641                 private Expression ParseCharacterClass (RegexOptions options) {
642                         bool negate = false;
643                         if (pattern[ptr] == '^') {
644                                 negate = true;
645                                 ++ ptr;
646                         }
647                         
648                         bool ecma = IsECMAScript (options);
649                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
650
651                         if (pattern[ptr] == ']') {
652                                 cls.AddCharacter (']');
653                                 ++ ptr;
654                         }
655
656                         int c = -1;
657                         int last = -1;
658                         bool range = false;
659                         bool closed = false;
660                         while (ptr < pattern.Length) {
661                                 c = pattern[ptr ++];
662
663                                 if (c == ']') {
664                                         closed = true;
665                                         break;
666                                 }
667
668                                 if (c == '-' && last >= 0 && !range) {
669                                         range = true;
670                                         continue;
671                                 }
672
673                                 if (c == '\\') {
674                                         c = ParseEscape (true);
675                                         if (c >= 0)
676                                                 goto char_recognized;
677
678                                         // didn't recognize escape
679                                         c = pattern [ptr ++];
680                                         switch (c) {
681                                         case 'b':
682                                                 c = '\b';
683                                                 goto char_recognized;
684
685                                         case 'd': case 'D':
686                                                 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
687                                                 break;
688                                                 
689                                         case 'w': case 'W':
690                                                 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
691                                                 break;
692                                                 
693                                         case 's': case 'S':
694                                                 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
695                                                 break;
696                                                 
697                                         case 'p': case 'P':
698                                                 cls.AddCategory (ParseUnicodeCategory (), c == 'P');    // ignore ecma
699                                                 break;
700
701                                         default:                // add escaped character
702                                                 goto char_recognized;
703                                         }
704
705                                         // if the pattern looks like [a-\s] ...
706                                         if (range)
707                                                 throw NewParseException ("character range cannot have category \\" + c);
708
709                                         last = -1;
710                                         continue;
711                                 }
712
713                         char_recognized:
714                                 if (range) {
715                                         // if 'range' is true, we know that 'last >= 0'
716                                         if (c < last)
717                                                 throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
718                                         cls.AddRange ((char)last, (char)c);
719                                         last = -1;
720                                         range = false;
721                                         continue;
722                                 }
723
724                                 cls.AddCharacter ((char)c);
725                                 last = c;
726                         }
727
728                         if (!closed)
729                                 throw NewParseException ("Unterminated [] set.");
730
731                         if (range)
732                                 cls.AddCharacter ('-');
733
734                         return cls;
735                 }
736
737                 private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
738                         int n, m;
739                         min = max = 0;
740
741                         /* check syntax */
742
743                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));
744                     
745                         if (pattern[ptr] == ',') {
746                                 n = -1;
747                         } else {
748                                 n = ParseNumber (10, 1, 0);
749                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
750                         }
751                         
752                         switch (pattern[ptr ++]) {
753                         case '}':
754                                 m = n;
755                                 break;
756                         case ',':
757                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
758                                 m = ParseNumber (10, 1, 0);
759                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
760                                 if (pattern[ptr ++] != '}')
761                                         return false;
762                                 break;
763                         default:
764                                 return false;
765                         }
766
767                         /* check bounds and ordering */
768
769                         if (n > 0x7fffffff || m > 0x7fffffff)
770                                 throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
771                         if (m >= 0 && m < n)
772                                 throw NewParseException ("Illegal {x, y} with x > y.");
773
774                         /* assign min and max */
775                         
776                         min = n;
777                         if (m > 0)
778                                 max = m;
779                         else
780                                 max = 0x7fffffff;
781
782                         return true;
783                 }
784
785                 private Category ParseUnicodeCategory () {
786                         if (pattern[ptr ++] != '{')
787                                 throw NewParseException ("Incomplete \\p{X} character escape.");
788
789                         string name = ParseName (pattern, ref ptr);
790                         if (name == null)
791                                 throw NewParseException ("Incomplete \\p{X} character escape.");
792
793                         Category cat = CategoryUtils.CategoryFromName (name);
794                         if (cat == Category.None)
795                                 throw NewParseException ("Unknown property '" + name + "'.");
796
797                         if (pattern[ptr ++] != '}')
798                                 throw NewParseException ("Incomplete \\p{X} character escape.");
799
800                         return cat;
801                 }
802
803                 private Expression ParseSpecial (RegexOptions options) {
804                         int p = ptr;
805                         bool ecma = IsECMAScript (options);
806                         Expression expr = null;
807                         
808                         switch (pattern[ptr ++]) {
809
810                         // categories
811
812                         case 'd':
813                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
814                                 break;
815                                 
816                         case 'w':
817                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
818                                 break;
819                                 
820                         case 's':
821                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
822                                 break;
823                                 
824                         case 'p':
825                                 // this is odd - ECMAScript isn't supposed to support Unicode,
826                                 // yet \p{..} compiles and runs under the MS implementation
827                                 // identically to canonical mode. That's why I'm ignoring the
828                                 // value of ecma here.
829                         
830                                 expr = new CharacterClass (ParseUnicodeCategory (), false);
831                                 break;
832                                 
833                         case 'D':
834                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
835                                 break;
836                                 
837                         case 'W':
838                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
839                                 break;
840                                 
841                         case 'S':
842                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
843                                 break;
844                                 
845                         case 'P':
846                                 expr = new CharacterClass (ParseUnicodeCategory (), true);
847                                 break;
848
849                         // positions
850
851                         case 'A': expr = new PositionAssertion (Position.StartOfString); break;
852                         case 'Z': expr = new PositionAssertion (Position.End); break;
853                         case 'z': expr = new PositionAssertion (Position.EndOfString); break;
854                         case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
855                         case 'b': expr = new PositionAssertion (Position.Boundary); break;
856                         case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
857                         
858                         // references
859
860                         case '1': case '2': case '3': case '4': case '5':
861                         case '6': case '7': case '8': case '9': {
862                                 ptr --;
863                                 int n = ParseNumber (10, 1, 0);
864                                 if (n < 0) {
865                                         ptr = p;
866                                         return null;
867                                 }
868
869                                 // FIXME test if number is within number of assigned groups
870                                 // this may present a problem for right-to-left matching
871
872                                 Reference reference = new BackslashNumber (IsIgnoreCase (options), ecma);
873                                 refs.Add (reference, n.ToString ());
874                                 expr = reference;
875                                 break;
876                         }
877
878                         case 'k': {
879                                 char delim = pattern[ptr ++];
880                                 if (delim == '<')
881                                         delim = '>';
882                                 else if (delim != '\'')
883                                         throw NewParseException ("Malformed \\k<...> named backreference.");
884
885                                 string name = ParseName ();
886                                 if (name == null || pattern[ptr] != delim)
887                                         throw NewParseException ("Malformed \\k<...> named backreference.");
888
889                                 ++ ptr;
890                                 Reference reference = new Reference (IsIgnoreCase (options));
891                                 refs.Add (reference, name);
892                                 expr = reference;
893                                 break;
894                         }
895
896                         default:
897                                 expr = null;
898                                 break;
899                         }
900
901                         if (expr == null)
902                                 ptr = p;
903
904                         return expr;
905                 }
906
907                 private int ParseEscape (bool inCharacterClass) {
908                         int p = ptr;
909                         int c;
910
911                         if (p >= pattern.Length)
912                                 throw new ArgumentException (
913                                                 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + 
914                                                                 "pattern.", pattern), pattern);
915                         
916                         switch (pattern[ptr ++]) {
917         
918                         // standard escapes (except \b)
919
920                         case 'a': return '\u0007';
921                         case 't': return '\u0009';
922                         case 'r': return '\u000d';
923                         case 'v': return '\u000b';
924                         case 'f': return '\u000c';
925                         case 'n': return '\u000a';
926                         case 'e': return '\u001b';
927                         case '\\': return '\\';
928
929                         // character codes
930
931                         case '0':
932                                 
933                                 //
934                                 // Turns out that octal values can be specified
935                                 // without a leading zero.   But also the limit
936                                 // of three character should include this first
937                                 // one.  
938                                 //
939                                 ptr--;
940                                 int prevptr = ptr;
941                                 int result = ParseOctal (pattern, ref ptr);
942                                 if (result == -1 && prevptr == ptr)
943                                         return 0;
944
945                                 return result;
946
947                         case '1': case '2': case '3': case '4': case '5':
948                         case '6': case '7':
949                                 if (inCharacterClass){
950                                         ptr--;
951                                         return ParseOctal (pattern, ref ptr);
952                                 }
953                                 break;
954                                 
955                         case 'x':
956                                 c = ParseHex (pattern, ref ptr, 2);
957                                 if (c < 0)
958                                         throw NewParseException ("Insufficient hex digits");
959
960                                 return c;
961
962                         case 'u':
963                                 c = ParseHex (pattern, ref ptr, 4);
964                                 if (c < 0)
965                                         throw NewParseException ("Insufficient hex digits");
966                                 
967                                 return c;
968
969                         // control characters
970
971                         case 'c':
972                                 c = pattern[ptr ++];
973                                 if (c >= '@' && c <= '_')
974                                         return c - '@';
975                                 else
976                                         throw NewParseException ("Unrecognized control character.");
977                         }
978
979                         // unknown escape
980                         ptr = p;
981                         return -1;
982                 }
983
984                 private string ParseName () {
985                         return Parser.ParseName (pattern, ref ptr);
986                 }
987
988                 private static bool IsNameChar (char c) {
989                         UnicodeCategory cat = Char.GetUnicodeCategory (c);
990                         if (cat == UnicodeCategory.ModifierLetter)
991                                 return false;
992                         if (cat == UnicodeCategory.ConnectorPunctuation)
993                                 return true;
994                         return Char.IsLetterOrDigit (c);
995                 }
996         
997                 private int ParseNumber (int b, int min, int max) {
998                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);
999                 }
1000
1001                 private static int ParseDigit (char c, int b, int n) {
1002                         switch (b) {
1003                         case 8:
1004                                 if (c >= '0' && c <= '7')
1005                                         return c - '0';
1006                                 else
1007                                         return -1;
1008                         case 10:
1009                                 if (c >= '0' && c <= '9')
1010                                         return c - '0';
1011                                 else
1012                                         return -1;
1013                         case 16:
1014                                 if (c >= '0' && c <= '9')
1015                                         return c - '0';
1016                                 else if (c >= 'a' && c <= 'f')
1017                                         return 10 + c - 'a';
1018                                 else if (c >= 'A' && c <= 'F')
1019                                         return 10 + c - 'A';
1020                                 else
1021                                         return -1;
1022                         default:
1023                                 return -1;
1024                         }
1025                 }
1026
1027                 private void ConsumeWhitespace (bool ignore) {
1028                         while (ptr < pattern.Length) {
1029                                 if (pattern[ptr] == '(') {
1030                                         if (ptr + 3 >= pattern.Length)
1031                                                 return;
1032
1033                                         if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1034                                                 return;
1035
1036                                         ptr += 3;
1037                                         while (ptr < pattern.Length && pattern[ptr ++] != ')')
1038                                                 /* ignore */ ;
1039                                 }
1040                                 else if (ignore && pattern[ptr] == '#') {
1041                                         while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1042                                                 /* ignore */ ;
1043                                 }
1044                                 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1045                                         while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1046                                                 ++ ptr;
1047                                 }
1048                                 else
1049                                         return;
1050                         }
1051                 }
1052
1053                 private string ParseString (string pattern) {
1054                         this.pattern = pattern;
1055                         this.ptr = 0;
1056
1057                         StringBuilder result = new StringBuilder (pattern.Length);
1058                         while (ptr < pattern.Length) {
1059                                 int c = pattern[ptr ++];
1060                                 if (c == '\\') {
1061                                         c = ParseEscape (false);
1062
1063                                         if(c < 0) {
1064                                                 c = pattern[ptr ++];
1065                                                 if(c == 'b')
1066                                                         c = '\b';
1067                                         }
1068                                 }
1069                                 result.Append ((char) c);
1070                         }
1071
1072                         return result.ToString ();
1073                 }
1074
1075                 private void ResolveReferences ()
1076                 {
1077                         int gid = 1;
1078                         Hashtable dict = new Hashtable ();
1079                         ArrayList explicit_numeric_groups = null;
1080
1081                         // number unnamed groups
1082
1083                         foreach (CapturingGroup group in caps) {
1084                                 if (group.Name != null)
1085                                         continue;
1086
1087                                 dict.Add (gid.ToString (), group);
1088                                 group.Index = gid ++;
1089                                 ++ num_groups;
1090                         }
1091
1092                         // number named groups
1093
1094                         foreach (CapturingGroup group in caps) {
1095                                 if (group.Name == null)
1096                                         continue;
1097
1098                                 if (dict.Contains (group.Name)) {
1099                                         CapturingGroup prev = (CapturingGroup) dict [group.Name];
1100                                         group.Index = prev.Index;
1101
1102                                         if (group.Index == gid)
1103                                                 gid ++;
1104                                         else if (group.Index > gid)
1105                                                 explicit_numeric_groups.Add (group);
1106                                         continue;
1107                                 }
1108
1109                                 if (Char.IsDigit (group.Name [0])) {
1110                                         int ptr = 0;
1111                                         int group_gid = ParseDecimal (group.Name, ref ptr);
1112                                         if (ptr == group.Name.Length) {
1113                                                 group.Index = group_gid;
1114                                                 dict.Add (group.Name, group);
1115                                                 ++ num_groups;
1116
1117                                                 if (group_gid == gid) {
1118                                                         gid ++;
1119                                                 } else {
1120                                                         // all numbers before 'gid' are already in the dictionary.  So, we know group_gid > gid
1121                                                         if (explicit_numeric_groups == null)
1122                                                                 explicit_numeric_groups = new ArrayList (4);
1123                                                         explicit_numeric_groups.Add (group);
1124                                                 }
1125
1126                                                 continue;
1127                                         }
1128                                 }
1129
1130                                 string gid_s = gid.ToString ();
1131                                 while (dict.Contains (gid_s))
1132                                         gid_s = (++gid).ToString ();
1133
1134                                 dict.Add (gid_s, group);
1135                                 dict.Add (group.Name, group);
1136                                 group.Index = gid ++;
1137                                 ++ num_groups;
1138                         }
1139
1140                         gap = gid; // == 1 + num_groups, if explicit_numeric_groups == null
1141
1142                         if (explicit_numeric_groups != null)
1143                                 HandleExplicitNumericGroups (explicit_numeric_groups);
1144
1145                         // resolve references
1146
1147                         foreach (Expression expr in refs.Keys) {
1148                                 string name = (string) refs [expr];
1149                                 if (!dict.Contains (name)) {
1150                                         if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
1151                                                 continue;
1152                                         BackslashNumber bn = expr as BackslashNumber;
1153                                         if (bn != null && bn.ResolveReference (name, dict))
1154                                                 continue;
1155                                         throw NewParseException ("Reference to undefined group " +
1156                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +
1157                                                 name);
1158                                 }
1159
1160                                 CapturingGroup group = (CapturingGroup)dict[name];
1161                                 if (expr is Reference)
1162                                         ((Reference)expr).CapturingGroup = group;
1163                                 else if (expr is CaptureAssertion)
1164                                         ((CaptureAssertion)expr).CapturingGroup = group;
1165                                 else if (expr is BalancingGroup)
1166                                         ((BalancingGroup)expr).Balance = group;
1167                         }
1168                 }
1169
1170                 private void HandleExplicitNumericGroups (ArrayList explicit_numeric_groups)
1171                 {
1172                         int gid = gap;
1173                         int i = 0;
1174                         int n_explicit = explicit_numeric_groups.Count;
1175
1176                         explicit_numeric_groups.Sort ();
1177
1178                         // move 'gap' forward to skip over all explicit groups that
1179                         // turn out to match their index
1180                         for (; i < n_explicit; ++i) {
1181                                 CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
1182                                 if (g.Index > gid)
1183                                         break;
1184                                 if (g.Index == gid)
1185                                         gid ++;
1186                         }
1187
1188                         gap = gid;
1189
1190                         // re-index all further groups so that the indexes are contiguous
1191                         int prev = gid;
1192                         for (; i < n_explicit; ++i) {
1193                                 CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
1194                                 if (g.Index == prev) {
1195                                         g.Index = gid - 1;
1196                                 } else {
1197                                         prev = g.Index;
1198                                         g.Index = gid ++;
1199                                 }
1200                         }
1201                 }
1202
1203                 // flag helper functions
1204
1205                 private static bool IsIgnoreCase (RegexOptions options) {
1206                         return (options & RegexOptions.IgnoreCase) != 0;
1207                 }
1208
1209                 private static bool IsMultiline (RegexOptions options) {
1210                         return (options & RegexOptions.Multiline) != 0;
1211                 }
1212
1213                 private static bool IsExplicitCapture (RegexOptions options) {
1214                         return (options & RegexOptions.ExplicitCapture) != 0;
1215                 }
1216         
1217                 private static bool IsSingleline (RegexOptions options) {
1218                         return (options & RegexOptions.Singleline) != 0;
1219                 }
1220
1221                 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1222                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1223                 }
1224
1225                 private static bool IsECMAScript (RegexOptions options) {
1226                         return (options & RegexOptions.ECMAScript) != 0;
1227                 }
1228
1229                 // exception creation
1230
1231                 private ArgumentException NewParseException (string msg) {
1232                         msg = "parsing \"" + pattern + "\" - " + msg;
1233                         return new ArgumentException (msg, pattern);
1234                 }
1235
1236                 private string pattern;
1237                 private int ptr;
1238
1239                 private ArrayList caps;
1240                 private Hashtable refs;
1241                 private int num_groups;
1242                 private int gap;
1243         }
1244 }