* parser.cs: Use the group number as the name in mapping. Patch by Gert Driesen.
[mono.git] / mcs / class / System / System.Text.RegularExpressions / parser.cs
1 //\r
2 // assembly:    System\r
3 // namespace:   System.Text.RegularExpressions\r
4 // file:        parser.cs\r
5 //\r
6 // author:      Dan Lewis (dlewis@gmx.co.uk)\r
7 //              (c) 2002\r
8 \r
9 using System;\r
10 using System.Collections;\r
11 using System.Globalization;\r
12 \r
13 namespace System.Text.RegularExpressions.Syntax {\r
14 \r
15         class Parser {\r
// Reads a base-10 number from str starting at ptr. Delegates to
// ParseNumber, requiring at least one digit and placing no practical
// limit (Int32.MaxValue) on the digit count.
public static int ParseDecimal (string str, ref int ptr) {
        const int radix = 10;
        int value = ParseNumber (str, ref ptr, radix, 1, Int32.MaxValue);
        return value;
}
19 \r
// Reads a base-8 number from str starting at ptr. Delegates to
// ParseNumber, accepting between one and three digits.
public static int ParseOctal (string str, ref int ptr) {
        const int radix = 8;
        int value = ParseNumber (str, ref ptr, radix, 1, 3);
        return value;
}
23 \r
// Reads a base-16 number from str starting at ptr. The same value is
// passed to ParseNumber as both the minimum and the maximum digit
// count, so exactly `digits` digits are expected.
public static int ParseHex (string str, ref int ptr, int digits) {
        const int radix = 16;
        int value = ParseNumber (str, ref ptr, radix, digits, digits);
        return value;
}
27 \r
// Reads a number in base b from str starting at ptr.
//
//   min - fewest digits that must be present
//   max - most digits that will be consumed; a value below min is
//         treated as "unlimited" (Int32.MaxValue)
//
// Returns the parsed value and moves ptr past the consumed digits.
// When fewer than min digits are found, returns -1 and leaves ptr
// where it was.
public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
        if (max < min)
                max = Int32.MaxValue;

        int pos = ptr;
        int value = 0;
        int count = 0;

        for (; count < max && pos < str.Length; ++ count, ++ pos) {
                // ParseDigit reports a character that is not a digit with a negative result
                int digit = ParseDigit (str[pos], b, count);
                if (digit < 0)
                        break;

                value = value * b + digit;
        }

        if (count < min)
                return -1;

        ptr = pos;
        return value;
}
50 \r
// Reads a group name from str starting at ptr.
//
// A name that begins with a digit is read as a decimal group number
// and returned in its string form; a result that is not positive
// yields null. Otherwise the longest run of characters accepted by
// IsNameChar is returned, or null when that run is empty. ptr is
// advanced past whatever was consumed.
//
// No bounds check is made on str: running off the end raises
// IndexOutOfRangeException.
public static string ParseName (string str, ref int ptr) {
        if (Char.IsDigit (str[ptr])) {
                int number = ParseNumber (str, ref ptr, 10, 1, 0);
                return number > 0 ? number.ToString () : null;
        }

        int begin = ptr;
        while (IsNameChar (str[ptr]))
                ++ ptr;

        int length = ptr - begin;
        return length > 0 ? str.Substring (begin, length) : null;
}
72 \r
// Returns str with regular-expression metacharacters escaped.
//
// The characters \ * + ? | { [ ( ) ^ $ . # and the space are prefixed
// with a backslash; tab, newline, carriage return and form feed are
// replaced by \t, \n, \r and \f. Every other character is copied
// unchanged.
//
// The result is accumulated in a StringBuilder so that the cost is
// linear in the input length rather than quadratic, as it would be
// with repeated string concatenation.
public static string Escape (string str) {
        System.Text.StringBuilder result = new System.Text.StringBuilder (str.Length);

        for (int i = 0; i < str.Length; ++ i) {
                char c = str[i];
                switch (c) {
                case '\\': case '*': case '+': case '?': case '|':
                case '{': case '[': case '(': case ')': case '^':
                case '$': case '.': case '#': case ' ':
                        result.Append ('\\').Append (c);
                        break;

                case '\t': result.Append ("\\t"); break;
                case '\n': result.Append ("\\n"); break;
                case '\r': result.Append ("\\r"); break;
                case '\f': result.Append ("\\f"); break;

                default: result.Append (c); break;
                }
        }

        return result.ToString ();
}
95 \r
// Converts escape sequences in str back to the characters they stand
// for by running ParseString on a fresh Parser instance.
public static string Unescape (string str) {
        Parser parser = new Parser ();
        return parser.ParseString (str);
}
99 \r
100                 // public instance\r
101 \r
// Creates a parser with empty bookkeeping tables.
public Parser () {
        // groups registered while parsing (CapturingGroup / BalancingGroup)
        caps = new ArrayList ();
        // expressions mapped to the group name they refer to
        refs = new Hashtable ();
}
106 \r
// Parses pattern into an expression tree under the given options.
//
// All parser state is reset first, so one Parser can be used for
// several patterns in turn. Running past the end of the pattern
// surfaces from the sub-parsers as IndexOutOfRangeException, which is
// translated here into a parse error.
public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
        // start from a clean slate
        this.pattern = pattern;
        this.ptr = 0;
        this.num_groups = 0;
        caps.Clear ();
        refs.Clear ();

        try {
                RegularExpression root = new RegularExpression ();

                ParseGroup (root, options, null);
                ResolveReferences ();
                root.GroupCount = num_groups;

                return root;
        }
        catch (IndexOutOfRangeException) {
                throw NewParseException ("Unexpected end of pattern.");
        }
}
128 \r
// Builds the table that maps group names to group numbers.
//
// "0" always maps to 0. Each group registered in caps is entered
// under its name; a group without a name, or whose name is already
// present, is entered under its number in string form instead.
//
// Hashtable.Add throws ArgumentException when the key already exists,
// so both insertions are guarded: a key that is already in the table
// keeps its first value instead of aborting the whole mapping.
public IDictionary GetMapping () {
        Hashtable mapping = new Hashtable ();
        mapping.Add ("0", 0);

        foreach (CapturingGroup group in caps) {
                string key = group.Name;
                if (key == null || mapping.Contains (key))
                        key = group.Number.ToString ();

                if (!mapping.Contains (key))
                        mapping.Add (key, group.Number);
        }

        return mapping;
}
144 \r
145                 // private methods\r
146 \r
// Parses pattern text from the current position up to the matching
// ')' or the end of the pattern and appends the result to group.
//
//   group     - receives the parsed content; a RegularExpression
//               marks the top level, where ')' is an error
//   options   - options in effect; inline constructs such as (?i)
//               change them for the remainder of this group
//   assertion - when non-null the group is the body of a conditional:
//               '|' then separates its true and false branches
//               instead of building an alternation
//
// Runs of plain characters are collected in `literal` and emitted as
// a single Literal as soon as something else has to be appended.
// Throws a parse exception for stray quantifiers, unbalanced
// parentheses and too many branches in a conditional.
private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
        bool is_top_level = group is RegularExpression;

        Alternation alternation = null;
        string literal = null;

        Group current = new Group ();
        Expression expr = null;
        bool closed = false;

        while (true) {
                ConsumeWhitespace (IsIgnorePatternWhitespace (options));
                if (ptr >= pattern.Length)
                        break;

                // (1) Parse for Expressions

                char ch = pattern[ptr ++];

                switch (ch) {
                case '^': {
                        Position pos =
                                IsMultiline (options) ? Position.StartOfLine : Position.Start;
                        expr = new PositionAssertion (pos);
                        break;
                }

                case '$': {
                        Position pos =
                                IsMultiline (options) ? Position.EndOfLine : Position.End;
                        expr = new PositionAssertion (pos);
                        break;
                }

                case '.': {
                        Category cat =
                                IsSingleline (options) ? Category.AnySingleline : Category.Any;
                        expr = new CharacterClass (cat, false);
                        break;
                }

                case '\\': {
                        // a non-negative result is a plain character escape
                        int c = ParseEscape ();
                        if (c >= 0)
                                ch = (char)c;
                        else {
                                expr = ParseSpecial (options);

                                if (expr == null)
                                        ch = pattern[ptr ++];           // default escape
                        }
                        break;
                }

                case '[': {
                        expr = ParseCharacterClass (options);
                        break;
                }

                case '(': {
                        bool ignore = IsIgnoreCase (options);
                        expr = ParseGroupingConstruct (ref options);
                        if (expr == null) {
                                // The construct produced no expression (it changed the
                                // options or was a comment). If case sensitivity changed,
                                // the pending literal must be emitted now, using the
                                // setting that was in effect while it was collected.
                                if (literal != null && IsIgnoreCase (options) != ignore) {
                                        current.AppendExpression (new Literal (literal, ignore));
                                        literal = null;
                                }

                                continue;
                        }
                        break;
                }

                case ')': {
                        closed = true;
                        goto EndOfGroup;
                }

                case '|': {
                        if (literal != null) {
                                current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
                                literal = null;
                        }

                        if (assertion != null) {
                                // The last branch is always stored at EndOfGroup, so a
                                // conditional can hold a single '|'. Accepting a second
                                // one would store a branch that EndOfGroup then overwrites.
                                if (assertion.TrueExpression == null)
                                        assertion.TrueExpression = current;
                                else
                                        throw NewParseException ("Too many | in (?()|).");
                        }
                        else {
                                if (alternation == null)
                                        alternation = new Alternation ();

                                alternation.AddAlternative (current);
                        }

                        current = new Group ();
                        continue;
                }

                case '*': case '+': case '?': {
                        // a quantifier with nothing in front of it
                        throw NewParseException ("Bad quantifier.");
                }

                default:
                        break;          // literal character
                }

                ConsumeWhitespace (IsIgnorePatternWhitespace (options));

                // (2) Check for Repetitions

                if (ptr < pattern.Length) {
                        char k = pattern[ptr];

                        if (k == '?' || k == '*' || k == '+' || k == '{') {
                                ++ ptr;

                                int min = 0, max = 0;
                                bool lazy = false;

                                switch (k) {
                                case '?': min = 0; max = 1; break;
                                case '*': min = 0; max = 0xffff; break;
                                case '+': min = 1; max = 0xffff; break;
                                case '{': ParseRepetitionBounds (out min, out max, options); break;
                                }

                                // a '?' directly after the quantifier makes it lazy
                                ConsumeWhitespace (IsIgnorePatternWhitespace (options));
                                if (ptr < pattern.Length && pattern[ptr] == '?') {
                                        ++ ptr;
                                        lazy = true;
                                }

                                Repetition repetition = new Repetition (min, max, lazy);

                                // a quantified plain character repeats only that character,
                                // not the literal collected so far
                                if (expr == null)
                                        repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
                                else
                                        repetition.Expression = expr;

                                expr = repetition;
                        }
                }

                // (3) Append Expression and/or Literal

                if (expr == null) {
                        if (literal == null)
                                literal = "";
                        literal += ch;
                }
                else {
                        if (literal != null) {
                                current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
                                literal = null;
                        }

                        current.AppendExpression (expr);
                        expr = null;
                }

                if (is_top_level && ptr >= pattern.Length)
                        goto EndOfGroup;
        }

EndOfGroup:
        if (is_top_level && closed)
                throw NewParseException ("Too many )'s.");
        if (!is_top_level && !closed)
                throw NewParseException ("Not enough )'s.");


        // clean up literals and alternations

        if (literal != null)
                current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));

        if (assertion != null) {
                if (assertion.TrueExpression == null)
                        assertion.TrueExpression = current;
                else
                        assertion.FalseExpression = current;

                group.AppendExpression (assertion);
        }
        else if (alternation != null) {
                alternation.AddAlternative (current);
                group.AppendExpression (alternation);
        }
        else
                group.AppendExpression (current);
}
343 \r
// Parses the construct that follows an already consumed '('.
//
// Returns the expression for the construct, or null when it produces
// none: an inline option change such as (?i), which updates `options`
// for the caller, or a (?#...) comment.
//
// Handled forms: plain group, (?:...), (?>...), option groups,
// lookahead / lookbehind, named and balancing groups with either
// <...> or '...' delimiters, conditionals and comments. Anything else
// raises a parse exception.
private Expression ParseGroupingConstruct (ref RegexOptions options) {
        if (pattern[ptr] != '?') {
                Group group;

                // with ExplicitCapture a plain group does not capture
                if (IsExplicitCapture (options))
                        group = new Group ();
                else {
                        group = new CapturingGroup ();
                        caps.Add (group);
                }

                ParseGroup (group, options, null);
                return group;
        }
        else
                ++ ptr;

        switch (pattern[ptr]) {
        case ':': {                                             // non-capturing group
                ++ ptr;
                Group group = new Group ();
                ParseGroup (group, options, null);

                return group;
        }

        case '>': {                                             // non-backtracking group
                ++ ptr;
                Group group = new NonBacktrackingGroup ();
                ParseGroup (group, options, null);

                return group;
        }

        case 'i': case 'm': case 'n':
        case 's': case 'x': case '-': {                         // options
                // work on a copy: the caller's options change only for (?opts)
                RegexOptions o = options;
                ParseOptions (ref o, false);
                if (pattern[ptr] == '-') {
                        ++ ptr;
                        ParseOptions (ref o, true);
                }

                if (pattern[ptr] == ':') {                      // pass options to child group
                        ++ ptr;
                        Group group = new Group ();
                        ParseGroup (group, o, null);
                        return group;
                }
                else if (pattern[ptr] == ')') {                 // change options of enclosing group
                        ++ ptr;
                        options = o;
                        return null;
                }
                else
                        throw NewParseException ("Bad options");
        }

        case '<': case '=': case '!': {                         // lookahead/lookbehind
                ExpressionAssertion asn = new ExpressionAssertion ();
                if (!ParseAssertionType (asn))
                        goto case '\'';                         // it's a (?<name> ) construct

                Group test = new Group ();
                ParseGroup (test, options, null);

                asn.TestExpression = test;
                return asn;
        }

        case '\'': {                                            // named/balancing group
                char delim;
                if (pattern[ptr] == '<')
                        delim = '>';
                else
                        delim = '\'';

                ++ ptr;
                string name = ParseName ();

                if (pattern[ptr] == delim) {
                        // capturing group

                        if (name == null)
                                throw NewParseException ("Bad group name.");

                        ++ ptr;
                        CapturingGroup cap = new CapturingGroup ();
                        cap.Name = name;
                        caps.Add (cap);
                        ParseGroup (cap, options, null);

                        return cap;
                }
                else if (pattern[ptr] == '-') {
                        // balancing group

                        ++ ptr;
                        string balance_name = ParseName ();
                        if (balance_name == null || pattern[ptr] != delim)
                                throw NewParseException ("Bad balancing group name.");

                        ++ ptr;
                        BalancingGroup bal = new BalancingGroup ();
                        bal.Name = name;
                        caps.Add (bal);
                        refs.Add (bal, balance_name);

                        // Parse the body like every other group form. Without
                        // this the body would be read as part of the enclosing
                        // group and its ')' would close that group instead.
                        ParseGroup (bal, options, null);

                        return bal;
                }
                else
                        throw NewParseException ("Bad group name.");
        }

        case '(': {                                             // expression/capture test
                Assertion asn;

                ++ ptr;
                int p = ptr;                                    // remembered for backtracking
                string name = ParseName ();
                if (name == null || pattern[ptr] != ')') {      // expression test
                        // FIXME MS implementation doesn't seem to
                        // implement this version of (?(x) ...)

                        ptr = p;
                        ExpressionAssertion expr_asn = new ExpressionAssertion ();

                        if (pattern[ptr] == '?') {
                                ++ ptr;
                                if (!ParseAssertionType (expr_asn))
                                        throw NewParseException ("Bad conditional.");
                        }
                        else {
                                expr_asn.Negate = false;
                                expr_asn.Reverse = false;
                        }

                        Group test = new Group ();
                        ParseGroup (test, options, null);
                        expr_asn.TestExpression = test;
                        asn = expr_asn;
                }
                else {                                          // capture test
                        ++ ptr;
                        asn = new CaptureAssertion ();
                        refs.Add (asn, name);
                }

                // the branches are attached to asn while the body is parsed
                Group group = new Group ();
                ParseGroup (group, options, asn);
                return group;
        }

        case '#': {                                             // comment
                ++ ptr;
                while (pattern[ptr ++] != ')') {
                        if (ptr >= pattern.Length)
                                throw NewParseException ("Unterminated (?#...) comment.");
                }
                return null;
        }

        default:                                                // error
                throw NewParseException ("Bad grouping construct.");
        }
}
510 \r
// Recognises the marker of a lookaround at the current position:
// "=" or "!" for lookahead, "<=" or "<!" for lookbehind.
//
// On success the Negate and Reverse flags of assertion are set, ptr
// is moved past the marker and true is returned. Otherwise false is
// returned and neither assertion nor ptr is touched.
private bool ParseAssertionType (ExpressionAssertion assertion) {
        bool behind = pattern[ptr] == '<';
        char kind = pattern[behind ? ptr + 1 : ptr];

        if (kind != '=' && kind != '!')
                return false;

        assertion.Negate = kind == '!';
        assertion.Reverse = behind;
        ptr += behind ? 2 : 1;

        return true;
}
545 \r
// Consumes a run of inline option letters (i, m, n, s, x) at the
// current position and applies each to options: the flag is cleared
// when negate is true and set otherwise. Stops, without consuming it,
// at the first character that is not an option letter.
private void ParseOptions (ref RegexOptions options, bool negate) {
        for (;; ++ ptr) {
                RegexOptions flag;

                switch (pattern[ptr]) {
                case 'i': flag = RegexOptions.IgnoreCase; break;
                case 'm': flag = RegexOptions.Multiline; break;
                case 'n': flag = RegexOptions.ExplicitCapture; break;
                case 's': flag = RegexOptions.Singleline; break;
                case 'x': flag = RegexOptions.IgnorePatternWhitespace; break;
                default:
                        return;
                }

                if (negate)
                        options &= ~flag;
                else
                        options |= flag;
        }
}
591 \r
592                 private Expression ParseCharacterClass (RegexOptions options) {\r
593                         bool negate, ecma;\r
594                         if (pattern[ptr] == '^') {\r
595                                 negate = true;\r
596                                 ++ ptr;\r
597                         }\r
598                         else\r
599                                 negate = false;\r
600                         \r
601                         ecma = IsECMAScript (options);\r
602                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));\r
603 \r
604                         if (pattern[ptr] == ']') {\r
605                                 cls.AddCharacter (']');\r
606                                 ++ ptr;\r
607                         }\r
608 \r
609                         int c = -1;\r
610                         int last = -1;\r
611                         bool range = false;\r
612                         bool closed = false;\r
613                         while (ptr < pattern.Length) {\r
614                                 c = pattern[ptr ++];\r
615 \r
616                                 if (c == ']') {\r
617                                         closed = true;\r
618                                         break;\r
619                                 }\r
620                                 \r
621                                 if (c == '-') {\r
622                                         range = true;\r
623                                         continue;\r
624                                 }\r
625 \r
626                                 if (c == '\\') {\r
627                                         c = ParseEscape ();\r
628                                         if (c < 0) {\r
629                                                 // didn't recognize escape\r
630 \r
631                                                 c = pattern[ptr ++];\r
632                                                 switch (c) {\r
633                                                 case 'b': c = '\b'; break;\r
634 \r
635                                                 case 'd':\r
636                                                         cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);\r
637                                                         last = -1;\r
638                                                         continue;\r
639                                                         \r
640                                                 case 'w':\r
641                                                         cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);\r
642                                                         last = -1;\r
643                                                         continue;\r
644                                                         \r
645                                                 case 's':\r
646                                                         cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);\r
647                                                         last = -1;\r
648                                                         continue;\r
649                                                         \r
650                                                 case 'p':\r
651                                                         cls.AddCategory (ParseUnicodeCategory (), false);       // ignore ecma\r
652                                                         last = -1;\r
653                                                         continue;\r
654                                                         \r
655                                                 case 'D':\r
656                                                         cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);\r
657                                                         last = -1;\r
658                                                         continue;\r
659                                                         \r
660                                                 case 'W':\r
661                                                         cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);\r
662                                                         last = -1;\r
663                                                         continue;\r
664                                                         \r
665                                                 case 'S':\r
666                                                         cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);\r
667                                                         last = -1;\r
668                                                         continue;\r
669                                                         \r
670                                                 case 'P':\r
671                                                         cls.AddCategory (ParseUnicodeCategory (), true);\r
672                                                         last = -1;\r
673                                                         continue;\r
674 \r
675                                                 default: break;         // add escaped character\r
676                                                 }\r
677                                         }\r
678                                 }\r
679 \r
680                                 if (range) {\r
681                                         if (c < last)\r
682                                                 throw NewParseException ("[x-y] range in reverse order.");\r
683 \r
684                                         if (last >=0 )\r
685                                                 cls.AddRange ((char)last, (char)c);\r
686                                         else {\r
687                                                 cls.AddCharacter ((char)c);\r
688                                                 cls.AddCharacter ('-');\r
689                                         }\r
690 \r
691                                         range = false;\r
692                                         last = -1;\r
693                                 }\r
694                                 else {\r
695                                         cls.AddCharacter ((char)c);\r
696                                         last = c;\r
697                                 }\r
698                         }\r
699 \r
700                         if (!closed)\r
701                                 throw NewParseException ("Unterminated [] set.");\r
702 \r
703                         if (range)\r
704                                 cls.AddCharacter ('-');\r
705 \r
706                         return cls;\r
707                 }\r
708 \r
709                 private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {\r
710                         int n, m;\r
711 \r
712                         /* check syntax */\r
713 \r
714                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
715                         n = ParseNumber (10, 1, 0);\r
716                         if (n < 0)\r
717                                 throw NewParseException ("Illegal {x,y} - bad value of x.");\r
718 \r
719                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
720                         switch (pattern[ptr ++]) {\r
721                         case '}':\r
722                                 m = n;\r
723                                 break;\r
724                         case ',':\r
725                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
726                                 m = ParseNumber (10, 1, 0);\r
727                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
728                                 if (pattern[ptr ++] != '}')\r
729                                         throw NewParseException ("Illegal {x,y} - bad value of y.");\r
730                                 break;\r
731                         default:\r
732                                 throw NewParseException ("Illegal {x,y}");\r
733                         }\r
734 \r
735                         /* check bounds and ordering */\r
736 \r
737                         if (n >= 0xffff || m >= 0xffff)\r
738                                 throw NewParseException ("Illegal {x, y} - maximum of 65535.");\r
739                         if (m >= 0 && m < n)\r
740                                 throw NewParseException ("Illegal {x, y} with x > y.");\r
741 \r
742                         /* assign min and max */\r
743                         \r
744                         min = n;\r
745                         if (m > 0)\r
746                                 max = m;\r
747                         else\r
748                                 max = 0xffff;\r
749                 }\r
750 \r
751                 private Category ParseUnicodeCategory () {\r
752                         if (pattern[ptr ++] != '{')\r
753                                 throw NewParseException ("Incomplete \\p{X} character escape.");\r
754 \r
755                         string name = ParseName (pattern, ref ptr);\r
756                         if (name == null)\r
757                                 throw NewParseException ("Incomplete \\p{X} character escape.");\r
758 \r
759                         Category cat = CategoryUtils.CategoryFromName (name);\r
760                         if (cat == Category.None)\r
761                                 throw NewParseException ("Unknown property '" + name + "'.");\r
762 \r
763                         if (pattern[ptr ++] != '}')\r
764                                 throw NewParseException ("Incomplete \\p{X} character escape.");\r
765 \r
766                         return cat;\r
767                 }\r
768 \r
769                 private Expression ParseSpecial (RegexOptions options) {\r
770                         int p = ptr;\r
771                         bool ecma = IsECMAScript (options);\r
772                         Expression expr = null;\r
773                         \r
774                         switch (pattern[ptr ++]) {\r
775 \r
776                         // categories\r
777 \r
778                         case 'd':\r
779                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);\r
780                                 break;\r
781                                 \r
782                         case 'w':\r
783                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);\r
784                                 break;\r
785                                 \r
786                         case 's':\r
787                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);\r
788                                 break;\r
789                                 \r
790                         case 'p':\r
791                                 // this is odd - ECMAScript isn't supposed to support Unicode,\r
792                                 // yet \p{..} compiles and runs under the MS implementation\r
793                                 // identically to canonical mode. That's why I'm ignoring the\r
794                                 // value of ecma here.\r
795                         \r
796                                 expr = new CharacterClass (ParseUnicodeCategory (), false);\r
797                                 break;\r
798                                 \r
799                         case 'D':\r
800                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);\r
801                                 break;\r
802                                 \r
803                         case 'W':\r
804                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);\r
805                                 break;\r
806                                 \r
807                         case 'S':\r
808                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);\r
809                                 break;\r
810                                 \r
811                         case 'P':\r
812                                 expr = new CharacterClass (ParseUnicodeCategory (), true);\r
813                                 break;\r
814 \r
815                         // positions\r
816 \r
817                         case 'A': expr = new PositionAssertion (Position.StartOfString); break;\r
818                         case 'Z': expr = new PositionAssertion (Position.End); break;\r
819                         case 'z': expr = new PositionAssertion (Position.EndOfString); break;\r
820                         case 'G': expr = new PositionAssertion (Position.StartOfScan); break;\r
821                         case 'b': expr = new PositionAssertion (Position.Boundary); break;\r
822                         case 'B': expr = new PositionAssertion (Position.NonBoundary); break;\r
823                         \r
824                         // references\r
825 \r
826                         case '1': case '2': case '3': case '4': case '5':\r
827                         case '6': case '7': case '8': case '9': {\r
828                                 ptr --;\r
829                                 int n = ParseNumber (10, 1, 0);\r
830                                 if (n < 0) {\r
831                                         ptr = p;\r
832                                         return null;\r
833                                 }\r
834 \r
835                                 // FIXME test if number is within number of assigned groups\r
836                                 // this may present a problem for right-to-left matching\r
837 \r
838                                 Reference reference = new Reference (IsIgnoreCase (options));\r
839                                 refs.Add (reference, n.ToString ());\r
840                                 expr = reference;\r
841                                 break;\r
842                         }\r
843 \r
844                         case 'k': {\r
845                                 char delim = pattern[ptr ++];\r
846                                 if (delim == '<')\r
847                                         delim = '>';\r
848                                 else if (delim != '\'')\r
849                                         throw NewParseException ("Malformed \\k<...> named backreference.");\r
850 \r
851                                 string name = ParseName ();\r
852                                 if (name == null || pattern[ptr] != delim)\r
853                                         throw NewParseException ("Malformed \\k<...> named backreference.");\r
854 \r
855                                 ++ ptr;\r
856                                 Reference reference = new Reference (IsIgnoreCase (options));\r
857                                 refs.Add (reference, name);\r
858                                 expr = reference;\r
859                                 break;\r
860                         }\r
861 \r
862                         default:\r
863                                 expr = null;\r
864                                 break;\r
865                         }\r
866 \r
867                         if (expr == null)\r
868                                 ptr = p;\r
869 \r
870                         return expr;\r
871                 }\r
872 \r
873                 private int ParseEscape () {\r
874                         int p = ptr;\r
875                         int c;\r
876 \r
877                         if (p >= pattern.Length)\r
878                                 throw new ArgumentException (\r
879                                                 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + \r
880                                                                 "pattern.", pattern), pattern);\r
881                         \r
882                         switch (pattern[ptr ++]) {\r
883         \r
884                         // standard escapes (except \b)\r
885 \r
886                         case 'a': return '\u0007';\r
887                         case 't': return '\u0009';\r
888                         case 'r': return '\u000d';\r
889                         case 'v': return '\u000b';\r
890                         case 'f': return '\u000c';\r
891                         case 'n': return '\u000a';\r
892                         case 'e': return '\u001b';\r
893                         case '\\': return '\\';\r
894 \r
895                         // character codes\r
896 \r
897                         case '0':\r
898                                 int prevptr = ptr;\r
899                                 int result = ParseOctal (pattern, ref ptr);\r
900                                 if (result == -1 && prevptr == ptr)\r
901                                         return 0;\r
902 \r
903                                 return result;\r
904 \r
905                         case 'x':\r
906                                 c = ParseHex (pattern, ref ptr, 2);\r
907                                 if (c < 0)\r
908                                         throw NewParseException ("Insufficient hex digits");\r
909 \r
910                                 return c;\r
911 \r
912                         case 'u':\r
913                                 c = ParseHex (pattern, ref ptr, 4);\r
914                                 if (c < 0)\r
915                                         throw NewParseException ("Insufficient hex digits");\r
916                                 \r
917                                 return c;\r
918 \r
919                         // control characters\r
920 \r
921                         case 'c':\r
922                                 c = pattern[p ++];\r
923                                 if (c >= 'A' && c <= 'Z')\r
924                                         return c - 'A';\r
925                                 else if (c >= '@' && c <= '_')\r
926                                         return c - '@';\r
927                                 else\r
928                                         throw NewParseException ("Unrecognized control character.");\r
929 \r
930                         // unknown escape\r
931 \r
932                         default:\r
933                                 ptr = p;\r
934                                 return -1;\r
935                         }\r
936                 }\r
937 \r
938                 private string ParseName () {\r
939                         return Parser.ParseName (pattern, ref ptr);\r
940                 }\r
941 \r
942                 private static bool IsNameChar (char c) {\r
943                         UnicodeCategory cat = Char.GetUnicodeCategory (c);\r
944                         if (cat == UnicodeCategory.ModifierLetter)\r
945                                 return false;\r
946                         if (cat == UnicodeCategory.ConnectorPunctuation)\r
947                                 return true;\r
948                         return Char.IsLetterOrDigit (c);\r
949                 }\r
950         \r
951                 private int ParseNumber (int b, int min, int max) {\r
952                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);\r
953                 }\r
954 \r
955                 private int ParseDecimal () {\r
956                         return Parser.ParseDecimal (pattern, ref ptr);\r
957                 }\r
958 \r
959                 private static int ParseDigit (char c, int b, int n) {\r
960                         switch (b) {\r
961                         case 8:\r
962                                 if (c >= '0' && c <= '7')\r
963                                         return c - '0';\r
964                                 else\r
965                                         return -1;\r
966                         case 10:\r
967                                 if (c >= '0' && c <= '9')\r
968                                         return c - '0';\r
969                                 else\r
970                                         return -1;\r
971                         case 16:\r
972                                 if (c >= '0' && c <= '9')\r
973                                         return c - '0';\r
974                                 else if (c >= 'a' && c <= 'f')\r
975                                         return 10 + c - 'a';\r
976                                 else if (c >= 'A' && c <= 'F')\r
977                                         return 10 + c - 'A';\r
978                                 else\r
979                                         return -1;\r
980                         default:\r
981                                 return -1;\r
982                         }\r
983                 }\r
984 \r
985                 private void ConsumeWhitespace (bool ignore) {\r
986                         while (true) {\r
987                                 if (ptr >= pattern.Length)\r
988                                         break;\r
989                         \r
990                                 if (pattern[ptr] == '(') {\r
991                                         if (ptr + 3 >= pattern.Length)\r
992                                                 return;\r
993 \r
994                                         if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')\r
995                                                 return;\r
996 \r
997                                         ptr += 3;\r
998                                         while (pattern[ptr ++] != ')')\r
999                                                 /* ignore */ ;\r
1000                                 }\r
1001                                 else if (ignore && pattern[ptr] == '#') {\r
1002                                         while (ptr < pattern.Length && pattern[ptr ++] != '\n')\r
1003                                                 /* ignore */ ;\r
1004                                 }\r
1005                                 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {\r
1006                                         while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))\r
1007                                                 ++ ptr;\r
1008                                 }\r
1009                                 else\r
1010                                         return;\r
1011                         }\r
1012                 }\r
1013 \r
1014                 private string ParseString (string pattern) {\r
1015                         this.pattern = pattern;\r
1016                         this.ptr = 0;\r
1017 \r
1018                         string result = "";\r
1019                         while (ptr < pattern.Length) {\r
1020                                 int c = pattern[ptr];\r
1021                                 if (c == '\\')\r
1022                                         c = ParseEscape ();\r
1023                                 ptr ++; \r
1024                                 result += (char)c;\r
1025                         }\r
1026 \r
1027                         return result;\r
1028                 }\r
1029 \r
1030                 private void ResolveReferences () {\r
1031                         int gid = 1;\r
1032                         Hashtable dict = new Hashtable ();\r
1033 \r
1034                         // number unnamed groups\r
1035 \r
1036                         foreach (CapturingGroup group in caps) {\r
1037                                 if (group.Name == null) {\r
1038                                         dict.Add (gid.ToString (), group);\r
1039                                         group.Number = gid ++;\r
1040 \r
1041                                         ++ num_groups;\r
1042                                 }\r
1043                         }\r
1044 \r
1045                         // number named groups\r
1046 \r
1047                         foreach (CapturingGroup group in caps) {\r
1048                                 if (group.Name != null) {\r
1049                                         if (!dict.Contains (group.Name)) {\r
1050                                                 dict.Add (group.Name, group);\r
1051                                                 group.Number = gid ++;\r
1052 \r
1053                                                 ++ num_groups;\r
1054                                         }\r
1055                                         else {\r
1056                                                 CapturingGroup prev = (CapturingGroup)dict[group.Name];\r
1057                                                 group.Number = prev.Number;\r
1058                                         }\r
1059                                 }\r
1060                         }\r
1061 \r
1062                         // resolve references\r
1063 \r
1064                         foreach (Expression expr in refs.Keys) {\r
1065                                 string name = (string)refs[expr];\r
1066                                 if (!dict.Contains (name)) {\r
1067                                         throw NewParseException ("Reference to undefined group " +\r
1068                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +\r
1069                                                 name);\r
1070                                 }\r
1071 \r
1072                                 CapturingGroup group = (CapturingGroup)dict[name];\r
1073                                 if (expr is Reference)\r
1074                                         ((Reference)expr).CapturingGroup = group;\r
1075                                 else if (expr is CaptureAssertion)\r
1076                                         ((CaptureAssertion)expr).CapturingGroup = group;\r
1077                                 else if (expr is BalancingGroup)\r
1078                                         ((BalancingGroup)expr).Balance = group;\r
1079                         }\r
1080                 }\r
1081 \r
1082                 // flag helper functions\r
1083 \r
1084                 private static bool IsIgnoreCase (RegexOptions options) {\r
1085                         return (options & RegexOptions.IgnoreCase) != 0;\r
1086                 }\r
1087 \r
1088                 private static bool IsMultiline (RegexOptions options) {\r
1089                         return (options & RegexOptions.Multiline) != 0;\r
1090                 }\r
1091 \r
1092                 private static bool IsExplicitCapture (RegexOptions options) {\r
1093                         return (options & RegexOptions.ExplicitCapture) != 0;\r
1094                 }\r
1095         \r
1096                 private static bool IsSingleline (RegexOptions options) {\r
1097                         return (options & RegexOptions.Singleline) != 0;\r
1098                 }\r
1099 \r
1100                 private static bool IsIgnorePatternWhitespace (RegexOptions options) {\r
1101                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;\r
1102                 }\r
1103 \r
1104                 private static bool IsRightToLeft (RegexOptions options) {\r
1105                         return (options & RegexOptions.RightToLeft) != 0;\r
1106                 }\r
1107 \r
1108                 private static bool IsECMAScript (RegexOptions options) {\r
1109                         return (options & RegexOptions.ECMAScript) != 0;\r
1110                 }\r
1111 \r
1112                 // exception creation\r
1113 \r
1114                 private ArgumentException NewParseException (string msg) {\r
1115                         msg = "parsing \"" + pattern + "\" - " + msg;\r
1116                         return new ArgumentException (msg, pattern);\r
1117                 }\r
1118 \r
1119                 private string pattern;\r
1120                 private int ptr;\r
1121 \r
1122                 private ArrayList caps;\r
1123                 private Hashtable refs;\r
1124                 private int num_groups;\r
1125         }\r
1126 }\r