mcs/class/System/System.Text.RegularExpressions/parser.cs

   1 //
   2 // assembly:    System
   3 // namespace:   System.Text.RegularExpressions
   4 // file:        parser.cs
   5 //
   6 // author:      Dan Lewis (dlewis@gmx.co.uk)
   7 //              (c) 2002
   8
   9 //
  10 // Permission is hereby granted, free of charge, to any person obtaining
  11 // a copy of this software and associated documentation files (the
  12 // "Software"), to deal in the Software without restriction, including
  13 // without limitation the rights to use, copy, modify, merge, publish,
  14 // distribute, sublicense, and/or sell copies of the Software, and to
  15 // permit persons to whom the Software is furnished to do so, subject to
  16 // the following conditions:
  17 //
  18 // The above copyright notice and this permission notice shall be
  19 // included in all copies or substantial portions of the Software.
  20 //
  21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  28 //
  29
  30 using System;
  31 using System.Collections;
  32 using System.Globalization;
  33
  34 namespace System.Text.RegularExpressions.Syntax {
  35
  36         class Parser {
  37                 public static int ParseDecimal (string str, ref int ptr) {
  38                         return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
  39                 }
  40
  41                 public static int ParseOctal (string str, ref int ptr) {
  42                         return ParseNumber (str, ref ptr, 8, 1, 3);
  43                 }
  44
  45                 public static int ParseHex (string str, ref int ptr, int digits) {
  46                         return ParseNumber (str, ref ptr, 16, digits, digits);
  47                 }
  48
  49                 public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
  50                         int p = ptr, n = 0, digits = 0, d;
  51                         if (max < min)
  52                                 max = Int32.MaxValue;
  53
  54                         while (digits < max && p < str.Length) {
  55                                 d = ParseDigit (str[p ++], b, digits);
  56                                 if (d < 0) {
  57                                         -- p;
  58                                         break;
  59                                 }
  60
  61                                 n = n * b + d;
  62                                 ++ digits;
  63                         }
  64
  65                         if (digits < min)
  66                                 return -1;
  67
  68                         ptr = p;
  69                         return n;
  70                 }
  71
  72                 public static string ParseName (string str, ref int ptr) {
  73                         if (Char.IsDigit (str[ptr])) {
  74                                 int gid = ParseNumber (str, ref ptr, 10, 1, 0);
  75                                 if (gid > 0)
  76                                         return gid.ToString ();
  77
  78                                 return null;
  79                         }
  80
  81                         int start = ptr;
  82                         for (;;) {
  83                                 if (!IsNameChar (str[ptr]))
  84                                         break;
  85                                 ++ ptr;
  86                         }
  87
  88                         if (ptr - start > 0)
  89                                 return str.Substring (start, ptr - start);
  90
  91                         return null;
  92                 }
  93
  94                 public static string Escape (string str) {
  95                         string result = "";
  96                         for (int i = 0; i < str.Length; ++ i) {
  97                                 char c = str[i];
  98                                 switch (c) {
  99                                 case '\\': case '*': case '+': case '?': case '|':
 100                                 case '{': case '[': case '(': case ')': case '^':
 101                                 case '$': case '.': case '#': case ' ':
 102                                         result += "\\" + c;
 103                                         break;
 104
 105                                 case '\t': result += "\\t"; break;
 106                                 case '\n': result += "\\n"; break;
 107                                 case '\r': result += "\\r"; break;
 108                                 case '\f': result += "\\f"; break;
 109
 110                                 default: result += c; break;
 111                                 }
 112                         }
 113
 114                         return result;
 115                 }
 116
 117                 public static string Unescape (string str) {
 118                         if (str.IndexOf ('\\') == -1)
 119                                 return str;
 120                         return new Parser ().ParseString (str);
 121                 }
 122
 123                 // public instance
 124
 125                 public Parser () {
 126                         this.caps = new ArrayList ();
 127                         this.refs = new Hashtable ();
 128                 }
 129
 130                 public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
 131                         this.pattern = pattern;
 132                         this.ptr = 0;
 133
 134                         caps.Clear ();
 135                         refs.Clear ();
 136                         this.num_groups = 0;
 137
 138                         try {
 139                                 RegularExpression re = new RegularExpression ();
 140                                 ParseGroup (re, options, null);
 141                                 ResolveReferences ();
 142
 143                                 re.GroupCount = num_groups;
 144
 145                                 return re;
 146                         }
 147                         catch (IndexOutOfRangeException) {
 148                                 throw NewParseException ("Unexpected end of pattern.");
 149                         }
 150                 }
 151
 152                 public int GetMapping (Hashtable mapping)
 153                 {
 154                         int end = caps.Count;
 155                         mapping.Add ("0", 0);
 156                         for (int i = 0; i < end; i++) {
 157                                 CapturingGroup group = (CapturingGroup) caps [i];
 158                                 string name = group.Name != null ? group.Name : group.Index.ToString ();
 159                                 if (mapping.Contains (name)) {
 160                                         if ((int) mapping [name] != group.Index)
 161                                                 throw new SystemException ("invalid state");
 162                                         continue;
 163                                 }
 164                                 mapping.Add (name, group.Index);
 165                         }
 166
 167                         return gap;
 168                 }
 169
 170                 // private methods
 171
 172                 private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
 173                         bool is_top_level = group is RegularExpression;
 174
 175                         Alternation alternation = null;
 176                         string literal = null;
 177
 178                         Group current = new Group ();
 179                         Expression expr = null;
 180                         bool closed = false;
 181
 182                         while (true) {
 183                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
 184                                 if (ptr >= pattern.Length)
 185                                         break;
 186
 187                                 // (1) Parse for Expressions
 188
 189                                 char ch = pattern[ptr ++];
 190
 191                                 switch (ch) {
 192                                 case '^': {
 193                                         Position pos =
 194                                                 IsMultiline (options) ? Position.StartOfLine : Position.Start;
 195                                         expr = new PositionAssertion (pos);
 196                                         break;
 197                                 }
 198
 199                                 case '$': {
 200                                         Position pos =
 201                                                 IsMultiline (options) ? Position.EndOfLine : Position.End;
 202                                         expr = new PositionAssertion (pos);
 203                                         break;
 204                                 }
 205
 206                                 case '.': {
 207                                         Category cat =
 208                                                 IsSingleline (options) ? Category.AnySingleline : Category.Any;
 209                                         expr = new CharacterClass (cat, false);
 210                                         break;
 211                                 }
 212
 213                                 case '\\': {
 214                                         int c = ParseEscape (false);
 215                                         if (c >= 0)
 216                                                 ch = (char)c;
 217                                         else {
 218                                                 expr = ParseSpecial (options);
 219
 220                                                 if (expr == null)
 221                                                         ch = pattern[ptr ++];           // default escape
 222                                         }
 223                                         break;
 224                                 }
 225
 226                                 case '[': {
 227                                         expr = ParseCharacterClass (options);
 228                                         break;
 229                                 }
 230
 231                                 case '(': {
 232                                         bool ignore = IsIgnoreCase (options);
 233                                         expr = ParseGroupingConstruct (ref options);
 234                                         if (expr == null) {
 235                                                 if (literal != null && IsIgnoreCase (options) != ignore) {
 236                                                         current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
 237                                                         literal = null;
 238                                                 }
 239
 240                                                 continue;
 241                                         }
 242                                         break;
 243                                 }
 244
 245                                 case ')': {
 246                                         closed = true;
 247                                         goto EndOfGroup;
 248                                 }
 249
 250                                 case '|': {
 251                                         if (literal != null) {
 252                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
 253                                                 literal = null;
 254                                         }
 255
 256                                         if (assertion != null) {
 257                                                 if (assertion.TrueExpression == null)
 258                                                         assertion.TrueExpression = current;
 259                                                 else if (assertion.FalseExpression == null)
 260                                                         assertion.FalseExpression = current;
 261                                                 else
 262                                                         throw NewParseException ("Too many | in (?()|).");
 263                                         }
 264                                         else {
 265                                                 if (alternation == null)
 266                                                         alternation = new Alternation ();
 267
 268                                                 alternation.AddAlternative (current);
 269                                         }
 270
 271                                         current = new Group ();
 272                                         continue;
 273                                 }
 274
 275                                 case '*': case '+': case '?': {
 276                                         throw NewParseException ("Bad quantifier.");
 277                                 }
 278
 279                                 default:
 280                                         break;          // literal character
 281                                 }
 282
 283                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
 284
 285                                 // (2) Check for Repetitions
 286
 287                                 if (ptr < pattern.Length) {
 288                                         char k = pattern[ptr];
 289                                         int min = 0, max = 0;
 290                                         bool lazy = false;
 291                                         bool haveRep = false;
 292
 293
 294                                         if (k == '?' || k == '*' || k == '+') {
 295                                                 ++ ptr;
 296                                                 haveRep = true;
 297
 298                                                 switch (k) {
 299                                                 case '?': min = 0; max = 1; break;
 300                                                 case '*': min = 0; max = 0x7fffffff; break;
 301                                                 case '+': min = 1; max = 0x7fffffff; break;
 302                                                 }
 303                                         } else if (k == '{' && ptr + 1 < pattern.Length) {
 304                                                 int saved_ptr = ptr;
 305                                                 ++ptr;
 306                                                 haveRep = ParseRepetitionBounds (out min, out max, options);
 307                                                 if (!haveRep)
 308                                                         ptr = saved_ptr;
 309                                         }
 310
 311                                         if (haveRep) {
 312                                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
 313                                                 if (ptr < pattern.Length && pattern[ptr] == '?') {
 314                                                         ++ ptr;
 315                                                         lazy = true;
 316                                                 }
 317
 318                                                 //It doesn't make sense to assert a given position more than once.
 319                                                 bool ignore_repetition = false;
 320                                                 if (expr is PositionAssertion) {
 321                                                         ignore_repetition = min > 0 && !lazy;
 322                                                         max = 1;
 323                                                 }
 324
 325                                                 if (!ignore_repetition) {
 326                                                         Repetition repetition = new Repetition (min, max, lazy);
 327
 328                                                         if (expr == null)
 329                                                                 repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
 330                                                         else
 331                                                                 repetition.Expression = expr;
 332
 333                                                         expr = repetition;
 334                                                 }
 335                                         }
 336                                 }
 337
 338                                 // (3) Append Expression and/or Literal
 339
 340                                 if (expr == null) {
 341                                         if (literal == null)
 342                                                 literal = "";
 343                                         literal += ch;
 344                                 }
 345                                 else {
 346                                         if (literal != null) {
 347                                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
 348                                                 literal = null;
 349                                         }
 350
 351                                         current.AppendExpression (expr);
 352                                         expr = null;
 353                                 }
 354
 355                                 if (is_top_level && ptr >= pattern.Length)
 356                                         goto EndOfGroup;
 357                         }
 358
 359                 EndOfGroup:
 360                         if (is_top_level && closed)
 361                                 throw NewParseException ("Too many )'s.");
 362                         if (!is_top_level && !closed)
 363                                 throw NewParseException ("Not enough )'s.");
 364
 365
 366                         // clean up literals and alternations
 367
 368                         if (literal != null)
 369                                 current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
 370
 371                         if (assertion != null) {
 372                                 if (assertion.TrueExpression == null)
 373                                         assertion.TrueExpression = current;
 374                                 else
 375                                         assertion.FalseExpression = current;
 376
 377                                 group.AppendExpression (assertion);
 378                         }
 379                         else if (alternation != null) {
 380                                 alternation.AddAlternative (current);
 381                                 group.AppendExpression (alternation);
 382                         }
 383                         else
 384                                 group.AppendExpression (current);
 385                 }
 386
 387                 private Expression ParseGroupingConstruct (ref RegexOptions options) {
 388                         if (pattern[ptr] != '?') {
 389                                 Group group;
 390
 391                                 if (IsExplicitCapture (options))
 392                                         group = new Group ();
 393                                 else {
 394                                         group = new CapturingGroup ();
 395                                         caps.Add (group);
 396                                 }
 397
 398                                 ParseGroup (group, options, null);
 399                                 return group;
 400                         }
 401                         else
 402                                 ++ ptr;
 403
 404                         switch (pattern[ptr]) {
 405                         case ':': {                                             // non-capturing group
 406                                 ++ ptr;
 407                                 Group group = new Group ();
 408                                 ParseGroup (group, options, null);
 409
 410                                 return group;
 411                         }
 412
 413                         case '>': {                                             // non-backtracking group
 414                                 ++ ptr;
 415                                 Group group = new NonBacktrackingGroup ();
 416                                 ParseGroup (group, options, null);
 417
 418                                 return group;
 419                         }
 420
 421                         case 'i': case 'm': case 'n':
 422                         case 's': case 'x': case '-': {                         // options
 423                                 RegexOptions o = options;
 424                                 ParseOptions (ref o, false);
 425                                 if (pattern[ptr] == '-') {
 426                                         ++ ptr;
 427                                         ParseOptions (ref o, true);
 428                                 }
 429
 430                                 if (pattern[ptr] == ':') {                      // pass options to child group
 431                                         ++ ptr;
 432                                         Group group = new Group ();
 433                                         ParseGroup (group, o, null);
 434                                         return group;
 435                                 }
 436                                 else if (pattern[ptr] == ')') {                 // change options of enclosing group
 437                                         ++ ptr;
 438                                         options = o;
 439                                         return null;
 440                                 }
 441                                 else
 442                                         throw NewParseException ("Bad options");
 443                         }
 444
 445                         case '<': case '=': case '!': {                         // lookahead/lookbehind
 446                                 ExpressionAssertion asn = new ExpressionAssertion ();
 447                                 if (!ParseAssertionType (asn))
 448                                         goto case '\'';                         // it's a (?<name> ) construct
 449
 450                                 Group test = new Group ();
 451                                 ParseGroup (test, options, null);
 452
 453                                 asn.TestExpression = test;
 454                                 return asn;
 455                         }
 456
 457                         case '\'': {                                            // named/balancing group
 458                                 char delim;
 459                                 if (pattern[ptr] == '<')
 460                                         delim = '>';
 461                                 else
 462                                         delim = '\'';
 463
 464                                 ++ ptr;
 465                                 string name = ParseName ();
 466
 467                                 if (pattern[ptr] == delim) {
 468                                         // capturing group
 469
 470                                         if (name == null)
 471                                                 throw NewParseException ("Bad group name.");
 472
 473                                         ++ ptr;
 474                                         CapturingGroup cap = new CapturingGroup ();
 475                                         cap.Name = name;
 476                                         caps.Add (cap);
 477                                         ParseGroup (cap, options, null);
 478
 479                                         return cap;
 480                                 }
 481                                 else if (pattern[ptr] == '-') {
 482                                         // balancing group
 483
 484                                         ++ ptr;
 485                                         string balance_name = ParseName ();
 486                                         if (balance_name == null || pattern[ptr] != delim)
 487                                                 throw NewParseException ("Bad balancing group name.");
 488
 489                                         ++ ptr;
 490                                         BalancingGroup bal = new BalancingGroup ();
 491                                         bal.Name = name;
 492
 493                                         if(bal.IsNamed) {
 494                                                 caps.Add (bal);
 495                                         }
 496
 497                                         refs.Add (bal, balance_name);
 498
 499                                         ParseGroup (bal, options, null);
 500
 501                                         return bal;
 502                                 }
 503                                 else
 504                                         throw NewParseException ("Bad group name.");
 505                         }
 506
 507                         case '(': {                                             // expression/capture test
 508                                 Assertion asn;
 509
 510                                 ++ ptr;
 511                                 int p = ptr;
 512                                 string name = ParseName ();
 513                                 if (name == null || pattern[ptr] != ')') {      // expression test
 514                                         // FIXME MS implementation doesn't seem to
 515                                         // implement this version of (?(x) ...)
 516
 517                                         ptr = p;
 518                                         ExpressionAssertion expr_asn = new ExpressionAssertion ();
 519
 520                                         if (pattern[ptr] == '?') {
 521                                                 ++ ptr;
 522                                                 if (!ParseAssertionType (expr_asn))
 523                                                         throw NewParseException ("Bad conditional.");
 524                                         }
 525                                         else {
 526                                                 expr_asn.Negate = false;
 527                                                 expr_asn.Reverse = false;
 528                                         }
 529
 530                                         Group test = new Group ();
 531                                         ParseGroup (test, options, null);
 532                                         expr_asn.TestExpression = test;
 533                                         asn = expr_asn;
 534                                 }
 535                                 else {                                          // capture test
 536                                         ++ ptr;
 537                                         asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
 538                                         refs.Add (asn, name);
 539                                 }
 540
 541                                 Group group = new Group ();
 542                                 ParseGroup (group, options, asn);
 543                                 return group;
 544                         }
 545
 546                         case '#': {                                             // comment
 547                                 ++ ptr;
 548                                 while (pattern[ptr ++] != ')') {
 549                                         if (ptr >= pattern.Length)
 550                                                 throw NewParseException ("Unterminated (?#...) comment.");
 551                                 }
 552                                 return null;
 553                         }
 554
 555                         default:                                                // error
 556                                 throw NewParseException ("Bad grouping construct.");
 557                         }
 558                 }
 559
 560                 private bool ParseAssertionType (ExpressionAssertion assertion) {
 561                         if (pattern[ptr] == '<') {
 562                                 switch (pattern[ptr + 1]) {
 563                                 case '=':
 564                                         assertion.Negate = false;
 565                                         break;
 566                                 case '!':
 567                                         assertion.Negate = true;
 568                                         break;
 569                                 default:
 570                                         return false;
 571                                 }
 572
 573                                 assertion.Reverse = true;
 574                                 ptr += 2;
 575                         }
 576                         else {
 577                                 switch (pattern[ptr]) {
 578                                 case '=':
 579                                         assertion.Negate = false;
 580                                         break;
 581                                 case '!':
 582                                         assertion.Negate = true;
 583                                         break;
 584                                 default:
 585                                         return false;
 586                                 }
 587
 588                                 assertion.Reverse = false;
 589                                 ptr += 1;
 590                         }
 591
 592                         return true;
 593                 }
 594
 595                 private void ParseOptions (ref RegexOptions options, bool negate) {
 596                         for (;;) {
 597                                 switch (pattern[ptr]) {
 598                                 case 'i':
 599                                         if (negate)
 600                                                 options &= ~RegexOptions.IgnoreCase;
 601                                         else
 602                                                 options |= RegexOptions.IgnoreCase;
 603                                         break;
 604
 605                                 case 'm':
 606                                         if (negate)
 607                                                 options &= ~RegexOptions.Multiline;
 608                                         else
 609                                                 options |= RegexOptions.Multiline;
 610                                         break;
 611
 612                                 case 'n':
 613                                         if (negate)
 614                                                 options &= ~RegexOptions.ExplicitCapture;
 615                                         else
 616                                                 options |= RegexOptions.ExplicitCapture;
 617                                         break;
 618
 619                                 case 's':
 620                                         if (negate)
 621                                                 options &= ~RegexOptions.Singleline;
 622                                         else
 623                                                 options |= RegexOptions.Singleline;
 624                                         break;
 625
 626                                 case 'x':
 627                                         if (negate)
 628                                                 options &= ~RegexOptions.IgnorePatternWhitespace;
 629                                         else
 630                                                 options |= RegexOptions.IgnorePatternWhitespace;
 631                                         break;
 632
 633                                 default:
 634                                         return;
 635                                 }
 636
 637                                 ++ ptr;
 638                         }
 639                 }
 640
 641                 private Expression ParseCharacterClass (RegexOptions options) {
 642                         bool negate = false;
 643                         if (pattern[ptr] == '^') {
 644                                 negate = true;
 645                                 ++ ptr;
 646                         }
 647
 648                         bool ecma = IsECMAScript (options);
 649                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
 650
 651                         if (pattern[ptr] == ']') {
 652                                 cls.AddCharacter (']');
 653                                 ++ ptr;
 654                         }
 655
 656                         int c = -1;
 657                         int last = -1;
 658                         bool range = false;
 659                         bool closed = false;
 660                         while (ptr < pattern.Length) {
 661                                 c = pattern[ptr ++];
 662
 663                                 if (c == ']') {
 664                                         closed = true;
 665                                         break;
 666                                 }
 667
 668                                 if (c == '-' && last >= 0 && !range) {
 669                                         range = true;
 670                                         continue;
 671                                 }
 672
 673                                 if (c == '\\') {
 674                                         c = ParseEscape (true);
 675                                         if (c >= 0)
 676                                                 goto char_recognized;
 677
 678                                         // didn't recognize escape
 679                                         c = pattern [ptr ++];
 680                                         switch (c) {
 681                                         case 'b':
 682                                                 c = '\b';
 683                                                 goto char_recognized;
 684
 685                                         case 'd': case 'D':
 686                                                 cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
 687                                                 break;
 688
 689                                         case 'w': case 'W':
 690                                                 cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
 691                                                 break;
 692
 693                                         case 's': case 'S':
 694                                                 cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
 695                                                 break;
 696
 697                                         case 'p': case 'P':
 698                                                 cls.AddCategory (ParseUnicodeCategory (), c == 'P');    // ignore ecma
 699                                                 break;
 700
 701                                         default:                // add escaped character
 702                                                 goto char_recognized;
 703                                         }
 704
 705                                         // if the pattern looks like [a-\s] ...
 706                                         if (range)
 707                                                 throw NewParseException ("character range cannot have category \\" + c);
 708
 709                                         last = -1;
 710                                         continue;
 711                                 }
 712
 713                         char_recognized:
 714                                 if (range) {
 715                                         // if 'range' is true, we know that 'last >= 0'
 716                                         if (c < last)
 717                                                 throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
 718                                         cls.AddRange ((char)last, (char)c);
 719                                         last = -1;
 720                                         range = false;
 721                                         continue;
 722                                 }
 723
 724                                 cls.AddCharacter ((char)c);
 725                                 last = c;
 726                         }
 727
 728                         if (!closed)
 729                                 throw NewParseException ("Unterminated [] set.");
 730
 731                         if (range)
 732                                 cls.AddCharacter ('-');
 733
 734                         return cls;
 735                 }
 736
 737                 private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
 738                         int n, m;
 739                         min = max = 0;
 740
 741                         /* check syntax */
 742
 743                         ConsumeWhitespace (IsIgnorePatternWhitespace (options));
 744
 745                         if (pattern[ptr] == ',') {
 746                                 n = -1;
 747                         } else {
 748                                 n = ParseNumber (10, 1, 0);
 749                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
 750                         }
 751
 752                         switch (pattern[ptr ++]) {
 753                         case '}':
 754                                 m = n;
 755                                 break;
 756                         case ',':
 757                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
 758                                 m = ParseNumber (10, 1, 0);
 759                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
 760                                 if (pattern[ptr ++] != '}')
 761                                         return false;
 762                                 break;
 763                         default:
 764                                 return false;
 765                         }
 766
 767                         /* check bounds and ordering */
 768
 769                         if (n > 0x7fffffff || m > 0x7fffffff)
 770                                 throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
 771                         if (m >= 0 && m < n)
 772                                 throw NewParseException ("Illegal {x, y} with x > y.");
 773
 774                         /* assign min and max */
 775
 776                         min = n;
 777                         if (m > 0)
 778                                 max = m;
 779                         else
 780                                 max = 0x7fffffff;
 781
 782                         return true;
 783                 }
 784
 785                 private Category ParseUnicodeCategory () {
 786                         if (pattern[ptr ++] != '{')
 787                                 throw NewParseException ("Incomplete \\p{X} character escape.");
 788
 789                         string name = ParseName (pattern, ref ptr);
 790                         if (name == null)
 791                                 throw NewParseException ("Incomplete \\p{X} character escape.");
 792
 793                         Category cat = CategoryUtils.CategoryFromName (name);
 794                         if (cat == Category.None)
 795                                 throw NewParseException ("Unknown property '" + name + "'.");
 796
 797                         if (pattern[ptr ++] != '}')
 798                                 throw NewParseException ("Incomplete \\p{X} character escape.");
 799
 800                         return cat;
 801                 }
 802
 803                 private Expression ParseSpecial (RegexOptions options) {
 804                         int p = ptr;
 805                         bool ecma = IsECMAScript (options);
 806                         Expression expr = null;
 807
 808                         switch (pattern[ptr ++]) {
 809
 810                         // categories
 811
 812                         case 'd':
 813                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
 814                                 break;
 815
 816                         case 'w':
 817                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
 818                                 break;
 819
 820                         case 's':
 821                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
 822                                 break;
 823
 824                         case 'p':
 825                                 // this is odd - ECMAScript isn't supposed to support Unicode,
 826                                 // yet \p{..} compiles and runs under the MS implementation
 827                                 // identically to canonical mode. That's why I'm ignoring the
 828                                 // value of ecma here.
 829
 830                                 expr = new CharacterClass (ParseUnicodeCategory (), false);
 831                                 break;
 832
 833                         case 'D':
 834                                 expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
 835                                 break;
 836
 837                         case 'W':
 838                                 expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
 839                                 break;
 840
 841                         case 'S':
 842                                 expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
 843                                 break;
 844
 845                         case 'P':
 846                                 expr = new CharacterClass (ParseUnicodeCategory (), true);
 847                                 break;
 848
 849                         // positions
 850
 851                         case 'A': expr = new PositionAssertion (Position.StartOfString); break;
 852                         case 'Z': expr = new PositionAssertion (Position.End); break;
 853                         case 'z': expr = new PositionAssertion (Position.EndOfString); break;
 854                         case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
 855                         case 'b': expr = new PositionAssertion (Position.Boundary); break;
 856                         case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
 857
 858                         // references
 859
 860                         case '1': case '2': case '3': case '4': case '5':
 861                         case '6': case '7': case '8': case '9': {
 862                                 ptr --;
 863                                 int oldptr = ptr;
 864                                 int n = ParseNumber (10, 1, 0);
 865                                 if (n < 0) {
 866                                         ptr = p;
 867                                         return null;
 868                                 }
 869
 870                                 // FIXME test if number is within number of assigned groups
 871                                 // this may present a problem for right-to-left matching
 872
 873                                 Reference reference = new BackslashNumber (IsIgnoreCase (options), ecma);
 874                                 refs.Add (reference, n.ToString ());
 875                                 expr = reference;
 876                                 break;
 877                         }
 878
 879                         case 'k': {
 880                                 char delim = pattern[ptr ++];
 881                                 if (delim == '<')
 882                                         delim = '>';
 883                                 else if (delim != '\'')
 884                                         throw NewParseException ("Malformed \\k<...> named backreference.");
 885
 886                                 string name = ParseName ();
 887                                 if (name == null || pattern[ptr] != delim)
 888                                         throw NewParseException ("Malformed \\k<...> named backreference.");
 889
 890                                 ++ ptr;
 891                                 Reference reference = new Reference (IsIgnoreCase (options));
 892                                 refs.Add (reference, name);
 893                                 expr = reference;
 894                                 break;
 895                         }
 896
 897                         default:
 898                                 expr = null;
 899                                 break;
 900                         }
 901
 902                         if (expr == null)
 903                                 ptr = p;
 904
 905                         return expr;
 906                 }
 907
 908                 private int ParseEscape (bool inCharacterClass) {
 909                         int p = ptr;
 910                         int c;
 911
 912                         if (p >= pattern.Length)
 913                                 throw new ArgumentException (
 914                                                 String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
 915                                                                 "pattern.", pattern), pattern);
 916
 917                         switch (pattern[ptr ++]) {
 918
 919                         // standard escapes (except \b)
 920
 921                         case 'a': return '\u0007';
 922                         case 't': return '\u0009';
 923                         case 'r': return '\u000d';
 924                         case 'v': return '\u000b';
 925                         case 'f': return '\u000c';
 926                         case 'n': return '\u000a';
 927                         case 'e': return '\u001b';
 928                         case '\\': return '\\';
 929
 930                         // character codes
 931
 932                         case '0':
 933
 934                                 //
 935                                 // Turns out that octal values can be specified
 936                                 // without a leading zero.   But also the limit
 937                                 // of three character should include this first
 938                                 // one.
 939                                 //
 940                                 ptr--;
 941                                 int prevptr = ptr;
 942                                 int result = ParseOctal (pattern, ref ptr);
 943                                 if (result == -1 && prevptr == ptr)
 944                                         return 0;
 945
 946                                 return result;
 947
 948                         case '1': case '2': case '3': case '4': case '5':
 949                         case '6': case '7':
 950                                 if (inCharacterClass){
 951                                         ptr--;
 952                                         return ParseOctal (pattern, ref ptr);
 953                                 }
 954                                 break;
 955
 956                         case 'x':
 957                                 c = ParseHex (pattern, ref ptr, 2);
 958                                 if (c < 0)
 959                                         throw NewParseException ("Insufficient hex digits");
 960
 961                                 return c;
 962
 963                         case 'u':
 964                                 c = ParseHex (pattern, ref ptr, 4);
 965                                 if (c < 0)
 966                                         throw NewParseException ("Insufficient hex digits");
 967
 968                                 return c;
 969
 970                         // control characters
 971
 972                         case 'c':
 973                                 c = pattern[ptr ++];
 974                                 if (c >= '@' && c <= '_')
 975                                         return c - '@';
 976                                 else
 977                                         throw NewParseException ("Unrecognized control character.");
 978                         }
 979
 980                         // unknown escape
 981                         ptr = p;
 982                         return -1;
 983                 }
 984
 985                 private string ParseName () {
 986                         return Parser.ParseName (pattern, ref ptr);
 987                 }
 988
 989                 private static bool IsNameChar (char c) {
 990                         UnicodeCategory cat = Char.GetUnicodeCategory (c);
 991                         if (cat == UnicodeCategory.ModifierLetter)
 992                                 return false;
 993                         if (cat == UnicodeCategory.ConnectorPunctuation)
 994                                 return true;
 995                         return Char.IsLetterOrDigit (c);
 996                 }
 997
 998                 private int ParseNumber (int b, int min, int max) {
 999                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);
1000                 }
1001
1002                 private static int ParseDigit (char c, int b, int n) {
1003                         switch (b) {
1004                         case 8:
1005                                 if (c >= '0' && c <= '7')
1006                                         return c - '0';
1007                                 else
1008                                         return -1;
1009                         case 10:
1010                                 if (c >= '0' && c <= '9')
1011                                         return c - '0';
1012                                 else
1013                                         return -1;
1014                         case 16:
1015                                 if (c >= '0' && c <= '9')
1016                                         return c - '0';
1017                                 else if (c >= 'a' && c <= 'f')
1018                                         return 10 + c - 'a';
1019                                 else if (c >= 'A' && c <= 'F')
1020                                         return 10 + c - 'A';
1021                                 else
1022                                         return -1;
1023                         default:
1024                                 return -1;
1025                         }
1026                 }
1027
1028                 private void ConsumeWhitespace (bool ignore) {
1029                         while (ptr < pattern.Length) {
1030                                 if (pattern[ptr] == '(') {
1031                                         if (ptr + 3 >= pattern.Length)
1032                                                 return;
1033
1034                                         if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
1035                                                 return;
1036
1037                                         ptr += 3;
1038                                         while (ptr < pattern.Length && pattern[ptr ++] != ')')
1039                                                 /* ignore */ ;
1040                                 }
1041                                 else if (ignore && pattern[ptr] == '#') {
1042                                         while (ptr < pattern.Length && pattern[ptr ++] != '\n')
1043                                                 /* ignore */ ;
1044                                 }
1045                                 else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
1046                                         while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
1047                                                 ++ ptr;
1048                                 }
1049                                 else
1050                                         return;
1051                         }
1052                 }
1053
1054                 private string ParseString (string pattern) {
1055                         this.pattern = pattern;
1056                         this.ptr = 0;
1057
1058                         StringBuilder result = new StringBuilder (pattern.Length);
1059                         while (ptr < pattern.Length) {
1060                                 int c = pattern[ptr ++];
1061                                 if (c == '\\') {
1062                                         c = ParseEscape (false);
1063
1064                                         if(c < 0) {
1065                                                 c = pattern[ptr ++];
1066                                                 if(c == 'b')
1067                                                         c = '\b';
1068                                         }
1069                                 }
1070                                 result.Append ((char) c);
1071                         }
1072
1073                         return result.ToString ();
1074                 }
1075
1076                 private void ResolveReferences ()
1077                 {
1078                         int gid = 1;
1079                         Hashtable dict = new Hashtable ();
1080                         ArrayList explicit_numeric_groups = null;
1081
1082                         // number unnamed groups
1083
1084                         foreach (CapturingGroup group in caps) {
1085                                 if (group.Name != null)
1086                                         continue;
1087
1088                                 dict.Add (gid.ToString (), group);
1089                                 group.Index = gid ++;
1090                                 ++ num_groups;
1091                         }
1092
1093                         // number named groups
1094
1095                         foreach (CapturingGroup group in caps) {
1096                                 if (group.Name == null)
1097                                         continue;
1098
1099                                 if (dict.Contains (group.Name)) {
1100                                         CapturingGroup prev = (CapturingGroup) dict [group.Name];
1101                                         group.Index = prev.Index;
1102
1103                                         if (group.Index == gid)
1104                                                 gid ++;
1105                                         else if (group.Index > gid)
1106                                                 explicit_numeric_groups.Add (group);
1107                                         continue;
1108                                 }
1109
1110                                 if (Char.IsDigit (group.Name [0])) {
1111                                         int ptr = 0;
1112                                         int group_gid = ParseDecimal (group.Name, ref ptr);
1113                                         if (ptr == group.Name.Length) {
1114                                                 group.Index = group_gid;
1115                                                 dict.Add (group.Name, group);
1116                                                 ++ num_groups;
1117
1118                                                 if (group_gid == gid) {
1119                                                         gid ++;
1120                                                 } else {
1121                                                         // all numbers before 'gid' are already in the dictionary.  So, we know group_gid > gid
1122                                                         if (explicit_numeric_groups == null)
1123                                                                 explicit_numeric_groups = new ArrayList (4);
1124                                                         explicit_numeric_groups.Add (group);
1125                                                 }
1126
1127                                                 continue;
1128                                         }
1129                                 }
1130
1131                                 string gid_s = gid.ToString ();
1132                                 while (dict.Contains (gid_s))
1133                                         gid_s = (++gid).ToString ();
1134
1135                                 dict.Add (gid_s, group);
1136                                 dict.Add (group.Name, group);
1137                                 group.Index = gid ++;
1138                                 ++ num_groups;
1139                         }
1140
1141                         gap = gid; // == 1 + num_groups, if explicit_numeric_groups == null
1142
1143                         if (explicit_numeric_groups != null)
1144                                 HandleExplicitNumericGroups (explicit_numeric_groups);
1145
1146                         // resolve references
1147
1148                         foreach (Expression expr in refs.Keys) {
1149                                 string name = (string) refs [expr];
1150                                 if (!dict.Contains (name)) {
1151                                         if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
1152                                                 continue;
1153                                         BackslashNumber bn = expr as BackslashNumber;
1154                                         if (bn != null && bn.ResolveReference (name, dict))
1155                                                 continue;
1156                                         throw NewParseException ("Reference to undefined group " +
1157                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +
1158                                                 name);
1159                                 }
1160
1161                                 CapturingGroup group = (CapturingGroup)dict[name];
1162                                 if (expr is Reference)
1163                                         ((Reference)expr).CapturingGroup = group;
1164                                 else if (expr is CaptureAssertion)
1165                                         ((CaptureAssertion)expr).CapturingGroup = group;
1166                                 else if (expr is BalancingGroup)
1167                                         ((BalancingGroup)expr).Balance = group;
1168                         }
1169                 }
1170
1171                 private void HandleExplicitNumericGroups (ArrayList explicit_numeric_groups)
1172                 {
1173                         int gid = gap;
1174                         int i = 0;
1175                         int n_explicit = explicit_numeric_groups.Count;
1176
1177                         explicit_numeric_groups.Sort ();
1178
1179                         // move 'gap' forward to skip over all explicit groups that
1180                         // turn out to match their index
1181                         for (; i < n_explicit; ++i) {
1182                                 CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
1183                                 if (g.Index > gid)
1184                                         break;
1185                                 if (g.Index == gid)
1186                                         gid ++;
1187                         }
1188
1189                         gap = gid;
1190
1191                         // re-index all further groups so that the indexes are contiguous
1192                         int prev = gid;
1193                         for (; i < n_explicit; ++i) {
1194                                 CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
1195                                 if (g.Index == prev) {
1196                                         g.Index = gid - 1;
1197                                 } else {
1198                                         prev = g.Index;
1199                                         g.Index = gid ++;
1200                                 }
1201                         }
1202                 }
1203
1204                 // flag helper functions
1205
1206                 private static bool IsIgnoreCase (RegexOptions options) {
1207                         return (options & RegexOptions.IgnoreCase) != 0;
1208                 }
1209
1210                 private static bool IsMultiline (RegexOptions options) {
1211                         return (options & RegexOptions.Multiline) != 0;
1212                 }
1213
1214                 private static bool IsExplicitCapture (RegexOptions options) {
1215                         return (options & RegexOptions.ExplicitCapture) != 0;
1216                 }
1217
1218                 private static bool IsSingleline (RegexOptions options) {
1219                         return (options & RegexOptions.Singleline) != 0;
1220                 }
1221
1222                 private static bool IsIgnorePatternWhitespace (RegexOptions options) {
1223                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;
1224                 }
1225
1226                 private static bool IsECMAScript (RegexOptions options) {
1227                         return (options & RegexOptions.ECMAScript) != 0;
1228                 }
1229
1230                 // exception creation
1231
1232                 private ArgumentException NewParseException (string msg) {
1233                         msg = "parsing \"" + pattern + "\" - " + msg;
1234                         return new ArgumentException (msg, pattern);
1235                 }
1236
1237                 private string pattern;
1238                 private int ptr;
1239
1240                 private ArrayList caps;
1241                 private Hashtable refs;
1242                 private int num_groups;
1243                 private int gap;
1244         }
1245 }