mcs/class/referencesource/System/regex/system/text/regularexpressions/RegexParser.cs

   1 //------------------------------------------------------------------------------
   2 // <copyright file="RegexParser.cs" company="Microsoft">
   3 //     Copyright (c) Microsoft Corporation.  All rights reserved.
   4 // </copyright>
   5 //------------------------------------------------------------------------------
   6
   7 // This RegexParser class is internal to the Regex package.
   8 // It builds a tree of RegexNodes from a regular expression
   9
  10 // Implementation notes:
  11 //
  12 // It would be nice to get rid of the comment modes, since the
  13 // ScanBlank() calls are just kind of duct-taped in.
  14
  15
  16 namespace System.Text.RegularExpressions {
  17
  18     using System.Collections;
  19     using System.Collections.Generic;
  20     using System.Globalization;
  21
  22     internal sealed class RegexParser {
  23         internal RegexNode _stack;          // cleared (set to null) by Reset(); presumably the saved enclosing-group state used by PushGroup()/PopGroup() - confirm
  24         internal RegexNode _group;
  25         internal RegexNode _alternation;
             // ScanReplacement() creates a fresh Concatenate node here and returns it as its result.
  26         internal RegexNode _concatenation;
  27         internal RegexNode _unit;
  28
             // The text being parsed (never null once SetPattern() has run) and the scan position within it.
  29         internal String _pattern;
  30         internal int _currentPos;
             // Supplied to the constructor; passed to RegexCharClass.AddLowercase() for case-insensitive classes.
  31         internal CultureInfo _culture;
  32
             // Number handed to the next unnamed capturing group (ScanGroupOpen() uses _autocap++); Reset() restarts it at 1.
  33         internal int _autocap;
  34         internal int _capcount;
             // Passed through to the RegexTree built by Parse().
  35         internal int _captop;
  36         internal int _capsize;
             // Capture tables handed to the RegexTree built by Parse(). _caps is allocated by the constructor;
             // the other tables are presumably filled in by CountCaptures()/NoteCaptures() - confirm.
  37 #if SILVERLIGHT
  38         internal Dictionary<Int32, Int32> _caps;
  39         internal Dictionary<String, Int32> _capnames;
  40 #else
  41         internal Hashtable _caps;
  42         internal Hashtable _capnames;
  43 #endif
  44         internal Int32[] _capnumlist;
             // May be null; Parse() passes null to RegexTree in that case instead of an array.
  45         internal List<String> _capnamelist;
  46
             // Options currently in effect, and the list PushOptions()/PopOptions() use to save and restore them.
  47         internal RegexOptions _options;
  48         internal List<RegexOptions> _optionsStack;
  49
             // When set, the next plain "(" becomes a non-capturing Group node and the flag is cleared (see ScanGroupOpen()).
  50         internal bool _ignoreNextParen = false;
  51
             // Int32.MaxValue split into "all but the last decimal digit" and "last decimal digit";
             // presumably used to detect overflow while accumulating decimal numbers - confirm against ScanDecimal().
  52         internal const int MaxValueDiv10 = Int32.MaxValue / 10;
  53         internal const int MaxValueMod10 = Int32.MaxValue % 10;
  54
  55         /*
  56          * This static call constructs a RegexTree from a regular expression
  57          * pattern string and an option string.
  58          *
  59          * The method creates, drives, and drops a parser instance.
  60          */
  61         internal static RegexTree Parse(String re, RegexOptions op) {
  62             RegexParser p;
  63             RegexNode root;
  64             String[] capnamelist;
  65
  66             p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture);
  67
  68             p._options = op;
  69
  70             p.SetPattern(re);
  71             p.CountCaptures();
  72             p.Reset(op);
  73             root = p.ScanRegex();
  74
  75             if (p._capnamelist == null)
  76                 capnamelist = null;
  77             else
  78                 capnamelist = p._capnamelist.ToArray();
  79
  80             return new RegexTree(root, p._caps, p._capnumlist, p._captop, p._capnames, capnamelist, op);
  81         }
  82
  83         /*
  84          * This static call constructs a flat concatenation node given
  85          * a replacement pattern.
  86          */
  87 #if SILVERLIGHT
  88         internal static RegexReplacement ParseReplacement(String rep, Dictionary<Int32, Int32> caps, int capsize, Dictionary<String, Int32> capnames, RegexOptions op) {
  89 #else
  90         internal static RegexReplacement ParseReplacement(String rep, Hashtable caps, int capsize, Hashtable capnames, RegexOptions op) {
  91 #endif
  92             RegexParser p;
  93             RegexNode root;
  94
  95             p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture);
  96
  97             p._options = op;
  98
  99             p.NoteCaptures(caps, capsize, capnames);
 100             p.SetPattern(rep);
 101             root = p.ScanReplacement();
 102
 103             return new RegexReplacement(rep, root, caps);
 104         }
 105
 106         /*
 107          * Escapes all metacharacters (including (,),[,{,|,^,$,*,+,?,\, spaces and #)
 108          */
 109         internal static String Escape(String input) {
 110             for (int i = 0; i < input.Length; i++) {
 111                 if (IsMetachar(input[i])) {
 112                     StringBuilder sb = new StringBuilder();
 113                     char ch = input[i];
 114                     int lastpos;
 115
 116                     sb.Append(input, 0, i);
 117                     do {
 118                         sb.Append('\\');
 119                         switch (ch) {
 120                             case '\n':
 121                                 ch = 'n';
 122                                 break;
 123                             case '\r':
 124                                 ch = 'r';
 125                                 break;
 126                             case '\t':
 127                                 ch = 't';
 128                                 break;
 129                             case '\f':
 130                                 ch = 'f';
 131                                 break;
 132                         }
 133                         sb.Append(ch);
 134                         i++;
 135                         lastpos = i;
 136
 137                         while (i < input.Length) {
 138                             ch = input[i];
 139                             if (IsMetachar(ch))
 140                                 break;
 141
 142                             i++;
 143                         }
 144
 145                         sb.Append(input, lastpos, i - lastpos);
 146
 147                     } while (i < input.Length);
 148
 149                     return sb.ToString();
 150                 }
 151             }
 152
 153             return input;
 154         }
 155
 156         /*
 157          * Unescapes the input: every backslash escape sequence is replaced by the character ScanCharEscape() decodes it to
 158          */
 159         internal static String Unescape(String input) {
 160             for (int i = 0; i < input.Length; i++) {
 161                 if (input[i] == '\\') {
 162                     StringBuilder sb = new StringBuilder();
 163                     RegexParser p = new RegexParser(CultureInfo.InvariantCulture);
 164                     int lastpos;
 165                     p.SetPattern(input);
 166
 167                     sb.Append(input, 0, i);
 168                     do {
 169                         i++;
 170                         p.Textto(i);
 171                         if (i < input.Length)
 172                             sb.Append(p.ScanCharEscape());
 173                         i = p.Textpos();
 174                         lastpos = i;
 175                         while (i < input.Length && input[i] != '\\')
 176                             i++;
 177                         sb.Append(input, lastpos, i - lastpos);
 178
 179                     } while (i < input.Length);
 180
 181                     return sb.ToString();
 182                 }
 183             }
 184
 185             return input;
 186         }
 187
 188         /*
 189          * Private constructor.
 190          */
 191         private RegexParser(CultureInfo culture) {
 192             _culture = culture;
 193             _optionsStack = new List<RegexOptions>();
 194 #if SILVERLIGHT
 195             _caps = new Dictionary<Int32,Int32>();
 196 #else
 197             _caps = new Hashtable();
 198 #endif
 199
 200         }
 201
 202         /*
 203          * Drops a string into the pattern buffer.
 204          */
 205         internal void SetPattern(String Re) {
 206             if (Re == null)
 207                 Re = String.Empty;
 208             _pattern = Re;
 209             _currentPos = 0;
 210         }
 211
 212         /*
 213          * Resets parsing to the beginning of the pattern.
 214          */
 215         internal void Reset(RegexOptions topopts) {
 216             _currentPos = 0;
 217             _autocap = 1;
 218             _ignoreNextParen = false;
 219
 220             if (_optionsStack.Count > 0)
 221                 _optionsStack.RemoveRange(0, _optionsStack.Count - 1);
 222
 223             _options = topopts;
 224             _stack = null;
 225         }
 226
 227         /*
 228          * The main parsing function.
 229          */
 230         internal RegexNode ScanRegex() {
                 // Parses the whole pattern starting at the current position and returns the unit
                 // that remains once the outermost group has been closed (AddGroup()/Unit() at the end).
                 // Throws (via MakeException) for unbalanced parentheses, a quantifier with nothing
                 // to quantify, a quantifier following a quantifier, and {min,max} with min > max.
                 //
                 // 'ch' holds the character that ended the current literal run; '@', '!' and ' ' are
                 // used as non-special sentinels (start, end of pattern, ordinary character).
 231             char ch = '@'; // nonspecial ch, means at beginning
 232             bool isQuantifier = false;
 233
                 // Open the implicit outermost group: a Capture node with capture number 0.
 234             StartGroup(new RegexNode(RegexNode.Capture, _options, 0, -1));
 235
 236             while (CharsRight() > 0) {
                     // Remember whether the previous token was a quantifier, so that a quantifier
                     // with no unit is reported as NestedQuantify instead of QuantifyAfterNothing.
 237                 bool wasPrevQuantifier = isQuantifier;
 238                 isQuantifier = false;
 239
 240                 ScanBlank();
 241
 242                 int startpos = Textpos();
 243
 244                 // move past all of the normal characters.  We'll stop when we hit some kind of control character,
 245                 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
                     // A '{' that does not begin a real quantifier (IsTrueQuantifier() is false) counts as a normal character.
 246                 if (UseOptionX())
 247                     while (CharsRight() > 0 && (!IsStopperX(ch = RightChar()) || ch == '{' && !IsTrueQuantifier()))
 248                         MoveRight();
 249                 else
 250                     while (CharsRight() > 0 && (!IsSpecial(ch = RightChar()) || ch == '{' && !IsTrueQuantifier()))
 251                         MoveRight();
 252
 253                 int endpos = Textpos();
 254
 255                 ScanBlank();
 256
                     // Classify what follows the literal run; a special character is consumed here.
 257                 if (CharsRight() == 0)
 258                     ch = '!'; // nonspecial, means at end
 259                 else if (IsSpecial(ch = RightChar())) {
 260                     isQuantifier = IsQuantifier(ch);
 261                     MoveRight();
 262                 } else
 263                     ch = ' '; // nonspecial, means at ordinary char
 264
                     // A run of ordinary characters was scanned. When a quantifier follows, it applies
                     // only to the run's last character, so that character is split off as its own unit
                     // and the rest of the run is added as plain text.
 265                 if (startpos < endpos) {
 266                     int cchUnquantified = endpos - startpos - (isQuantifier ? 1 : 0);
 267
 268                     wasPrevQuantifier = false;
 269
 270                     if (cchUnquantified > 0)
 271                         AddConcatenate(startpos, cchUnquantified, false);
 272
 273                     if (isQuantifier)
 274                         AddUnitOne(CharAt(endpos - 1));
 275                 }
 276
                     // Dispatch on the special character. Cases that end in 'break' leave a unit that
                     // may still take a quantifier (handled after the switch); 'continue' and
                     // 'goto ContinueOuterScan' go straight to the next token.
 277                 switch (ch) {
 278                     case '!':
 279                         goto BreakOuterScan;
 280
 281                     case ' ':
 282                         goto ContinueOuterScan;
 283
 284                     case '[':
 285                         AddUnitSet(ScanCharClass(UseOptionI()).ToStringClass());
 286                         break;
 287
 288                     case '(': {
 289                             RegexNode grouper;
 290
 291                             PushOptions();
 292
                                 // ScanGroupOpen() returns null when the construct only changed options or
                                 // was a comment; then no group is opened and only the options entry pushed
                                 // above is unwound via PopKeepOptions().
 293                             if (null == (grouper = ScanGroupOpen())) {
 294                                 PopKeepOptions();
 295                             }
 296                             else {
 297                                 PushGroup();
 298                                 StartGroup(grouper);
 299                             }
 300                         }
 301                         continue;
 302
 303                     case '|':
 304                         AddAlternate();
 305                         goto ContinueOuterScan;
 306
 307                     case ')':
 308                         if (EmptyStack())
 309                             throw MakeException(SR.GetString(SR.TooManyParens));
 310
                             // Close the current group and restore the enclosing group and its options.
 311                         AddGroup();
 312                         PopGroup();
 313                         PopOptions();
 314
                             // With no resulting unit there is nothing a following quantifier could apply to.
 315                         if (Unit() == null)
 316                             goto ContinueOuterScan;
 317                         break;
 318
 319                     case '\\':
 320                         AddUnitNode(ScanBackslash());
 321                         break;
 322
                         // '^' and '$' become line anchors only when UseOptionM() is true.
 323                     case '^':
 324                         AddUnitType(UseOptionM() ? RegexNode.Bol : RegexNode.Beginning);
 325                         break;
 326
 327                     case '$':
 328                         AddUnitType(UseOptionM() ? RegexNode.Eol : RegexNode.EndZ);
 329                         break;
 330
                         // '.' matches any character when UseOptionS() is true, otherwise anything but '\n'.
 331                     case '.':
 332                         if (UseOptionS())
 333                             AddUnitSet(RegexCharClass.AnyClass);
 334                         else
 335                             AddUnitNotone('\n');
 336                         break;
 337
 338                     case '{':
 339                     case '*':
 340                     case '+':
 341                     case '?':
                             // A quantifier needs a unit to apply to. When there is one, the quantifier
                             // character consumed above is given back (MoveLeft) so the code after the
                             // switch can read it again.
 342                         if (Unit() == null)
 343                             throw MakeException(wasPrevQuantifier ?
 344                                                 SR.GetString(SR.NestedQuantify, ch.ToString()) :
 345                                                 SR.GetString(SR.QuantifyAfterNothing));
 346                         MoveLeft();
 347                         break;
 348
 349                     default:
 350                         throw MakeException(SR.GetString(SR.InternalError));
 351                 }
 352
 353                 ScanBlank();
 354
                     // No quantifier follows: the unit is complete, so append it as-is.
 355                 if (CharsRight() == 0 || !(isQuantifier = IsTrueQuantifier())) {
 356                     AddConcatenate();
 357                     goto ContinueOuterScan;
 358                 }
 359
 360                 ch = MoveRightGetChar();
 361
 362                 // Handle quantifiers
                     // NOTE(review): the loop runs while Unit() is non-null; presumably
                     // AddConcatenate(lazy, min, max) clears the unit so this executes once - confirm.
 363                 while (Unit() != null) {
 364                     int min;
 365                     int max;
 366                     bool lazy;
 367
 368                     switch (ch) {
 369                         case '*':
 370                             min = 0;
 371                             max = Int32.MaxValue;
 372                             break;
 373
 374                         case '?':
 375                             min = 0;
 376                             max = 1;
 377                             break;
 378
 379                         case '+':
 380                             min = 1;
 381                             max = Int32.MaxValue;
 382                             break;
 383
                             // {n}, {n,} or {n,m}. startpos is the position just after the '{'.
                             // {n} gives min == max == n; {n,} leaves max at Int32.MaxValue.
 384                         case '{': {
 385                                 startpos = Textpos();
 386                                 max = min = ScanDecimal();
 387                                 if (startpos < Textpos()) {
 388                                     if (CharsRight() > 0 && RightChar() == ',') {
 389                                         MoveRight();
 390                                         if (CharsRight() == 0 || RightChar() == '}')
 391                                             max = Int32.MaxValue;
 392                                         else
 393                                             max = ScanDecimal();
 394                                     }
 395                                 }
 396
                                     // Not a well-formed quantifier after all (no digits, or no closing '}'):
                                     // keep the unit unquantified and rewind to the '{' itself (startpos - 1)
                                     // so the outer loop scans it again.
 397                                 if (startpos == Textpos() || CharsRight() == 0 || MoveRightGetChar() != '}') {
 398                                     AddConcatenate();
 399                                     Textto(startpos - 1);
 400                                     goto ContinueOuterScan;
 401                                 }
 402                             }
 403
 404                             break;
 405
 406                         default:
 407                             throw MakeException(SR.GetString(SR.InternalError));
 408                     }
 409
 410                     ScanBlank();
 411
                         // A '?' directly after the quantifier marks it as lazy and is consumed.
 412                     if (CharsRight() == 0 || RightChar() != '?')
 413                         lazy = false;
 414                     else {
 415                         MoveRight();
 416                         lazy = true;
 417                     }
 418
 419                     if (min > max)
 420                         throw MakeException(SR.GetString(SR.IllegalRange));
 421
 422                     AddConcatenate(lazy, min, max);
 423                 }
 424
 425                 ContinueOuterScan:
 426                 ;
 427             }
 428
 429             BreakOuterScan:
 430             ;
 431
                 // Anything still on the group stack means a '(' was never closed.
 432             if (!EmptyStack())
 433                 throw MakeException(SR.GetString(SR.NotEnoughParens));
 434
                 // Close the implicit outermost group opened at the top and return the result.
 435             AddGroup();
 436
 437             return Unit();
 438         }
 439
 440         /*
 441          * Simple parsing for replacement patterns
 442          */
 443         internal RegexNode ScanReplacement() {
 444             int c;
 445             int startpos;
 446
 447             _concatenation = new RegexNode(RegexNode.Concatenate, _options);
 448
 449             for (;;) {
 450                 c = CharsRight();
 451                 if (c == 0)
 452                     break;
 453
 454                 startpos = Textpos();
 455
 456                 while (c > 0 && RightChar() != '$') {
 457                     MoveRight();
 458                     c--;
 459                 }
 460
 461                 AddConcatenate(startpos, Textpos() - startpos, true);
 462
 463                 if (c > 0) {
 464                     if (MoveRightGetChar() == '$')
 465                         AddUnitNode(ScanDollar());
 466                     AddConcatenate();
 467                 }
 468             }
 469
 470             return _concatenation;
 471         }
 472
 473         /*
 474          * Scans contents of [] (not including []'s), and converts to a
 475          * RegexCharClass.
 476          */
 477         internal RegexCharClass ScanCharClass(bool caseInsensitive) {
 478             return ScanCharClass(caseInsensitive, false);
 479         }
 480
 481         /*
 482          * Scans contents of [] (not including []'s), and converts to a
 483          * RegexCharClass.
 484          */
 485         internal RegexCharClass ScanCharClass(bool caseInsensitive, bool scanOnly) {
                 // caseInsensitive: when true (and not scanOnly), AddLowercase(_culture) is applied to
                 //                  the finished class; it is also forwarded to \p{..} handling.
                 // scanOnly:        when true no RegexCharClass is built and null is returned; the text
                 //                  is only consumed, and every use of 'cc' is guarded by !scanOnly.
                 // Throws (via MakeException) when the class is never closed by ']', and - only when
                 // building - for a class escape used inside a range, a reversed range, or a
                 // subtraction that is not the last element.
 486             char    ch = '\0';
 487             char    chPrev = '\0';
 488             bool inRange = false;
 489             bool firstChar = true;
 490             bool closed = false;
 491
 492             RegexCharClass cc;
 493
 494             cc = scanOnly ? null : new RegexCharClass();
 495
                 // A leading '^' negates the class.
 496             if (CharsRight() > 0 && RightChar() == '^') {
 497                 MoveRight();
 498                 if (!scanOnly)
 499                     cc.Negate = true;
 500             }
 501
                 // firstChar is cleared by the loop's iterator expression, which also runs on
                 // 'continue', so it is true only for the first element after the optional '^'.
 502             for ( ; CharsRight() > 0; firstChar = false) {
 503                 bool fTranslatedChar = false;
 504                 ch = MoveRightGetChar();
 505                 if (ch == ']') {
                         // A ']' in first position does not close the class; it falls through and is
                         // handled as an ordinary character below.
 506                     if (!firstChar) {
 507                         closed = true;
 508                         break;
 509                     }
 510                 }
 511                 else if (ch == '\\' && CharsRight() > 0) {
 512
                         // Class escapes (\d \s \w \p and their upper-case negations) add a whole set
                         // and are rejected when they appear as the end of a range.
 513                     switch (ch = MoveRightGetChar()) {
 514                         case 'D':
 515                         case 'd':
 516                             if (!scanOnly) {
 517                                 if (inRange)
 518                                     throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString()));
 519                                 cc.AddDigit(UseOptionE(), ch == 'D', _pattern);
 520                             }
 521                             continue;
 522
 523                         case 'S':
 524                         case 's':
 525                             if (!scanOnly) {
 526                                 if (inRange)
 527                                     throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString()));
 528                                 cc.AddSpace(UseOptionE(), ch == 'S');
 529                             }
 530                             continue;
 531
 532                         case 'W':
 533                         case 'w':
 534                             if (!scanOnly) {
 535                                 if (inRange)
 536                                     throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString()));
 537
 538                                 cc.AddWord(UseOptionE(), ch == 'W');
 539                             }
 540                             continue;
 541
 542                         case 'p':
 543                         case 'P':
 544                             if (!scanOnly) {
 545                                 if (inRange)
 546                                     throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString()));
 547                                 cc.AddCategoryFromName(ParseProperty(), (ch != 'p'), caseInsensitive, _pattern);
 548                             }
 549                             else
                                     // scan-only: the property name must still be consumed.
 550                                 ParseProperty();
 551
 552                             continue;
 553
                             // An escaped '-' is added as a literal hyphen.
 554                         case '-':
 555                             if (!scanOnly)
 556                                 cc.AddRange(ch, ch);
 557                             continue;
 558
                             // Any other escape: back up and let ScanCharEscape() decode it. fTranslatedChar
                             // marks the result as coming from an escape, so a decoded '[' or '-' is not
                             // mistaken for subtraction syntax in the checks below.
 559                         default:
 560                             MoveLeft();
 561                             ch = ScanCharEscape(); // non-literal character
 562                             fTranslatedChar = true;
 563                             break;          // this break will only break out of the switch
 564                     }
 565                 }
 566                 else if (ch == '[') {
 567                     // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
 568                     // It currently doesn't do anything other than skip the whole thing!
                         // If the text does not end in ":]" the position is restored to just after the
                         // '[' and the '[' is handled as an ordinary character below.
 569                     if (CharsRight() > 0 && RightChar() == ':' && !inRange) {
 570 //                        String name;
 571                         int savePos = Textpos();
 572
 573                         MoveRight();
 574 /*                        name = */ ScanCapname();
 575                         if (CharsRight() < 2 || MoveRightGetChar() != ':' || MoveRightGetChar() != ']')
 576                             Textto(savePos);
 577                         // else lookup name (nyi)
 578                     }
 579                 }
 580
 581
                     // From here on 'ch' is a single character; decide whether it ends a range,
                     // starts one, introduces a subtraction, or stands alone.
 582                 if (inRange) {
 583                     inRange = false;
                         // NOTE(review): when scanOnly is true nothing else happens here, so a '[' that
                         // ends a range is not scanned as a nested class in scan-only mode - confirm
                         // that this is intended.
 584                     if (!scanOnly) {
 585                         if (ch == '[' && !fTranslatedChar && !firstChar) {
 586                             // We thought we were in a range, but we're actually starting a subtraction.
 587                             // In that case, we'll add chPrev to our char class, skip the opening [, and
 588                             // scan the new character class recursively.
 589                             cc.AddChar(chPrev);
 590                             cc.AddSubtraction(ScanCharClass(caseInsensitive, false));
 591
 592                             if (CharsRight() > 0 && RightChar() != ']')
 593                                 throw MakeException(SR.GetString(SR.SubtractionMustBeLast));
 594                         }
 595                         else {
 596                             // a regular range, like a-z
 597                             if (chPrev > ch)
 598                                 throw MakeException(SR.GetString(SR.ReversedCharRange));
 599                             cc.AddRange(chPrev, ch);
 600                         }
 601                     }
 602                 }
 603                 else if (CharsRight() >= 2 && RightChar() == '-' && RightChar(1) != ']') {
 604                     // this could be the start of a range
                         // Remember ch as the low end and skip the '-'; the next element completes the range.
 605                     chPrev = ch;
 606                     inRange = true;
 607                     MoveRight();
 608                 }
 609                 else if (CharsRight() >= 1 && ch == '-' && !fTranslatedChar && RightChar() == '[' && !firstChar) {
 610                     // we aren't in a range, and now there is a subtraction.  Usually this happens
 611                     // only when a subtraction follows a range, like [a-z-[b]]
 612                     if (!scanOnly) {
 613                         MoveRight(1);
 614                         cc.AddSubtraction(ScanCharClass(caseInsensitive, false));
 615
 616                         if (CharsRight() > 0 && RightChar() != ']')
 617                             throw MakeException(SR.GetString(SR.SubtractionMustBeLast));
 618                     }
 619                     else {
                             // scan-only: still consume the nested class, but build nothing.
 620                         MoveRight(1);
 621                         ScanCharClass(caseInsensitive, true);
 622                     }
 623                 }
 624                 else {
                         // A single stand-alone character.
 625                     if (!scanOnly)
 626                         cc.AddRange(ch, ch);
 627                 }
 628             }
 629
                 // The loop ran out of input without seeing the closing ']'.
 630             if (!closed)
 631                 throw MakeException(SR.GetString(SR.UnterminatedBracket));
 632
 633             if (!scanOnly && caseInsensitive)
 634                 cc.AddLowercase(_culture);
 635
                 // null when scanOnly is true.
 636             return cc;
 637         }
 638
 639         /*
 640          * Scans chars following a '(' (not counting the '('), and returns
 641          * a RegexNode for the type of group scanned, or null if the group
 642          * simply changed options (?cimsx-cimsx) or was a comment (#...).
 643          */
 644         internal RegexNode ScanGroupOpen() {
 645             char ch = '\0';
 646             int NodeType;
 647             char close = '>';
 648
 649
 650             // just return a RegexNode if we have:
 651             // 1. "(" followed by nothing
 652             // 2. "(x" where x != ?
 653             // 3. "(?)"
 654             if (CharsRight() == 0 || RightChar() != '?' || (RightChar() == '?' && (CharsRight() > 1 && RightChar(1) == ')'))) {
 655                 if (UseOptionN() || _ignoreNextParen) {
 656                     _ignoreNextParen = false;
 657                     return new RegexNode(RegexNode.Group, _options);
 658                 }
 659                 else
 660                     return new RegexNode(RegexNode.Capture, _options, _autocap++, -1);
 661             }
 662
 663             MoveRight();
 664
 665             for (;;) {
 666                 if (CharsRight() == 0)
 667                     break;
 668
 669                 switch (ch = MoveRightGetChar()) {
 670                     case ':':
 671                         NodeType = RegexNode.Group;
 672                         break;
 673
 674                     case '=':
 675                         _options &= ~(RegexOptions.RightToLeft);
 676                         NodeType = RegexNode.Require;
 677                         break;
 678
 679                     case '!':
 680                         _options &= ~(RegexOptions.RightToLeft);
 681                         NodeType = RegexNode.Prevent;
 682                         break;
 683
 684                     case '>':
 685                         NodeType = RegexNode.Greedy;
 686                         break;
 687
 688                     case '\'':
 689                         close = '\'';
 690                         goto case '<';
 691                         // fallthrough
 692
 693                     case '<':
 694                         if (CharsRight() == 0)
 695                             goto BreakRecognize;
 696
 697                         switch (ch = MoveRightGetChar()) {
 698                             case '=':
 699                                 if (close == '\'')
 700                                     goto BreakRecognize;
 701
 702                                 _options |= RegexOptions.RightToLeft;
 703                                 NodeType = RegexNode.Require;
 704                                 break;
 705
 706                             case '!':
 707                                 if (close == '\'')
 708                                     goto BreakRecognize;
 709
 710                                 _options |= RegexOptions.RightToLeft;
 711                                 NodeType = RegexNode.Prevent;
 712                                 break;
 713
 714                             default:
 715                                 MoveLeft();
 716                                 int capnum = -1;
 717                                 int uncapnum = -1;
 718                                 bool proceed = false;
 719
 720                                 // grab part before -
 721
 722                                 if (ch >= '0' && ch <= '9') {
 723                                     capnum = ScanDecimal();
 724
 725                                     if (!IsCaptureSlot(capnum))
 726                                         capnum = -1;
 727
 728                                     // check if we have bogus characters after the number
 729                                     if (CharsRight() > 0 && !(RightChar() == close || RightChar() == '-'))
 730                                         throw MakeException(SR.GetString(SR.InvalidGroupName));
 731                                     if (capnum == 0)
 732                                         throw MakeException(SR.GetString(SR.CapnumNotZero));
 733                                 }
 734                                 else if (RegexCharClass.IsWordChar(ch)) {
 735                                     String capname = ScanCapname();
 736
 737                                     if (IsCaptureName(capname))
 738                                         capnum = CaptureSlotFromName(capname);
 739
 740                                     // check if we have bogus character after the name
 741                                     if (CharsRight() > 0 && !(RightChar() == close || RightChar() == '-'))
 742                                         throw MakeException(SR.GetString(SR.InvalidGroupName));
 743                                 }
 744                                 else if (ch == '-') {
 745                                     proceed = true;
 746                                 }
 747                                 else {
 748                                     // bad group name - starts with something other than a word character and isn't a number
 749                                     throw MakeException(SR.GetString(SR.InvalidGroupName));
 750                                 }
 751
 752                                 // grab part after - if any
 753
 754                                 if ((capnum != -1 || proceed == true) && CharsRight() > 0 && RightChar() == '-') {
 755                                     MoveRight();
 756                                     ch = RightChar();
 757
 758                                     if (ch >= '0' && ch <= '9') {
 759                                         uncapnum = ScanDecimal();
 760
 761                                         if (!IsCaptureSlot(uncapnum))
 762                                             throw MakeException(SR.GetString(SR.UndefinedBackref, uncapnum));
 763
 764                                         // check if we have bogus characters after the number
 765                                         if (CharsRight() > 0 && RightChar() != close)
 766                                             throw MakeException(SR.GetString(SR.InvalidGroupName));
 767                                     }
 768                                     else if (RegexCharClass.IsWordChar(ch)) {
 769                                         String uncapname = ScanCapname();
 770
 771                                         if (IsCaptureName(uncapname))
 772                                             uncapnum = CaptureSlotFromName(uncapname);
 773                                         else
 774                                             throw MakeException(SR.GetString(SR.UndefinedNameRef, uncapname));
 775
 776                                         // check if we have bogus character after the name
 777                                         if (CharsRight() > 0 && RightChar() != close)
 778                                             throw MakeException(SR.GetString(SR.InvalidGroupName));
 779                                     }
 780                                     else {
 781                                         // bad group name - starts with something other than a word character and isn't a number
 782                                         throw MakeException(SR.GetString(SR.InvalidGroupName));
 783                                     }
 784                                 }
 785
 786                                 // actually make the node
 787
 788                                 if ((capnum != -1 || uncapnum != -1) && CharsRight() > 0 && MoveRightGetChar() == close) {
 789                                     return new RegexNode(RegexNode.Capture, _options, capnum, uncapnum);
 790                                 }
 791                                 goto BreakRecognize;
 792                         }
 793                         break;
 794
 795                     case '(':
 796                         // alternation construct (?(...) | )
 797
 798                         int parenPos = Textpos();
 799                         if (CharsRight() > 0)
 800                         {
 801                             ch = RightChar();
 802
 803                             // check if the alternation condition is a backref
 804                             if (ch >= '0' && ch <= '9') {
 805                                 int capnum = ScanDecimal();
 806                                 if (CharsRight() > 0 && MoveRightGetChar() == ')') {
 807                                     if (IsCaptureSlot(capnum))
 808                                         return new RegexNode(RegexNode.Testref, _options, capnum);
 809                                     else
 810                                         throw MakeException(SR.GetString(SR.UndefinedReference, capnum.ToString(CultureInfo.CurrentCulture)));
 811                                 }
 812                                 else
 813                                     throw MakeException(SR.GetString(SR.MalformedReference, capnum.ToString(CultureInfo.CurrentCulture)));
 814
 815                             }
 816                             else if (RegexCharClass.IsWordChar(ch)) {
 817                                 String capname = ScanCapname();
 818
 819                                 if (IsCaptureName(capname) && CharsRight() > 0 && MoveRightGetChar() == ')')
 820                                     return new RegexNode(RegexNode.Testref, _options, CaptureSlotFromName(capname));
 821                             }
 822                         }
 823                         // not a backref
 824                         NodeType = RegexNode.Testgroup;
 825                         Textto(parenPos - 1);       // jump to the start of the parentheses
 826                         _ignoreNextParen = true;    // but make sure we don't try to capture the insides
 827
 828                         int charsRight = CharsRight();
 829                         if (charsRight >= 3 && RightChar(1) == '?') {
 830                             char rightchar2 = RightChar(2);
 831                             // disallow comments in the condition
 832                             if (rightchar2 == '#')
 833                                 throw MakeException(SR.GetString(SR.AlternationCantHaveComment));
 834
 835                             // disallow named capture group (?<..>..) in the condition
 836                             if (rightchar2 == '\'' )
 837                                 throw MakeException(SR.GetString(SR.AlternationCantCapture));
 838                             else {
 839                                 if (charsRight >=4 && (rightchar2 == '<' && RightChar(3) != '!' && RightChar(3) != '='))
 840                                     throw MakeException(SR.GetString(SR.AlternationCantCapture));
 841                             }
 842                         }
 843
 844                         break;
 845
 846
 847                     default:
 848                         MoveLeft();
 849
 850                         NodeType = RegexNode.Group;
 851                         ScanOptions();
 852                         if (CharsRight() == 0)
 853                             goto BreakRecognize;
 854
 855                         if ((ch = MoveRightGetChar()) == ')')
 856                             return null;
 857
 858                         if (ch != ':')
 859                             goto BreakRecognize;
 860                         break;
 861                 }
 862
 863                 return new RegexNode(NodeType, _options);
 864             }
 865
 866             BreakRecognize:
 867             ;
 868             // break Recognize comes here
 869
 870             throw MakeException(SR.GetString(SR.UnrecognizedGrouping));
 871         }
 872
 873         /*
 874          * Scans whitespace or x-mode comments.
 875          */
 876         internal void ScanBlank() {
 877             if (UseOptionX()) {
 878                 for (;;) {
 879                     while (CharsRight() > 0 && IsSpace(RightChar()))
 880                         MoveRight();
 881
 882                     if (CharsRight() == 0)
 883                         break;
 884
 885                     if (RightChar() == '#') {
 886                         while (CharsRight() > 0 && RightChar() != '\n')
 887                             MoveRight();
 888                     }
 889                     else if (CharsRight() >= 3 && RightChar(2) == '#' &&
 890                              RightChar(1) == '?' && RightChar() == '(') {
 891                         while (CharsRight() > 0 && RightChar() != ')')
 892                             MoveRight();
 893                         if (CharsRight() == 0)
 894                             throw MakeException(SR.GetString(SR.UnterminatedComment));
 895                         MoveRight();
 896                     }
 897                     else
 898                         break;
 899                 }
 900             }
 901             else {
 902                 for (;;) {
 903                     if (CharsRight() < 3 || RightChar(2) != '#' ||
 904                         RightChar(1) != '?' || RightChar() != '(')
 905                         return;
 906
 907                     while (CharsRight() > 0 && RightChar() != ')')
 908                         MoveRight();
 909                     if (CharsRight() == 0)
 910                         throw MakeException(SR.GetString(SR.UnterminatedComment));
 911                     MoveRight();
 912                 }
 913             }
 914         }
 915
 916         /*
 917          * Scans chars following a '\' (not counting the '\'), and returns
 918          * a RegexNode for the type of atom scanned.
 919          */
 920         internal RegexNode ScanBackslash() {
 921             char ch;
 922             RegexCharClass cc;
 923
 924             if (CharsRight() == 0)
 925                 throw MakeException(SR.GetString(SR.IllegalEndEscape));
 926
 927             switch (ch = RightChar()) {
 928                 case 'b':
 929                 case 'B':
 930                 case 'A':
 931                 case 'G':
 932                 case 'Z':
 933                 case 'z':
 934                     MoveRight();
 935                     return new RegexNode(TypeFromCode(ch), _options);
 936
 937                 case 'w':
 938                     MoveRight();
 939                     if (UseOptionE())
 940                         return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMAWordClass);
 941                     return new RegexNode(RegexNode.Set, _options, RegexCharClass.WordClass);
 942
 943                 case 'W':
 944                     MoveRight();
 945                     if (UseOptionE())
 946                         return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMAWordClass);
 947                     return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotWordClass);
 948
 949                 case 's':
 950                     MoveRight();
 951                     if (UseOptionE())
 952                         return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMASpaceClass);
 953                     return new RegexNode(RegexNode.Set, _options, RegexCharClass.SpaceClass);
 954
 955                 case 'S':
 956                     MoveRight();
 957                     if (UseOptionE())
 958                         return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMASpaceClass);
 959                     return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotSpaceClass);
 960
 961                 case 'd':
 962                     MoveRight();
 963                     if (UseOptionE())
 964                         return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMADigitClass);
 965                     return new RegexNode(RegexNode.Set, _options, RegexCharClass.DigitClass);
 966
 967                 case 'D':
 968                     MoveRight();
 969                     if (UseOptionE())
 970                         return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMADigitClass);
 971                     return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotDigitClass);
 972
 973                 case 'p':
 974                 case 'P':
 975                     MoveRight();
 976                     cc = new RegexCharClass();
 977                     cc.AddCategoryFromName(ParseProperty(), (ch != 'p'), UseOptionI(), _pattern);
 978                     if (UseOptionI())
 979                         cc.AddLowercase(_culture);
 980
 981                     return new RegexNode(RegexNode.Set, _options, cc.ToStringClass());
 982
 983                 default:
 984                     return ScanBasicBackslash();
 985             }
 986         }
 987
 988         /*
 989          * Scans \-style backreferences and character escapes
 990          */
 991         internal RegexNode ScanBasicBackslash() {
 992             if (CharsRight() == 0)
 993                 throw MakeException(SR.GetString(SR.IllegalEndEscape));
 994
 995             char ch;
 996             bool angled = false;
 997             char close = '\0';
 998             int backpos;
 999
1000             backpos = Textpos();
1001             ch = RightChar();
1002
1003             // allow \k<foo> instead of \<foo>, which is now deprecated
1004
1005             if (ch == 'k') {
1006                 if (CharsRight() >= 2) {
1007                     MoveRight();
1008                     ch = MoveRightGetChar();
1009
1010                     if (ch == '<' || ch == '\'') {
1011                         angled = true;
1012                         close = (ch == '\'') ? '\'' : '>';
1013                     }
1014                 }
1015
1016                 if (!angled || CharsRight() <= 0)
1017                     throw MakeException(SR.GetString(SR.MalformedNameRef));
1018
1019                 ch = RightChar();
1020             }
1021
1022             // Note angle without \g <
1023
1024             else if ((ch == '<' || ch == '\'') && CharsRight() > 1) {
1025                 angled = true;
1026                 close = (ch == '\'') ? '\'' : '>';
1027
1028                 MoveRight();
1029                 ch = RightChar();
1030             }
1031
1032             // Try to parse backreference: \<1> or \<cap>
1033
1034             if (angled && ch >= '0' && ch <= '9') {
1035                 int capnum = ScanDecimal();
1036
1037                 if (CharsRight() > 0 && MoveRightGetChar() == close) {
1038                     if (IsCaptureSlot(capnum))
1039                         return new RegexNode(RegexNode.Ref, _options, capnum);
1040                     else
1041                         throw MakeException(SR.GetString(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture)));
1042                 }
1043             }
1044
1045             // Try to parse backreference or octal: \1
1046
1047             else if (!angled && ch >= '1' && ch <= '9') {
1048                 if (UseOptionE()) {
1049                     int capnum = -1;
1050                     int newcapnum = (int)(ch - '0');
1051                     int pos = Textpos() - 1;
1052                     while (newcapnum <= _captop) {
1053                         if (IsCaptureSlot(newcapnum) && (_caps == null || (int)_caps[newcapnum] < pos))
1054                             capnum = newcapnum;
1055                         MoveRight();
1056                         if (CharsRight() == 0 || (ch = RightChar()) < '0' || ch > '9')
1057                             break;
1058                         newcapnum = newcapnum * 10 + (int)(ch - '0');
1059                     }
1060                     if (capnum >= 0)
1061                         return new RegexNode(RegexNode.Ref, _options, capnum);
1062                 } else
1063                 {
1064
1065                   int capnum = ScanDecimal();
1066                   if (IsCaptureSlot(capnum))
1067                       return new RegexNode(RegexNode.Ref, _options, capnum);
1068                   else if (capnum <= 9)
1069                       throw MakeException(SR.GetString(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture)));
1070                 }
1071             }
1072
1073             else if (angled && RegexCharClass.IsWordChar(ch)) {
1074                 String capname = ScanCapname();
1075
1076                 if (CharsRight() > 0 && MoveRightGetChar() == close) {
1077                     if (IsCaptureName(capname))
1078                         return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname));
1079                     else
1080                         throw MakeException(SR.GetString(SR.UndefinedNameRef, capname));
1081                 }
1082             }
1083
1084             // Not backreference: must be char code
1085
1086             Textto(backpos);
1087             ch = ScanCharEscape();
1088
1089             if (UseOptionI())
1090                 ch = Char.ToLower(ch, _culture);
1091
1092             return new RegexNode(RegexNode.One, _options, ch);
1093         }
1094
1095         /*
         * Scans $ patterns recognized within replacement patterns
1097          */
1098         internal RegexNode ScanDollar() {
1099             if (CharsRight() == 0)
1100                 return new RegexNode(RegexNode.One, _options, '$');
1101
1102             char ch = RightChar();
1103             bool angled;
1104             int backpos = Textpos();
1105             int lastEndPos = backpos;
1106
1107             // Note angle
1108
1109             if (ch == '{' && CharsRight() > 1) {
1110                 angled = true;
1111                 MoveRight();
1112                 ch = RightChar();
1113             }
1114             else {
1115                 angled = false;
1116             }
1117
1118             // Try to parse backreference: \1 or \{1} or \{cap}
1119
1120             if (ch >= '0' && ch <= '9') {
1121                 if (!angled && UseOptionE()) {
1122                     int capnum = -1;
1123                     int newcapnum = (int)(ch - '0');
1124                     MoveRight();
1125                     if (IsCaptureSlot(newcapnum)) {
1126                         capnum = newcapnum;
1127                         lastEndPos = Textpos();
1128                     }
1129
1130                     while (CharsRight() > 0 && (ch = RightChar()) >= '0' && ch <= '9') {
1131                         int digit = (int)(ch - '0');
1132                         if (newcapnum > (MaxValueDiv10) || (newcapnum == (MaxValueDiv10) && digit > (MaxValueMod10)))
1133                             throw MakeException(SR.GetString(SR.CaptureGroupOutOfRange));
1134
1135                         newcapnum = newcapnum * 10 + digit;
1136
1137                         MoveRight();
1138                         if (IsCaptureSlot(newcapnum)) {
1139                             capnum = newcapnum;
1140                             lastEndPos = Textpos();
1141                         }
1142                     }
1143                     Textto(lastEndPos);
1144                     if (capnum >= 0)
1145                         return new RegexNode(RegexNode.Ref, _options, capnum);
1146                 }
1147                 else
1148                 {
1149                     int capnum = ScanDecimal();
1150                     if (!angled || CharsRight() > 0 && MoveRightGetChar() == '}') {
1151                         if (IsCaptureSlot(capnum))
1152                             return new RegexNode(RegexNode.Ref, _options, capnum);
1153                     }
1154                 }
1155             }
1156             else if (angled && RegexCharClass.IsWordChar(ch)) {
1157                 String capname = ScanCapname();
1158
1159                 if (CharsRight() > 0 && MoveRightGetChar() == '}') {
1160                     if (IsCaptureName(capname))
1161                         return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname));
1162                 }
1163             }
1164             else if (!angled) {
1165                 int capnum = 1;
1166
1167                 switch (ch) {
1168                     case '$':
1169                         MoveRight();
1170                         return new RegexNode(RegexNode.One, _options, '$');
1171
1172                     case '&':
1173                         capnum = 0;
1174                         break;
1175
1176                     case '`':
1177                         capnum = RegexReplacement.LeftPortion;
1178                         break;
1179
1180                     case '\'':
1181                         capnum = RegexReplacement.RightPortion;
1182                         break;
1183
1184                     case '+':
1185                         capnum = RegexReplacement.LastGroup;
1186                         break;
1187
1188                     case '_':
1189                         capnum = RegexReplacement.WholeString;
1190                         break;
1191                 }
1192
1193                 if (capnum != 1) {
1194                     MoveRight();
1195                     return new RegexNode(RegexNode.Ref, _options, capnum);
1196                 }
1197             }
1198
1199             // unrecognized $: literalize
1200
1201             Textto(backpos);
1202             return new RegexNode(RegexNode.One, _options, '$');
1203         }
1204
1205         /*
1206          * Scans a capture name: consumes word chars
1207          */
1208         internal String ScanCapname() {
1209             int startpos = Textpos();
1210
1211             while (CharsRight() > 0) {
1212                 if (!RegexCharClass.IsWordChar(MoveRightGetChar())) {
1213                     MoveLeft();
1214                     break;
1215                 }
1216             }
1217
1218             return _pattern.Substring(startpos, Textpos() - startpos);
1219         }
1220
1221
1222         /*
         * Scans up to three octal digits; only the low 8 bits of the value are kept (max 0377).
1224          */
1225         internal char ScanOctal() {
1226             int d;
1227             int i;
1228             int c;
1229
1230             // Consume octal chars only up to 3 digits and value 0377
1231
1232             c = 3;
1233
1234             if (c > CharsRight())
1235                 c = CharsRight();
1236
1237             for (i = 0; c > 0 && (uint)(d = RightChar() - '0') <= 7; c -= 1) {
1238                 MoveRight();
1239                 i *= 8;
1240                 i += d;
1241                 if (UseOptionE() && i >= 0x20)
1242                     break;
1243             }
1244
1245             // Octal codes only go up to 255.  Any larger and the behavior that Perl follows
1246             // is simply to truncate the high bits.
1247             i &= 0xFF;
1248
1249             return(char)i;
1250         }
1251
1252         /*
         * Scans any number of decimal digits (throws CaptureGroupOutOfRange if the value would exceed 2^31-1)
1254          */
1255         internal int ScanDecimal() {
1256             int i = 0;
1257             int d;
1258
1259             while (CharsRight() > 0 && (uint)(d = (char)(RightChar() - '0')) <= 9) {
1260                 MoveRight();
1261
1262                 if (i > (MaxValueDiv10) || (i == (MaxValueDiv10) && d > (MaxValueMod10)))
1263                     throw MakeException(SR.GetString(SR.CaptureGroupOutOfRange));
1264
1265                 i *= 10;
1266                 i += d;
1267             }
1268
1269             return i;
1270         }
1271
1272         /*
1273          * Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1274          */
1275         internal char ScanHex(int c) {
1276             int i;
1277             int d;
1278
1279             i = 0;
1280
1281             if (CharsRight() >= c) {
1282                 for (; c > 0 && ((d = HexDigit(MoveRightGetChar())) >= 0); c -= 1) {
1283                     i *= 0x10;
1284                     i += d;
1285                 }
1286             }
1287
1288             if (c > 0)
1289                 throw MakeException(SR.GetString(SR.TooFewHex));
1290
1291             return(char)i;
1292         }
1293
1294         /*
         * Returns n <= 0xF for a hex digit, or -1 if ch is not a hex digit.
1296          */
1297         internal static int HexDigit(char ch) {
1298             int d;
1299
1300             if ((uint)(d = ch - '0') <= 9)
1301                 return d;
1302
1303             if ((uint)(d = ch - 'a') <= 5)
1304                 return d + 0xa;
1305
1306             if ((uint)(d = ch - 'A') <= 5)
1307                 return d + 0xa;
1308
1309             return -1;
1310         }
1311
1312         /*
1313          * Grabs and converts an ascii control character
1314          */
1315         internal char ScanControl() {
1316             char ch;
1317
1318             if (CharsRight() <= 0)
1319                 throw MakeException(SR.GetString(SR.MissingControl));
1320
1321             ch = MoveRightGetChar();
1322
1323             // \ca interpreted as \cA
1324
1325             if (ch >= 'a' && ch <= 'z')
1326                 ch = (char)(ch - ('a' - 'A'));
1327
1328             if ((ch = (char)(ch - '@')) < ' ')
1329                 return ch;
1330
1331             throw MakeException(SR.GetString(SR.UnrecognizedControl));
1332         }
1333
1334         /*
1335          * Returns true for options allowed only at the top level
1336          */
1337         internal bool IsOnlyTopOption(RegexOptions option) {
1338             return(option == RegexOptions.RightToLeft
1339 #if !(SILVERLIGHT||FULL_AOT_RUNTIME)
1340                 || option == RegexOptions.Compiled
1341 #endif
1342                 || option == RegexOptions.CultureInvariant
1343                 || option == RegexOptions.ECMAScript
1344             );
1345         }
1346
1347         /*
1348          * Scans cimsx-cimsx option string, stops at the first unrecognized char.
1349          */
1350         internal void ScanOptions() {
1351             char ch;
1352             bool off;
1353             RegexOptions option;
1354
1355             for (off = false; CharsRight() > 0; MoveRight()) {
1356                 ch = RightChar();
1357
1358                 if (ch == '-') {
1359                     off = true;
1360                 }
1361                 else if (ch == '+') {
1362                     off = false;
1363                 }
1364                 else {
1365                     option = OptionFromCode(ch);
1366                     if (option == 0 || IsOnlyTopOption(option))
1367                         return;
1368
1369                     if (off)
1370                         _options &= ~option;
1371                     else
1372                         _options |= option;
1373                 }
1374             }
1375         }
1376
1377         /*
1378          * Scans \ code for escape codes that map to single unicode chars.
1379          */
1380         internal char ScanCharEscape() {
1381             char ch;
1382
1383             ch = MoveRightGetChar();
1384
1385             if (ch >= '0' && ch <= '7') {
1386                 MoveLeft();
1387                 return ScanOctal();
1388             }
1389
1390             switch (ch) {
1391                 case 'x':
1392                     return ScanHex(2);
1393                 case 'u':
1394                     return ScanHex(4);
1395                 case 'a':
1396                     return '\u0007';
1397                 case 'b':
1398                     return '\b';
1399                 case 'e':
1400                     return '\u001B';
1401                 case 'f':
1402                     return '\f';
1403                 case 'n':
1404                     return '\n';
1405                 case 'r':
1406                     return '\r';
1407                 case 't':
1408                     return '\t';
1409                 case 'v':
1410                     return '\u000B';
1411                 case 'c':
1412                     return ScanControl();
1413                 default:
1414                     if (!UseOptionE() && RegexCharClass.IsWordChar(ch))
1415                         throw MakeException(SR.GetString(SR.UnrecognizedEscape, ch.ToString()));
1416                     return ch;
1417             }
1418         }
1419
1420         /*
1421          * Scans X for \p{X} or \P{X}
1422          */
1423         internal String ParseProperty() {
1424             if (CharsRight() < 3) {
1425                 throw MakeException(SR.GetString(SR.IncompleteSlashP));
1426             }
1427             char ch = MoveRightGetChar();
1428             if (ch != '{') {
1429                 throw MakeException(SR.GetString(SR.MalformedSlashP));
1430             }
1431
1432             int startpos = Textpos();
1433             while (CharsRight() > 0) {
1434                 ch = MoveRightGetChar();
1435                 if (!(RegexCharClass.IsWordChar(ch) || ch == '-')) {
1436                     MoveLeft();
1437                     break;
1438                 }
1439             }
1440             String capname = _pattern.Substring(startpos, Textpos() - startpos);
1441
1442             if (CharsRight() == 0 || MoveRightGetChar() != '}')
1443                 throw MakeException(SR.GetString(SR.IncompleteSlashP));
1444
1445             return capname;
1446         }
1447
1448         /*
1449          * Returns ReNode type for zero-length assertions with a \ code.
1450          */
1451         internal int TypeFromCode(char ch) {
1452             switch (ch) {
1453                 case 'b':
1454                     return UseOptionE() ? RegexNode.ECMABoundary : RegexNode.Boundary;
1455                 case 'B':
1456                     return UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.Nonboundary;
1457                 case 'A':
1458                     return RegexNode.Beginning;
1459                 case 'G':
1460                     return RegexNode.Start;
1461                 case 'Z':
1462                     return RegexNode.EndZ;
1463                 case 'z':
1464                     return RegexNode.End;
1465                 default:
1466                     return RegexNode.Nothing;
1467             }
1468         }
1469
1470         /*
1471          * Returns option bit from single-char (?cimsx) code.
1472          */
1473         internal static RegexOptions OptionFromCode(char ch) {
1474             // case-insensitive
1475             if (ch >= 'A' && ch <= 'Z')
1476                 ch += (char)('a' - 'A');
1477
1478             switch (ch) {
1479 #if !(SILVERLIGHT||FULL_AOT_RUNTIME)
1480                 case 'c':
1481                     return RegexOptions.Compiled;
1482 #endif
1483                 case 'i':
1484                     return RegexOptions.IgnoreCase;
1485                 case 'r':
1486                     return RegexOptions.RightToLeft;
1487                 case 'm':
1488                     return RegexOptions.Multiline;
1489                 case 'n':
1490                     return RegexOptions.ExplicitCapture;
1491                 case 's':
1492                     return RegexOptions.Singleline;
1493                 case 'x':
1494                     return RegexOptions.IgnorePatternWhitespace;
1495 #if DBG
1496                 case 'd':
1497                     return RegexOptions.Debug;
1498 #endif
1499                 case 'e':
1500                     return RegexOptions.ECMAScript;
1501                 default:
1502                     return 0;
1503             }
1504         }
1505
1506         /*
1507          * a prescanner for deducing the slots used for
1508          * captures by doing a partial tokenization of the pattern.
1509          */
1510         internal void CountCaptures() {
1511             char ch;
1512
1513             NoteCaptureSlot(0, 0);
1514
1515             _autocap = 1;
1516
1517             while (CharsRight() > 0) {
1518                 int pos = Textpos();
1519                 ch = MoveRightGetChar();
1520                 switch (ch) {
1521                     case '\\':
1522                         if (CharsRight() > 0)
1523                             MoveRight();
1524                         break;
1525
1526                     case '#':
1527                         if (UseOptionX()) {
1528                             MoveLeft();
1529                             ScanBlank();
1530                         }
1531                         break;
1532
1533                     case '[':
1534                         ScanCharClass(false, true);
1535                         break;
1536
1537                     case ')':
1538                         if (!EmptyOptionsStack())
1539                             PopOptions();
1540                         break;
1541
1542                     case '(':
1543                         if (CharsRight() >= 2 && RightChar(1) == '#' && RightChar() == '?') {
1544                             MoveLeft();
1545                             ScanBlank();
1546                         }
1547                         else {
1548
1549                             PushOptions();
1550                             if (CharsRight() > 0 && RightChar() == '?') {
1551                                 // we have (?...
1552                                 MoveRight();
1553
1554                                 if (CharsRight() > 1 && (RightChar() == '<' || RightChar() == '\'')) {
1555                                     // named group: (?<... or (?'...
1556
1557                                     MoveRight();
1558                                     ch = RightChar();
1559
1560                                     if (ch != '0' && RegexCharClass.IsWordChar(ch)) {
1561                                         //if (_ignoreNextParen)
1562                                         //    throw MakeException(SR.GetString(SR.AlternationCantCapture));
1563                                         if (ch >= '1' && ch <= '9')
1564                                             NoteCaptureSlot(ScanDecimal(), pos);
1565                                         else
1566                                             NoteCaptureName(ScanCapname(), pos);
1567                                     }
1568                                 }
1569                                 else {
1570                                     // (?...
1571
1572                                     // get the options if it's an option construct (?cimsx-cimsx...)
1573                                     ScanOptions();
1574
1575                                     if (CharsRight() > 0) {
1576                                         if (RightChar() == ')') {
1577                                             // (?cimsx-cimsx)
1578                                             MoveRight();
1579                                             PopKeepOptions();
1580                                         }
1581                                         else if (RightChar() == '(') {
1582                                             // alternation construct: (?(foo)yes|no)
1583                                             // ignore the next paren so we don't capture the condition
1584                                             _ignoreNextParen = true;
1585
1586                                             // break from here so we don't reset _ignoreNextParen
1587                                             break;
1588                                         }
1589                                     }
1590                                 }
1591                             }
1592                             else {
1593                                 if (!UseOptionN() && !_ignoreNextParen)
1594                                     NoteCaptureSlot(_autocap++, pos);
1595                             }
1596                         }
1597
1598                         _ignoreNextParen = false;
1599                         break;
1600                 }
1601             }
1602
1603             AssignNameSlots();
1604         }
1605
1606         /*
1607          * Records capture slot i as used, storing pos as its value. A slot that
1608          * has already been recorded is left untouched.
1609          */
1610         internal void NoteCaptureSlot(int i, int pos) {
1611             if (_caps.ContainsKey(i))
1612                 return;
1613
1614             // the value stored with the key isn't used in the parser
1615             _caps.Add(i, pos);
1616             _capcount++;
1617
1618             // _captop stays one past the highest slot seen; it is clamped at
1619             // Int32.MaxValue so that i + 1 cannot overflow
1620             if (i < _captop)
1621                 return;
1622             _captop = (i == Int32.MaxValue) ? i : i + 1;
1623         }
1624
1625         /*
1626          * Records a capture name the first time it is seen, storing pos as its value
1627          */
1628         internal void NoteCaptureName(String name, int pos) {
1629             if (null == _capnames) {
1630                 // the name table and the ordered name list are created on first use
1631                 _capnamelist = new List<String>();
1632 #if SILVERLIGHT
1633                 _capnames = new Dictionary<String, Int32>();
1634 #else
1635                 _capnames = new Hashtable();
1636 #endif
1637             }
1638             if (_capnames.ContainsKey(name))
1639                 return;
1640             _capnamelist.Add(name);
1641             _capnames.Add(name, pos);
1642         }
1643
1644         /*
1645          * Installs an already known set of captures: slot table, slot count and name table
1646          */
1647 #if SILVERLIGHT
1648         internal void NoteCaptures(Dictionary<Int32, Int32> caps, int capsize, Dictionary<String, Int32> capnames) {
1649 #else
1650         internal void NoteCaptures(Hashtable caps, int capsize, Hashtable capnames) {
1651 #endif
1652             _capnames = capnames;
1653             _capsize = capsize;
1654             _caps = caps;
1655         }
1656
1657         /*
1658          * Assigns unused slot numbers to the capture names, then rebuilds _capnumlist and _capnamelist from the results
1659          */
1660         internal void AssignNameSlots() {
1661             if (_capnames != null) {
1662                 for (int i = 0; i < _capnamelist.Count; i++) {
1663                     while (IsCaptureSlot(_autocap))    // skip slot numbers that are already in use
1664                         _autocap++;
1665                     string name = _capnamelist[i];
1666                     int pos = (int)_capnames[name];    // the value NoteCaptureName stored for this name
1667                     _capnames[name] = _autocap;        // from now on the name maps to its slot number
1668                     NoteCaptureSlot(_autocap, pos);
1669
1670                     _autocap++;
1671                 }
1672             }
1673
1674             // if the used slot numbers have at least one gap (fewer slots than _captop), construct the sorted list of used slots
1675
1676             if (_capcount < _captop) {
1677                 _capnumlist = new Int32[_capcount];
1678                 int i = 0;
1679
1680                 for (IDictionaryEnumerator de = _caps.GetEnumerator(); de.MoveNext(); )
1681                     _capnumlist[i++] = (int)de.Key;
1682
1683                 System.Array.Sort(_capnumlist, Comparer<Int32>.Default);
1684             }
1685
1686             // merge capnumlist into capnamelist: one name per slot, in increasing slot order
1687
1688             if (_capnames != null || _capnumlist != null) {
1689                 List<String> oldcapnamelist;
1690                 int next;
1691                 int k = 0;
1692
1693                 if (_capnames == null) {
1694                     oldcapnamelist = null;
1695 #if SILVERLIGHT
1696                     _capnames = new Dictionary<String, Int32>();
1697 #else
1698                     _capnames = new Hashtable();
1699 #endif
1700                     _capnamelist = new List<String>();
1701                     next = -1;    // no named captures to merge
1702                 }
1703                 else {
1704                     oldcapnamelist = _capnamelist;
1705                     _capnamelist = new List<String>();
1706                     next = (int)_capnames[oldcapnamelist[0]];    // slot of the first named capture
1707                 }
1708
1709                 for (int i = 0; i < _capcount; i++) {
1710                     int j = (_capnumlist == null) ? i : (int)_capnumlist[i];    // slot number of the i-th capture
1711
1712                     if (next == j) {
1713                         _capnamelist.Add(oldcapnamelist[k++]);
1714                         next = (k == oldcapnamelist.Count) ? -1 : (int)_capnames[oldcapnamelist[k]];
1715                     }
1716                     else {
1717                         String str = Convert.ToString(j, _culture);    // an unnamed slot is named by its number
1718                         _capnamelist.Add(str);
1719                         _capnames[str] = j;
1720                     }
1721                 }
1722             }
1723         }
1724
1725         /*
1726          * Maps a capture name to the slot number stored for it in _capnames
1727          */
1728         internal int CaptureSlotFromName(String capname) {
1729             return (Int32)_capnames[capname];
1730         }
1731
1732         /*
1733          * True if slot i is in use: recorded in _caps when that table exists,
1734          * otherwise inside the range 0 .. _capsize - 1
1735          */
1736         internal bool IsCaptureSlot(int i) {
1737             if (_caps == null)
1738                 return 0 <= i && i < _capsize;
1739             return _caps.ContainsKey(i);
1740         }
1741
1742         /*
1743          * True if capname is a known capture name. While there is no name table
1744          * (_capnames is null) no name is known.
1745          */
1746         internal bool IsCaptureName(String capname) {
1747             bool haveNames = _capnames != null;
1748
1749             return haveNames && _capnames.ContainsKey(capname);
1750         }
1751
1752         /*
1753          * True if the N option (RegexOptions.ExplicitCapture), which disables '(' autocapture, is on.
1754          */
1755         internal bool UseOptionN() {
1756             return (RegexOptions.ExplicitCapture & _options) != 0;
1757         }
1758
1759         /*
1760          * True if the I option (RegexOptions.IgnoreCase) for case-insensitivity is on.
1761          */
1762         internal bool UseOptionI() {
1763             return (RegexOptions.IgnoreCase & _options) != 0;
1764         }
1765
1766         /*
1767          * True if the M option (RegexOptions.Multiline), which alters the meaning of $ and ^, is on.
1768          */
1769         internal bool UseOptionM() {
1770             return (RegexOptions.Multiline & _options) != 0;
1771         }
1772
1773         /*
1774          * True if the S option (RegexOptions.Singleline), which alters the meaning of '.', is on.
1775          */
1776         internal bool UseOptionS() {
1777             return (RegexOptions.Singleline & _options) != 0;
1778         }
1779
1780         /*
1781          * True if the X option (RegexOptions.IgnorePatternWhitespace) for whitespace/comment mode is on.
1782          */
1783         internal bool UseOptionX() {
1784             return (RegexOptions.IgnorePatternWhitespace & _options) != 0;
1785         }
1786
1787         /*
1788          * True if the E option (RegexOptions.ECMAScript) for ECMAScript behavior is on.
1789          */
1790         internal bool UseOptionE() {
1791             return (RegexOptions.ECMAScript & _options) != 0;
1792         }
1793
1794         internal const byte Q = 5;    // quantifier
1795         internal const byte S = 4;    // ordinary stopper
1796         internal const byte Z = 3;    // ScanBlank stopper
1797         internal const byte X = 2;    // whitespace
1798         internal const byte E = 1;    // should be escaped
1799
1800         /*
1801          * For categorizing ascii characters (one entry per code 0x00-0x7F, 0 = no category). The categories are ordered, so callers test them with >=.
1802         */
1803         internal static readonly byte[] _category = new byte[] {
1804             // 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
1805                0,0,0,0,0,0,0,0,0,X,X,0,X,X,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1806             //   ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
1807                X,0,0,Z,S,0,0,0,S,S,Q,Q,0,0,S,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Q,
1808             // @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
1809                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,S,S,0,S,0,
1810             // ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
1811                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Q,S,0,0,0};
1812
1813         /*
1814          * True for chars categorized as S or Q (stoppers and quantifiers), which end a string of ordinary chars.
1815          */
1816         internal static bool IsSpecial(char ch) {
1817             return ch < '}' && S <= _category[ch];
1818         }
1819
1820         /*
1821          * True for chars categorized as X or above: whitespace, the ScanBlank stopper '#', stoppers and quantifiers.
1822          */
1823         internal static bool IsStopperX(char ch) {
1824             return ch < '}' && X <= _category[ch];
1825         }
1826
1827         /*
1828          * True for chars categorized as Q, i.e. those that begin a quantifier: '*', '+', '?' and '{'.
1829          */
1830         internal static bool IsQuantifier(char ch) {
1831             return ch < '|' && Q <= _category[ch];
1832         }
1833
1834         internal bool IsTrueQuantifier() {
1835             // true if a quantifier starts here; '{' counts only as {n}, {n,} or {n,m}
1836             int start = Textpos();
1837             int end = start + CharsRight();
1838             if (start == end)
1839                 return false;
1840             char c = CharAt(start);
1841             if (c != '{')
1842                 return IsQuantifier(c);
1843             int i = start + 1;
1844             while (i < end && (c = CharAt(i)) >= '0' && c <= '9')
1845                 i++;
1846             if (i == end || i == start + 1)
1847                 return false;
1848             if (c != ',')
1849                 return c == '}';
1850             do { i++; } while (i < end && (c = CharAt(i)) >= '0' && c <= '9');
1851             return i < end && c == '}';
1852         }
1853
1854         /*
1855          * True for the chars categorized as whitespace (X): tab, LF, FF, CR and space.
1856          */
1857         internal static bool IsSpace(char ch) {
1858             return ch < '!' && X == _category[ch];
1859         }
1860
1861         /*
1862          * True for chars that should be escaped: any char up to '|' whose category is nonzero.
1863          */
1864         internal static bool IsMetachar(char ch) {
1865             return ch < '}' && E <= _category[ch];
1866         }
1867
1868
1869         /*
1870          * Appends cch literal pattern chars, starting at pos, to the current concatenation:
1871          * a single char becomes a One node, a longer run a Multi node. The text is lowercased
1872          * first when the I option is on, unless it is replacement text.
1873          */
1874         internal void AddConcatenate(int pos, int cch, bool isReplacement) {
1875             if (cch == 0)
1876                 return;
1877
1878             bool lowercase = UseOptionI() && !isReplacement;
1879             RegexNode literal;
1880
1881             if (cch <= 1) {
1882                 char single = _pattern[pos];
1883                 if (lowercase)
1884                     single = Char.ToLower(single, _culture);
1885
1886                 literal = new RegexNode(RegexNode.One, _options, single);
1887             }
1888             else {
1889                 String text = _pattern.Substring(pos, cch);
1890                 if (lowercase) {
1891                     // Lowercase char by char instead of calling ToLower on the whole string, which
1892                     // could change a surrogate pair. Regex doesn't support surrogates, so being
1893                     // consistent matters more than being linguistically correct.
1894                     StringBuilder lowered = new StringBuilder(text.Length);
1895                     foreach (char c in text)
1896                         lowered.Append(Char.ToLower(c, _culture));
1897                     text = lowered.ToString();
1898                 }
1899
1900                 literal = new RegexNode(RegexNode.Multi, _options, text);
1901             }
1902
1903             _concatenation.AddChild(literal);
1904         }
1905
1906         /*
1907          * Push the parser state (in response to an open paren): the concatenation, alternation and group nodes are chained through _next on top of _stack
1908          */
1909         internal void PushGroup() {
1910             _group._next = _stack;                  // the group links to the previous top of the stack
1911             _alternation._next = _group;
1912             _concatenation._next = _alternation;
1913             _stack = _concatenation;                // the concatenation becomes the new top
1914         }
1915
1916         /*
1917          * Restores the parser state saved by PushGroup (in response to a ')')
1918          */
1919         internal void PopGroup() {
1920             _concatenation = _stack;
1921             _alternation = _concatenation._next;
1922             _group = _alternation._next;
1923             _stack = _group._next;
1924
1925             bool needsCondition = _group.Type() == RegexNode.Testgroup && _group.ChildCount() == 0;
1926             if (!needsCondition)
1927                 return;
1928             // the first () inside a Testgroup group goes directly to the group
1929             if (_unit == null)
1930                 throw MakeException(SR.GetString(SR.IllegalCondition));
1931             _group.AddChild(_unit);
1932             _unit = null;
1933         }
1934
1935         /*
1936          * True if no group state is currently saved on the stack.
1937          */
1938         internal bool EmptyStack() {
1939             return null == _stack;
1940         }
1941
1942         /*
1943          * Begins a new round (open paren or string start): openGroup becomes the current group, with a fresh alternation and concatenation
1944          */
1945         internal void StartGroup(RegexNode openGroup) {
1946             _concatenation = new RegexNode(RegexNode.Concatenate, _options);
1947             _alternation = new RegexNode(RegexNode.Alternate, _options);
1948             _group = openGroup;
1949         }
1950
1951         /*
1952          * Closes the current concatenation (in response to a |) and starts a new, empty one.
1953          * Inside a Testgroup or Testref group the finished branch becomes a direct child of
1954          * the group; everywhere else it is added to the current alternation.
1955          */
1956         internal void AddAlternate() {
1957             bool conditional = _group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref;
1958
1959             // pick the node that receives the finished branch
1960             RegexNode target = conditional ? _group : _alternation;
1961             target.AddChild(_concatenation.ReverseLeft());
1962
1963             // later units collect in a fresh concatenation
1964             _concatenation = new RegexNode(RegexNode.Concatenate, _options);
1965         }
1966
1967         /*
1968          * Finishes the current unit without a quantifier: the unit is appended to the
1969          * current concatenation and the parser is left with no pending unit
1970          */
1971         internal void AddConcatenate() {
1972             RegexNode finished = _unit;
1973             _concatenation.AddChild(finished);
1974             _unit = null;
1975         }
1976
1977         /*
1978          * Finishes the current unit with a quantifier: _unit.MakeQuantifier(lazy, min, max) is appended to the current concatenation
1979          */
1980         internal void AddConcatenate(bool lazy, int min, int max) {
1981             this._concatenation.AddChild(this._unit.MakeQuantifier(lazy, min, max));
1982             this._unit = null;
1983         }
1984
1985         /*
1986          * Gives back the unit that is currently pending (null when there is none)
1987          */
1988         internal RegexNode Unit() {
1989             return this._unit;
1990         }
1991
1992         /*
1993          * Sets the current unit to a One node for ch; ch is lowercased
1994          * with the parser's culture first when the I option is on
1995          */
1996         internal void AddUnitOne(char ch) {
1997             char literal = UseOptionI() ? Char.ToLower(ch, _culture) : ch;
1998
1999             _unit = new RegexNode(RegexNode.One, _options, literal);
2000         }
2001
2002         /*
2003          * Sets the current unit to a Notone node for ch; ch is lowercased
2004          * with the parser's culture first when the I option is on
2005          */
2006         internal void AddUnitNotone(char ch) {
2007             char excluded = UseOptionI() ? Char.ToLower(ch, _culture) : ch;
2008
2009             _unit = new RegexNode(RegexNode.Notone, _options, excluded);
2010         }
2011
2012         /*
2013          * Sets the current unit to a Set node built from cc
2014          */
2015         internal void AddUnitSet(string cc) {
2016             AddUnitNode(new RegexNode(RegexNode.Set, _options, cc));
2017         }
2018
2019         /*
2020          * Makes an already built subtree the current unit
2021          */
2022         internal void AddUnitNode(RegexNode node) {
2023             this._unit = node;
2024         }
2025
2026         /*
2027          * Sets the current unit to a new node of the given type, created with the current options
2028          */
2029         internal void AddUnitType(int type) {
2030             AddUnitNode(new RegexNode(type, _options));
2031         }
2032
2033         /*
2034          * Closes the current group (in response to a ')' or end) and makes it the current unit
2035          */
2036         internal void AddGroup() {
2037             bool conditional = _group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref;
2038             if (!conditional) {
2039                 _alternation.AddChild(_concatenation.ReverseLeft());
2040                 _group.AddChild(_alternation);
2041             }
2042             else {
2043                 // a conditional holds its branches directly: Testref allows 2 children, Testgroup 3
2044                 _group.AddChild(_concatenation.ReverseLeft());
2045                 if ((_group.Type() == RegexNode.Testref && _group.ChildCount() > 2) || _group.ChildCount() > 3)
2046                     throw MakeException(SR.GetString(SR.TooManyAlternates));
2047             }
2048             _unit = _group;
2049         }
2050
2051         /*
2052          * Saves the current options on top of the options stack.
2053          */
2054         internal void PushOptions() {
2055             this._optionsStack.Add(this._options);
2056         }
2057
2058         /*
2059          * Restores the options saved most recently and removes them from the stack.
2060          */
2061         internal void PopOptions() {
2062             _options = _optionsStack[_optionsStack.Count - 1];
2063             PopKeepOptions();
2064         }
2065
2066         /*
2067          * True if no options are saved on the options stack.
2068          */
2069         internal bool EmptyOptionsStack() {
2070             return 0 == _optionsStack.Count;
2071         }
2072
2073         /*
2074          * Discards the top of the options stack while leaving the current options as they are.
2075          */
2076         internal void PopKeepOptions() {
2077             this._optionsStack.RemoveAt(this._optionsStack.Count - 1);
2078         }
2079
2080         /*
2081          * Builds an ArgumentException whose text is the SR.MakeException resource filled in with the pattern and message
2082          */
2083         internal ArgumentException MakeException(String message) {
2084             return new ArgumentException(SR.GetString(SR.MakeException, this._pattern, message));
2085         }
2086
2087         /*
2088          * Gives the current parsing position, an index into the pattern.
2089          */
2090         internal int Textpos() {
2091             return this._currentPos;
2092         }
2093
2094         /*
2095          * Jumps to the given parsing position; pos is not range-checked here.
2096          */
2097         internal void Textto(int pos) {
2098             this._currentPos = pos;
2099         }
2100
2101         /*
2102          * Reads the char at the current parsing position and steps one position to the right.
2103          */
2104         internal char MoveRightGetChar() {
2105             return CharAt(_currentPos++);
2106         }
2107
2108         /*
2109          * Advances the current parsing position by one char.
2110          */
2111         internal void MoveRight() {
2112             ++_currentPos;
2113         }
2114
2115         internal void MoveRight(int i) {
2116             Textto(Textpos() + i);    // shifts the parsing position by i chars
2117         }
2118
2119         /*
2120          * Steps the current parsing position back by one char.
2121          */
2122         internal void MoveLeft() {
2123             _currentPos -= 1;
2124         }
2125
2126         /*
2127          * Returns the pattern char at index i, independent of the current parsing position.
2128          */
2129         internal char CharAt(int i) {
2130             return this._pattern[i];
2131         }
2132
2133         /*
2134          * Peeks at the char at the current parsing position without moving.
2135          */
2136         internal char RightChar() {
2137             return CharAt(_currentPos);
2138         }
2139
2140         /*
2141          * Peeks at the char i positions to the right of the current parsing position without moving.
2142          */
2143         internal char RightChar(int i) {
2144             return CharAt(_currentPos + i);
2145         }
2146
2147         /*
2148          * Counts the pattern chars from the current parsing position to the end of the pattern.
2149          */
2150         internal int CharsRight() {
2151             return _pattern.Length - Textpos();
2152         }
2153     }
2154 }