-//\r
-// assembly: System\r
-// namespace: System.Text.RegularExpressions\r
-// file: parser.cs\r
-//\r
-// author: Dan Lewis (dlewis@gmx.co.uk)\r
-// (c) 2002\r
-\r
-using System;\r
-using System.Collections;\r
-using System.Globalization;\r
-\r
-namespace System.Text.RegularExpressions.Syntax {\r
-\r
- class Parser {\r
- public static int ParseDecimal (string str, ref int ptr) {\r
- return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);\r
- }\r
-\r
- public static int ParseOctal (string str, ref int ptr) {\r
- return ParseNumber (str, ref ptr, 8, 1, 3);\r
- }\r
-\r
- public static int ParseHex (string str, ref int ptr, int digits) {\r
- return ParseNumber (str, ref ptr, 16, digits, digits);\r
- }\r
-\r
- public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {\r
- int p = ptr, n = 0, digits = 0, d;\r
- if (max < min)\r
- max = Int32.MaxValue;\r
-\r
- while (digits < max && p < str.Length) {\r
- d = ParseDigit (str[p ++], b, digits);\r
- if (d < 0) {\r
- -- p;\r
- break;\r
- }\r
-\r
- n = n * b + d;\r
- ++ digits;\r
- }\r
-\r
- if (digits < min)\r
- return -1;\r
-\r
- ptr = p;\r
- return n;\r
- }\r
-\r
- public static string ParseName (string str, ref int ptr) {\r
- if (Char.IsDigit (str[ptr])) {\r
- int gid = ParseNumber (str, ref ptr, 10, 1, 0);\r
- if (gid > 0)\r
- return gid.ToString ();\r
- \r
- return null;\r
- }\r
-\r
- int start = ptr;\r
- for (;;) {\r
- if (!IsNameChar (str[ptr]))\r
- break;\r
- ++ ptr;\r
- }\r
-\r
- if (ptr - start > 0)\r
- return str.Substring (start, ptr - start);\r
-\r
- return null;\r
- }\r
-\r
- public static string Escape (string str) {\r
- string result = "";\r
- for (int i = 0; i < str.Length; ++ i) {\r
- char c = str[i];\r
- switch (c) {\r
- case '\\': case '*': case '+': case '?': case '|':\r
- case '{': case '[': case '(': case ')': case '^':\r
- case '$': case '.': case '#': case ' ':\r
- result += "\\" + c;\r
- break;\r
-\r
- case '\t': result += "\\t"; break;\r
- case '\n': result += "\\n"; break;\r
- case '\r': result += "\\r"; break;\r
- case '\f': result += "\\f"; break;\r
-\r
- default: result += c; break;\r
- }\r
- }\r
-\r
- return result;\r
- }\r
-\r
- public static string Unescape (string str) {\r
- return new Parser ().ParseString (str);\r
- }\r
-\r
- // public instance\r
-\r
- public Parser () {\r
- this.caps = new ArrayList ();\r
- this.refs = new Hashtable ();\r
- }\r
-\r
- public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {\r
- this.pattern = pattern;\r
- this.ptr = 0;\r
-\r
- caps.Clear ();\r
- refs.Clear ();\r
- this.num_groups = 0;\r
-\r
- try {\r
- RegularExpression re = new RegularExpression ();\r
- ParseGroup (re, options, null);\r
- ResolveReferences ();\r
-\r
- re.GroupCount = num_groups;\r
- \r
- return re;\r
- }\r
- catch (IndexOutOfRangeException) {\r
- throw NewParseException ("Unexpected end of pattern.");\r
- }\r
- }\r
-\r
- public IDictionary GetMapping () {\r
- Hashtable mapping = new Hashtable ();\r
- int end = caps.Count;\r
- mapping.Add ("0", 0);\r
- for (int i = 0; i < end;) {\r
- CapturingGroup group = (CapturingGroup) caps [i];\r
- i++;\r
- if (group.Name != null && !mapping.Contains (group.Name))\r
- mapping.Add (group.Name, group.Number);\r
- else\r
- mapping.Add (i.ToString (), i);\r
- }\r
-\r
- return mapping;\r
- }\r
-\r
- // private methods\r
-\r
- private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {\r
- bool is_top_level = group is RegularExpression;\r
- \r
- Alternation alternation = null;\r
- string literal = null;\r
-\r
- Group current = new Group ();\r
- Expression expr = null;\r
- bool closed = false;\r
-\r
- while (true) {\r
- ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
- if (ptr >= pattern.Length)\r
- break;\r
- \r
- // (1) Parse for Expressions\r
- \r
- char ch = pattern[ptr ++];\r
- \r
- switch (ch) {\r
- case '^': {\r
- Position pos =\r
- IsMultiline (options) ? Position.StartOfLine : Position.Start;\r
- expr = new PositionAssertion (pos);\r
- break;\r
- }\r
-\r
- case '$': {\r
- Position pos =\r
- IsMultiline (options) ? Position.EndOfLine : Position.End;\r
- expr = new PositionAssertion (pos);\r
- break;\r
- }\r
-\r
- case '.': {\r
- Category cat =\r
- IsSingleline (options) ? Category.AnySingleline : Category.Any;\r
- expr = new CharacterClass (cat, false);\r
- break;\r
- }\r
-\r
- case '\\': {\r
- int c = ParseEscape ();\r
- if (c >= 0)\r
- ch = (char)c;\r
- else {\r
- expr = ParseSpecial (options);\r
-\r
- if (expr == null)\r
- ch = pattern[ptr ++]; // default escape\r
- }\r
- break;\r
- }\r
-\r
- case '[': {\r
- expr = ParseCharacterClass (options);\r
- break;\r
- }\r
-\r
- case '(': {\r
- bool ignore = IsIgnoreCase (options);\r
- expr = ParseGroupingConstruct (ref options);\r
- if (expr == null) {\r
- if (literal != null && IsIgnoreCase (options) != ignore) {\r
- current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));\r
- literal = null;\r
- }\r
-\r
- continue;\r
- }\r
- break;\r
- }\r
-\r
- case ')': {\r
- closed = true;\r
- goto EndOfGroup;\r
- }\r
-\r
- case '|': {\r
- if (literal != null) {\r
- current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));\r
- literal = null;\r
- }\r
-\r
- if (assertion != null) {\r
- if (assertion.TrueExpression == null)\r
- assertion.TrueExpression = current;\r
- else if (assertion.FalseExpression == null)\r
- assertion.FalseExpression = current;\r
- else\r
- throw NewParseException ("Too many | in (?()|).");\r
- }\r
- else {\r
- if (alternation == null)\r
- alternation = new Alternation ();\r
-\r
- alternation.AddAlternative (current);\r
- }\r
-\r
- current = new Group ();\r
- continue;\r
- }\r
-\r
- case '*': case '+': case '?': {\r
- throw NewParseException ("Bad quantifier.");\r
- }\r
-\r
- default: \r
- break; // literal character\r
- }\r
-\r
- ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
- \r
- // (2) Check for Repetitions\r
- \r
- if (ptr < pattern.Length) {\r
- char k = pattern[ptr];\r
-\r
- if (k == '?' || k == '*' || k == '+' || k == '{') {\r
- ++ ptr;\r
-\r
- int min = 0, max = 0;\r
- bool lazy = false;\r
-\r
- switch (k) {\r
- case '?': min = 0; max = 1; break;\r
- case '*': min = 0; max = 0xffff; break;\r
- case '+': min = 1; max = 0xffff; break;\r
- case '{': ParseRepetitionBounds (out min, out max, options); break;\r
- }\r
-\r
- ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
- if (ptr < pattern.Length && pattern[ptr] == '?') {\r
- ++ ptr;\r
- lazy = true;\r
- }\r
-\r
- Repetition repetition = new Repetition (min, max, lazy);\r
-\r
- if (expr == null)\r
- repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));\r
- else\r
- repetition.Expression = expr;\r
-\r
- expr = repetition;\r
- }\r
- }\r
-\r
- // (3) Append Expression and/or Literal\r
-\r
- if (expr == null) {\r
- if (literal == null)\r
- literal = "";\r
- literal += ch;\r
- }\r
- else {\r
- if (literal != null) {\r
- current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));\r
- literal = null;\r
- }\r
-\r
- current.AppendExpression (expr);\r
- expr = null;\r
- }\r
-\r
- if (is_top_level && ptr >= pattern.Length)\r
- goto EndOfGroup;\r
- }\r
-\r
- EndOfGroup:\r
- if (is_top_level && closed)\r
- throw NewParseException ("Too many )'s.");\r
- if (!is_top_level && !closed)\r
- throw NewParseException ("Not enough )'s.");\r
- \r
- \r
- // clean up literals and alternations\r
-\r
- if (literal != null)\r
- current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));\r
-\r
- if (assertion != null) {\r
- if (assertion.TrueExpression == null)\r
- assertion.TrueExpression = current;\r
- else\r
- assertion.FalseExpression = current;\r
- \r
- group.AppendExpression (assertion);\r
- }\r
- else if (alternation != null) {\r
- alternation.AddAlternative (current);\r
- group.AppendExpression (alternation);\r
- }\r
- else\r
- group.AppendExpression (current);\r
- }\r
-\r
- private Expression ParseGroupingConstruct (ref RegexOptions options) {\r
- if (pattern[ptr] != '?') {\r
- Group group;\r
-\r
- if (IsExplicitCapture (options))\r
- group = new Group ();\r
- else {\r
- group = new CapturingGroup ();\r
- caps.Add (group);\r
- }\r
-\r
- ParseGroup (group, options, null);\r
- return group;\r
- }\r
- else\r
- ++ ptr;\r
-\r
- switch (pattern[ptr]) {\r
- case ':': { // non-capturing group\r
- ++ ptr;\r
- Group group = new Group ();\r
- ParseGroup (group, options, null);\r
-\r
- return group;\r
- }\r
-\r
- case '>': { // non-backtracking group\r
- ++ ptr;\r
- Group group = new NonBacktrackingGroup ();\r
- ParseGroup (group, options, null);\r
- \r
- return group;\r
- }\r
-\r
- case 'i': case 'm': case 'n':\r
- case 's': case 'x': case '-': { // options\r
- RegexOptions o = options;\r
- ParseOptions (ref o, false);\r
- if (pattern[ptr] == '-') {\r
- ++ ptr;\r
- ParseOptions (ref o, true);\r
- }\r
-\r
- if (pattern[ptr] == ':') { // pass options to child group\r
- ++ ptr;\r
- Group group = new Group ();\r
- ParseGroup (group, o, null);\r
- return group;\r
- }\r
- else if (pattern[ptr] == ')') { // change options of enclosing group\r
- ++ ptr;\r
- options = o;\r
- return null;\r
- }\r
- else\r
- throw NewParseException ("Bad options");\r
- }\r
-\r
- case '<': case '=': case '!': { // lookahead/lookbehind\r
- ExpressionAssertion asn = new ExpressionAssertion ();\r
- if (!ParseAssertionType (asn))\r
- goto case '\''; // it's a (?<name> ) construct\r
-\r
- Group test = new Group ();\r
- ParseGroup (test, options, null);\r
-\r
- asn.TestExpression = test;\r
- return asn;\r
- }\r
-\r
- case '\'': { // named/balancing group\r
- char delim;\r
- if (pattern[ptr] == '<')\r
- delim = '>';\r
- else\r
- delim = '\'';\r
-\r
- ++ ptr;\r
- string name = ParseName ();\r
-\r
- if (pattern[ptr] == delim) {\r
- // capturing group\r
-\r
- if (name == null)\r
- throw NewParseException ("Bad group name.");\r
-\r
- ++ ptr;\r
- CapturingGroup cap = new CapturingGroup ();\r
- cap.Name = name;\r
- caps.Add (cap);\r
- ParseGroup (cap, options, null);\r
-\r
- return cap;\r
- }\r
- else if (pattern[ptr] == '-') {\r
- // balancing group\r
-\r
- ++ ptr;\r
- string balance_name = ParseName ();\r
- if (balance_name == null || pattern[ptr] != delim)\r
- throw NewParseException ("Bad balancing group name.");\r
-\r
- ++ ptr;\r
- BalancingGroup bal = new BalancingGroup ();\r
- bal.Name = name;\r
- caps.Add (bal);\r
- refs.Add (bal, balance_name);\r
-\r
- return bal;\r
- }\r
- else\r
- throw NewParseException ("Bad group name.");\r
- }\r
-\r
- case '(': { // expression/capture test\r
- Assertion asn;\r
- \r
- ++ ptr;\r
- int p = ptr;\r
- string name = ParseName ();\r
- if (name == null || pattern[ptr] != ')') { // expression test\r
- // FIXME MS implementation doesn't seem to\r
- // implement this version of (?(x) ...)\r
-\r
- ptr = p;\r
- ExpressionAssertion expr_asn = new ExpressionAssertion ();\r
-\r
- if (pattern[ptr] == '?') {\r
- ++ ptr;\r
- if (!ParseAssertionType (expr_asn))\r
- throw NewParseException ("Bad conditional.");\r
- }\r
- else {\r
- expr_asn.Negate = false;\r
- expr_asn.Reverse = false;\r
- }\r
-\r
- Group test = new Group ();\r
- ParseGroup (test, options, null);\r
- expr_asn.TestExpression = test;\r
- asn = expr_asn;\r
- }\r
- else { // capture test\r
- ++ ptr;\r
- asn = new CaptureAssertion ();\r
- refs.Add (asn, name);\r
- }\r
-\r
- Group group = new Group ();\r
- ParseGroup (group, options, asn);\r
- return group;\r
- }\r
-\r
- case '#': { // comment\r
- ++ ptr;\r
- while (pattern[ptr ++] != ')') {\r
- if (ptr >= pattern.Length)\r
- throw NewParseException ("Unterminated (?#...) comment.");\r
- }\r
- return null;\r
- }\r
-\r
- default: // error\r
- throw NewParseException ("Bad grouping construct.");\r
- }\r
- }\r
-\r
- private bool ParseAssertionType (ExpressionAssertion assertion) {\r
- if (pattern[ptr] == '<') {\r
- switch (pattern[ptr + 1]) {\r
- case '=':\r
- assertion.Negate = false;\r
- break;\r
- case '!':\r
- assertion.Negate = true;\r
- break;\r
- default:\r
- return false;\r
- }\r
-\r
- assertion.Reverse = true;\r
- ptr += 2;\r
- }\r
- else {\r
- switch (pattern[ptr]) {\r
- case '=':\r
- assertion.Negate = false;\r
- break;\r
- case '!':\r
- assertion.Negate = true;\r
- break;\r
- default:\r
- return false;\r
- }\r
-\r
- assertion.Reverse = false;\r
- ptr += 1;\r
- }\r
-\r
- return true;\r
- }\r
-\r
- private void ParseOptions (ref RegexOptions options, bool negate) {\r
- for (;;) {\r
- switch (pattern[ptr]) {\r
- case 'i':\r
- if (negate)\r
- options &= ~RegexOptions.IgnoreCase;\r
- else\r
- options |= RegexOptions.IgnoreCase;\r
- break;\r
-\r
- case 'm':\r
- if (negate)\r
- options &= ~RegexOptions.Multiline;\r
- else\r
- options |= RegexOptions.Multiline;\r
- break;\r
- \r
- case 'n':\r
- if (negate)\r
- options &= ~RegexOptions.ExplicitCapture;\r
- else\r
- options |= RegexOptions.ExplicitCapture;\r
- break;\r
- \r
- case 's':\r
- if (negate)\r
- options &= ~RegexOptions.Singleline;\r
- else\r
- options |= RegexOptions.Singleline;\r
- break;\r
- \r
- case 'x':\r
- if (negate)\r
- options &= ~RegexOptions.IgnorePatternWhitespace;\r
- else\r
- options |= RegexOptions.IgnorePatternWhitespace;\r
- break;\r
-\r
- default:\r
- return;\r
- }\r
-\r
- ++ ptr;\r
- }\r
- }\r
-\r
- private Expression ParseCharacterClass (RegexOptions options) {\r
- bool negate, ecma;\r
- if (pattern[ptr] == '^') {\r
- negate = true;\r
- ++ ptr;\r
- }\r
- else\r
- negate = false;\r
- \r
- ecma = IsECMAScript (options);\r
- CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));\r
-\r
- if (pattern[ptr] == ']') {\r
- cls.AddCharacter (']');\r
- ++ ptr;\r
- }\r
-\r
- int c = -1;\r
- int last = -1;\r
- bool range = false;\r
- bool closed = false;\r
- while (ptr < pattern.Length) {\r
- c = pattern[ptr ++];\r
-\r
- if (c == ']') {\r
- closed = true;\r
- break;\r
- }\r
- \r
- if (c == '-') {\r
- range = true;\r
- continue;\r
- }\r
-\r
- if (c == '\\') {\r
- c = ParseEscape ();\r
- if (c < 0) {\r
- // didn't recognize escape\r
-\r
- c = pattern[ptr ++];\r
- switch (c) {\r
- case 'b': c = '\b'; break;\r
-\r
- case 'd':\r
- cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);\r
- last = -1;\r
- continue;\r
- \r
- case 'w':\r
- cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);\r
- last = -1;\r
- continue;\r
- \r
- case 's':\r
- cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);\r
- last = -1;\r
- continue;\r
- \r
- case 'p':\r
- cls.AddCategory (ParseUnicodeCategory (), false); // ignore ecma\r
- last = -1;\r
- continue;\r
- \r
- case 'D':\r
- cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);\r
- last = -1;\r
- continue;\r
- \r
- case 'W':\r
- cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);\r
- last = -1;\r
- continue;\r
- \r
- case 'S':\r
- cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);\r
- last = -1;\r
- continue;\r
- \r
- case 'P':\r
- cls.AddCategory (ParseUnicodeCategory (), true);\r
- last = -1;\r
- continue;\r
-\r
- default: break; // add escaped character\r
- }\r
- }\r
- }\r
-\r
- if (range) {\r
- if (c < last)\r
- throw NewParseException ("[x-y] range in reverse order.");\r
-\r
- if (last >=0 )\r
- cls.AddRange ((char)last, (char)c);\r
- else {\r
- cls.AddCharacter ((char)c);\r
- cls.AddCharacter ('-');\r
- }\r
-\r
- range = false;\r
- last = -1;\r
- }\r
- else {\r
- cls.AddCharacter ((char)c);\r
- last = c;\r
- }\r
- }\r
-\r
- if (!closed)\r
- throw NewParseException ("Unterminated [] set.");\r
-\r
- if (range)\r
- cls.AddCharacter ('-');\r
-\r
- return cls;\r
- }\r
-\r
- private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {\r
- int n, m;\r
-\r
- /* check syntax */\r
-\r
- ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
- n = ParseNumber (10, 1, 0);\r
- if (n < 0)\r
- throw NewParseException ("Illegal {x,y} - bad value of x.");\r
-\r
- ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
- switch (pattern[ptr ++]) {\r
- case '}':\r
- m = n;\r
- break;\r
- case ',':\r
- ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
- m = ParseNumber (10, 1, 0);\r
- ConsumeWhitespace (IsIgnorePatternWhitespace (options));\r
- if (pattern[ptr ++] != '}')\r
- throw NewParseException ("Illegal {x,y} - bad value of y.");\r
- break;\r
- default:\r
- throw NewParseException ("Illegal {x,y}");\r
- }\r
-\r
- /* check bounds and ordering */\r
-\r
- if (n >= 0xffff || m >= 0xffff)\r
- throw NewParseException ("Illegal {x, y} - maximum of 65535.");\r
- if (m >= 0 && m < n)\r
- throw NewParseException ("Illegal {x, y} with x > y.");\r
-\r
- /* assign min and max */\r
- \r
- min = n;\r
- if (m > 0)\r
- max = m;\r
- else\r
- max = 0xffff;\r
- }\r
-\r
- private Category ParseUnicodeCategory () {\r
- if (pattern[ptr ++] != '{')\r
- throw NewParseException ("Incomplete \\p{X} character escape.");\r
-\r
- string name = ParseName (pattern, ref ptr);\r
- if (name == null)\r
- throw NewParseException ("Incomplete \\p{X} character escape.");\r
-\r
- Category cat = CategoryUtils.CategoryFromName (name);\r
- if (cat == Category.None)\r
- throw NewParseException ("Unknown property '" + name + "'.");\r
-\r
- if (pattern[ptr ++] != '}')\r
- throw NewParseException ("Incomplete \\p{X} character escape.");\r
-\r
- return cat;\r
- }\r
-\r
- private Expression ParseSpecial (RegexOptions options) {\r
- int p = ptr;\r
- bool ecma = IsECMAScript (options);\r
- Expression expr = null;\r
- \r
- switch (pattern[ptr ++]) {\r
-\r
- // categories\r
-\r
- case 'd':\r
- expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);\r
- break;\r
- \r
- case 'w':\r
- expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);\r
- break;\r
- \r
- case 's':\r
- expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);\r
- break;\r
- \r
- case 'p':\r
- // this is odd - ECMAScript isn't supposed to support Unicode,\r
- // yet \p{..} compiles and runs under the MS implementation\r
- // identically to canonical mode. That's why I'm ignoring the\r
- // value of ecma here.\r
- \r
- expr = new CharacterClass (ParseUnicodeCategory (), false);\r
- break;\r
- \r
- case 'D':\r
- expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);\r
- break;\r
- \r
- case 'W':\r
- expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);\r
- break;\r
- \r
- case 'S':\r
- expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);\r
- break;\r
- \r
- case 'P':\r
- expr = new CharacterClass (ParseUnicodeCategory (), true);\r
- break;\r
-\r
- // positions\r
-\r
- case 'A': expr = new PositionAssertion (Position.StartOfString); break;\r
- case 'Z': expr = new PositionAssertion (Position.End); break;\r
- case 'z': expr = new PositionAssertion (Position.EndOfString); break;\r
- case 'G': expr = new PositionAssertion (Position.StartOfScan); break;\r
- case 'b': expr = new PositionAssertion (Position.Boundary); break;\r
- case 'B': expr = new PositionAssertion (Position.NonBoundary); break;\r
- \r
- // references\r
-\r
- case '1': case '2': case '3': case '4': case '5':\r
- case '6': case '7': case '8': case '9': {\r
- ptr --;\r
- int n = ParseNumber (10, 1, 0);\r
- if (n < 0) {\r
- ptr = p;\r
- return null;\r
- }\r
-\r
- // FIXME test if number is within number of assigned groups\r
- // this may present a problem for right-to-left matching\r
-\r
- Reference reference = new Reference (IsIgnoreCase (options));\r
- refs.Add (reference, n.ToString ());\r
- expr = reference;\r
- break;\r
- }\r
-\r
- case 'k': {\r
- char delim = pattern[ptr ++];\r
- if (delim == '<')\r
- delim = '>';\r
- else if (delim != '\'')\r
- throw NewParseException ("Malformed \\k<...> named backreference.");\r
-\r
- string name = ParseName ();\r
- if (name == null || pattern[ptr] != delim)\r
- throw NewParseException ("Malformed \\k<...> named backreference.");\r
-\r
- ++ ptr;\r
- Reference reference = new Reference (IsIgnoreCase (options));\r
- refs.Add (reference, name);\r
- expr = reference;\r
- break;\r
- }\r
-\r
- default:\r
- expr = null;\r
- break;\r
- }\r
-\r
- if (expr == null)\r
- ptr = p;\r
-\r
- return expr;\r
- }\r
-\r
- private int ParseEscape () {\r
- int p = ptr;\r
- int c;\r
-\r
- if (p >= pattern.Length)\r
- throw new ArgumentException (\r
- String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + \r
- "pattern.", pattern), pattern);\r
- \r
- switch (pattern[ptr ++]) {\r
- \r
- // standard escapes (except \b)\r
-\r
- case 'a': return '\u0007';\r
- case 't': return '\u0009';\r
- case 'r': return '\u000d';\r
- case 'v': return '\u000b';\r
- case 'f': return '\u000c';\r
- case 'n': return '\u000a';\r
- case 'e': return '\u001b';\r
- case '\\': return '\\';\r
-\r
- // character codes\r
-\r
- case '0': return ParseOctal (pattern, ref ptr);\r
-\r
- case 'x':\r
- c = ParseHex (pattern, ref ptr, 2);\r
- if (c < 0)\r
- throw NewParseException ("Insufficient hex digits");\r
-\r
- return c;\r
-\r
- case 'u':\r
- c = ParseHex (pattern, ref ptr, 4);\r
- if (c < 0)\r
- throw NewParseException ("Insufficient hex digits");\r
- \r
- return c;\r
-\r
- // control characters\r
-\r
- case 'c':\r
- c = pattern[p ++];\r
- if (c >= 'A' && c <= 'Z')\r
- return c - 'A';\r
- else if (c >= '@' && c <= '_')\r
- return c - '@';\r
- else\r
- throw NewParseException ("Unrecognized control character.");\r
-\r
- // unknown escape\r
-\r
- default:\r
- ptr = p;\r
- return -1;\r
- }\r
- }\r
-\r
- private string ParseName () {\r
- return Parser.ParseName (pattern, ref ptr);\r
- }\r
-\r
- private static bool IsNameChar (char c) {\r
- UnicodeCategory cat = Char.GetUnicodeCategory (c);\r
- if (cat == UnicodeCategory.ModifierLetter)\r
- return false;\r
- if (cat == UnicodeCategory.ConnectorPunctuation)\r
- return true;\r
- return Char.IsLetterOrDigit (c);\r
- }\r
- \r
- private int ParseNumber (int b, int min, int max) {\r
- return Parser.ParseNumber (pattern, ref ptr, b, min, max);\r
- }\r
-\r
- private int ParseDecimal () {\r
- return Parser.ParseDecimal (pattern, ref ptr);\r
- }\r
-\r
- private static int ParseDigit (char c, int b, int n) {\r
- switch (b) {\r
- case 8:\r
- if (c >= '0' && c <= '7')\r
- return c - '0';\r
- else\r
- return -1;\r
- case 10:\r
- if (c >= '0' && c <= '9')\r
- return c - '0';\r
- else\r
- return -1;\r
- case 16:\r
- if (c >= '0' && c <= '9')\r
- return c - '0';\r
- else if (c >= 'a' && c <= 'f')\r
- return 10 + c - 'a';\r
- else if (c >= 'A' && c <= 'F')\r
- return 10 + c - 'A';\r
- else\r
- return -1;\r
- default:\r
- return -1;\r
- }\r
- }\r
-\r
- private void ConsumeWhitespace (bool ignore) {\r
- while (true) {\r
- if (ptr >= pattern.Length)\r
- break;\r
- \r
- if (pattern[ptr] == '(') {\r
- if (ptr + 3 >= pattern.Length)\r
- return;\r
-\r
- if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')\r
- return;\r
-\r
- ptr += 3;\r
- while (pattern[ptr ++] != ')')\r
- /* ignore */ ;\r
- }\r
- else if (ignore && pattern[ptr] == '#') {\r
- while (ptr < pattern.Length && pattern[ptr ++] != '\n')\r
- /* ignore */ ;\r
- }\r
- else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {\r
- while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))\r
- ++ ptr;\r
- }\r
- else\r
- return;\r
- }\r
- }\r
-\r
- private string ParseString (string pattern) {\r
- this.pattern = pattern;\r
- this.ptr = 0;\r
-\r
- string result = "";\r
- while (ptr < pattern.Length) {\r
- int c = pattern[ptr];\r
- if (c == '\\')\r
- c = ParseEscape ();\r
- ptr ++; \r
- result += (char)c;\r
- }\r
-\r
- return result;\r
- }\r
-\r
- private void ResolveReferences () {\r
- int gid = 1;\r
- Hashtable dict = new Hashtable ();\r
-\r
- // number unnamed groups\r
-\r
- foreach (CapturingGroup group in caps) {\r
- if (group.Name == null) {\r
- dict.Add (gid.ToString (), group);\r
- group.Number = gid ++;\r
-\r
- ++ num_groups;\r
- }\r
- }\r
-\r
- // number named groups\r
-\r
- foreach (CapturingGroup group in caps) {\r
- if (group.Name != null) {\r
- if (!dict.Contains (group.Name)) {\r
- dict.Add (group.Name, group);\r
- group.Number = gid ++;\r
-\r
- ++ num_groups;\r
- }\r
- else {\r
- CapturingGroup prev = (CapturingGroup)dict[group.Name];\r
- group.Number = prev.Number;\r
- }\r
- }\r
- }\r
-\r
- // resolve references\r
-\r
- foreach (Expression expr in refs.Keys) {\r
- string name = (string)refs[expr];\r
- if (!dict.Contains (name)) {\r
- throw NewParseException ("Reference to undefined group " +\r
- (Char.IsDigit (name[0]) ? "number " : "name ") +\r
- name);\r
- }\r
-\r
- CapturingGroup group = (CapturingGroup)dict[name];\r
- if (expr is Reference)\r
- ((Reference)expr).CapturingGroup = group;\r
- else if (expr is CaptureAssertion)\r
- ((CaptureAssertion)expr).CapturingGroup = group;\r
- else if (expr is BalancingGroup)\r
- ((BalancingGroup)expr).Balance = group;\r
- }\r
- }\r
-\r
- // flag helper functions\r
-\r
- private static bool IsIgnoreCase (RegexOptions options) {\r
- return (options & RegexOptions.IgnoreCase) != 0;\r
- }\r
-\r
- private static bool IsMultiline (RegexOptions options) {\r
- return (options & RegexOptions.Multiline) != 0;\r
- }\r
-\r
- private static bool IsExplicitCapture (RegexOptions options) {\r
- return (options & RegexOptions.ExplicitCapture) != 0;\r
- }\r
- \r
- private static bool IsSingleline (RegexOptions options) {\r
- return (options & RegexOptions.Singleline) != 0;\r
- }\r
-\r
- private static bool IsIgnorePatternWhitespace (RegexOptions options) {\r
- return (options & RegexOptions.IgnorePatternWhitespace) != 0;\r
- }\r
-\r
- private static bool IsRightToLeft (RegexOptions options) {\r
- return (options & RegexOptions.RightToLeft) != 0;\r
- }\r
-\r
- private static bool IsECMAScript (RegexOptions options) {\r
- return (options & RegexOptions.ECMAScript) != 0;\r
- }\r
-\r
- // exception creation\r
-\r
- private ArgumentException NewParseException (string msg) {\r
- msg = "parsing \"" + pattern + "\" - " + msg;\r
- return new ArgumentException (msg, pattern);\r
- }\r
-\r
- private string pattern;\r
- private int ptr;\r
-\r
- private ArrayList caps;\r
- private Hashtable refs;\r
- private int num_groups;\r
- }\r
-}\r
+//
+// assembly: System
+// namespace: System.Text.RegularExpressions
+// file: parser.cs
+//
+// author: Dan Lewis (dlewis@gmx.co.uk)
+// (c) 2002
+
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+using System;
+using System.Collections;
+using System.Globalization;
+
+namespace System.Text.RegularExpressions.Syntax {
+
+ class Parser {
+ public static int ParseDecimal (string str, ref int ptr) {
+ return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
+ }
+
+ public static int ParseOctal (string str, ref int ptr) {
+ return ParseNumber (str, ref ptr, 8, 1, 3);
+ }
+
+ public static int ParseHex (string str, ref int ptr, int digits) {
+ return ParseNumber (str, ref ptr, 16, digits, digits);
+ }
+
+ public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
+ int p = ptr, n = 0, digits = 0, d;
+ if (max < min)
+ max = Int32.MaxValue;
+
+ while (digits < max && p < str.Length) {
+ d = ParseDigit (str[p ++], b, digits);
+ if (d < 0) {
+ -- p;
+ break;
+ }
+
+ n = n * b + d;
+ ++ digits;
+ }
+
+ if (digits < min)
+ return -1;
+
+ ptr = p;
+ return n;
+ }
+
+ public static string ParseName (string str, ref int ptr) {
+ if (Char.IsDigit (str[ptr])) {
+ int gid = ParseNumber (str, ref ptr, 10, 1, 0);
+ if (gid > 0)
+ return gid.ToString ();
+
+ return null;
+ }
+
+ int start = ptr;
+ for (;;) {
+ if (!IsNameChar (str[ptr]))
+ break;
+ ++ ptr;
+ }
+
+ if (ptr - start > 0)
+ return str.Substring (start, ptr - start);
+
+ return null;
+ }
+
+ public static string Escape (string str) {
+ string result = "";
+ for (int i = 0; i < str.Length; ++ i) {
+ char c = str[i];
+ switch (c) {
+ case '\\': case '*': case '+': case '?': case '|':
+ case '{': case '[': case '(': case ')': case '^':
+ case '$': case '.': case '#': case ' ':
+ result += "\\" + c;
+ break;
+
+ case '\t': result += "\\t"; break;
+ case '\n': result += "\\n"; break;
+ case '\r': result += "\\r"; break;
+ case '\f': result += "\\f"; break;
+
+ default: result += c; break;
+ }
+ }
+
+ return result;
+ }
+
+ public static string Unescape (string str) {
+ if (str.IndexOf ('\\') == -1)
+ return str;
+ return new Parser ().ParseString (str);
+ }
+
+ // public instance
+
+ public Parser () {
+ this.caps = new ArrayList ();
+ this.refs = new Hashtable ();
+ }
+
+ public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
+ this.pattern = pattern;
+ this.ptr = 0;
+
+ caps.Clear ();
+ refs.Clear ();
+ this.num_groups = 0;
+
+ try {
+ RegularExpression re = new RegularExpression ();
+ ParseGroup (re, options, null);
+ ResolveReferences ();
+
+ re.GroupCount = num_groups;
+
+ return re;
+ }
+ catch (IndexOutOfRangeException) {
+ throw NewParseException ("Unexpected end of pattern.");
+ }
+ }
+
+ public IDictionary GetMapping ()
+ {
+ Hashtable mapping = new Hashtable ();
+ int end = caps.Count;
+ mapping.Add ("0", 0);
+ for (int i = 0; i < end; i++) {
+ CapturingGroup group = (CapturingGroup) caps [i];
+ if (group.Name != null) {
+ if (mapping.Contains (group.Name)) {
+ if ((int) mapping [group.Name] != group.Number)
+ throw new SystemException ("invalid state");
+ continue;
+ }
+ mapping.Add (group.Name, group.Number);
+ } else {
+ mapping.Add (group.Number.ToString (), group.Number);
+ }
+ }
+
+ return mapping;
+ }
+
+ // private methods
+
+ private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
+ bool is_top_level = group is RegularExpression;
+
+ Alternation alternation = null;
+ string literal = null;
+
+ Group current = new Group ();
+ Expression expr = null;
+ bool closed = false;
+
+ while (true) {
+ ConsumeWhitespace (IsIgnorePatternWhitespace (options));
+ if (ptr >= pattern.Length)
+ break;
+
+ // (1) Parse for Expressions
+
+ char ch = pattern[ptr ++];
+
+ switch (ch) {
+ case '^': {
+ Position pos =
+ IsMultiline (options) ? Position.StartOfLine : Position.Start;
+ expr = new PositionAssertion (pos);
+ break;
+ }
+
+ case '$': {
+ Position pos =
+ IsMultiline (options) ? Position.EndOfLine : Position.End;
+ expr = new PositionAssertion (pos);
+ break;
+ }
+
+ case '.': {
+ Category cat =
+ IsSingleline (options) ? Category.AnySingleline : Category.Any;
+ expr = new CharacterClass (cat, false);
+ break;
+ }
+
+ case '\\': {
+ int c = ParseEscape ();
+ if (c >= 0)
+ ch = (char)c;
+ else {
+ expr = ParseSpecial (options);
+
+ if (expr == null)
+ ch = pattern[ptr ++]; // default escape
+ }
+ break;
+ }
+
+ case '[': {
+ expr = ParseCharacterClass (options);
+ break;
+ }
+
+ case '(': {
+ bool ignore = IsIgnoreCase (options);
+ expr = ParseGroupingConstruct (ref options);
+ if (expr == null) {
+ if (literal != null && IsIgnoreCase (options) != ignore) {
+ current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
+ literal = null;
+ }
+
+ continue;
+ }
+ break;
+ }
+
+ case ')': {
+ closed = true;
+ goto EndOfGroup;
+ }
+
+ case '|': {
+ if (literal != null) {
+ current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
+ literal = null;
+ }
+
+ if (assertion != null) {
+ if (assertion.TrueExpression == null)
+ assertion.TrueExpression = current;
+ else if (assertion.FalseExpression == null)
+ assertion.FalseExpression = current;
+ else
+ throw NewParseException ("Too many | in (?()|).");
+ }
+ else {
+ if (alternation == null)
+ alternation = new Alternation ();
+
+ alternation.AddAlternative (current);
+ }
+
+ current = new Group ();
+ continue;
+ }
+
+ case '*': case '+': case '?': {
+ throw NewParseException ("Bad quantifier.");
+ }
+
+ default:
+ break; // literal character
+ }
+
+ ConsumeWhitespace (IsIgnorePatternWhitespace (options));
+
+ // (2) Check for Repetitions
+
+ if (ptr < pattern.Length) {
+ char k = pattern[ptr];
+ int min = 0, max = 0;
+ bool lazy = false;
+ bool haveRep = false;
+
+
+ if (k == '?' || k == '*' || k == '+') {
+ ++ ptr;
+ haveRep = true;
+
+ switch (k) {
+ case '?': min = 0; max = 1; break;
+ case '*': min = 0; max = 0x7fffffff; break;
+ case '+': min = 1; max = 0x7fffffff; break;
+ }
+ } else if (k == '{' && ptr + 1 < pattern.Length) {
+ int saved_ptr = ptr;
+ ++ptr;
+ haveRep = ParseRepetitionBounds (out min, out max, options);
+ if (!haveRep)
+ ptr = saved_ptr;
+ }
+
+ if (haveRep) {
+ ConsumeWhitespace (IsIgnorePatternWhitespace (options));
+ if (ptr < pattern.Length && pattern[ptr] == '?') {
+ ++ ptr;
+ lazy = true;
+ }
+
+ Repetition repetition = new Repetition (min, max, lazy);
+
+ if (expr == null)
+ repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
+ else
+ repetition.Expression = expr;
+
+ expr = repetition;
+ }
+ }
+
+ // (3) Append Expression and/or Literal
+
+ if (expr == null) {
+ if (literal == null)
+ literal = "";
+ literal += ch;
+ }
+ else {
+ if (literal != null) {
+ current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
+ literal = null;
+ }
+
+ current.AppendExpression (expr);
+ expr = null;
+ }
+
+ if (is_top_level && ptr >= pattern.Length)
+ goto EndOfGroup;
+ }
+
+ EndOfGroup:
+ if (is_top_level && closed)
+ throw NewParseException ("Too many )'s.");
+ if (!is_top_level && !closed)
+ throw NewParseException ("Not enough )'s.");
+
+
+ // clean up literals and alternations
+
+ if (literal != null)
+ current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
+
+ if (assertion != null) {
+ if (assertion.TrueExpression == null)
+ assertion.TrueExpression = current;
+ else
+ assertion.FalseExpression = current;
+
+ group.AppendExpression (assertion);
+ }
+ else if (alternation != null) {
+ alternation.AddAlternative (current);
+ group.AppendExpression (alternation);
+ }
+ else
+ group.AppendExpression (current);
+ }
+
+ private Expression ParseGroupingConstruct (ref RegexOptions options) {
+ if (pattern[ptr] != '?') {
+ Group group;
+
+ if (IsExplicitCapture (options))
+ group = new Group ();
+ else {
+ group = new CapturingGroup ();
+ caps.Add (group);
+ }
+
+ ParseGroup (group, options, null);
+ return group;
+ }
+ else
+ ++ ptr;
+
+ switch (pattern[ptr]) {
+ case ':': { // non-capturing group
+ ++ ptr;
+ Group group = new Group ();
+ ParseGroup (group, options, null);
+
+ return group;
+ }
+
+ case '>': { // non-backtracking group
+ ++ ptr;
+ Group group = new NonBacktrackingGroup ();
+ ParseGroup (group, options, null);
+
+ return group;
+ }
+
+ case 'i': case 'm': case 'n':
+ case 's': case 'x': case '-': { // options
+ RegexOptions o = options;
+ ParseOptions (ref o, false);
+ if (pattern[ptr] == '-') {
+ ++ ptr;
+ ParseOptions (ref o, true);
+ }
+
+ if (pattern[ptr] == ':') { // pass options to child group
+ ++ ptr;
+ Group group = new Group ();
+ ParseGroup (group, o, null);
+ return group;
+ }
+ else if (pattern[ptr] == ')') { // change options of enclosing group
+ ++ ptr;
+ options = o;
+ return null;
+ }
+ else
+ throw NewParseException ("Bad options");
+ }
+
+ case '<': case '=': case '!': { // lookahead/lookbehind
+ ExpressionAssertion asn = new ExpressionAssertion ();
+ if (!ParseAssertionType (asn))
+ goto case '\''; // it's a (?<name> ) construct
+
+ Group test = new Group ();
+ ParseGroup (test, options, null);
+
+ asn.TestExpression = test;
+ return asn;
+ }
+
+ case '\'': { // named/balancing group
+ char delim;
+ if (pattern[ptr] == '<')
+ delim = '>';
+ else
+ delim = '\'';
+
+ ++ ptr;
+ string name = ParseName ();
+
+ if (pattern[ptr] == delim) {
+ // capturing group
+
+ if (name == null)
+ throw NewParseException ("Bad group name.");
+
+ ++ ptr;
+ CapturingGroup cap = new CapturingGroup ();
+ cap.Name = name;
+ caps.Add (cap);
+ ParseGroup (cap, options, null);
+
+ return cap;
+ }
+ else if (pattern[ptr] == '-') {
+ // balancing group
+
+ ++ ptr;
+ string balance_name = ParseName ();
+ if (balance_name == null || pattern[ptr] != delim)
+ throw NewParseException ("Bad balancing group name.");
+
+ ++ ptr;
+ BalancingGroup bal = new BalancingGroup ();
+ bal.Name = name;
+
+ if(bal.IsNamed) {
+ caps.Add (bal);
+ }
+
+ refs.Add (bal, balance_name);
+
+ ParseGroup (bal, options, null);
+
+ return bal;
+ }
+ else
+ throw NewParseException ("Bad group name.");
+ }
+
+ case '(': { // expression/capture test
+ Assertion asn;
+
+ ++ ptr;
+ int p = ptr;
+ string name = ParseName ();
+ if (name == null || pattern[ptr] != ')') { // expression test
+ // FIXME MS implementation doesn't seem to
+ // implement this version of (?(x) ...)
+
+ ptr = p;
+ ExpressionAssertion expr_asn = new ExpressionAssertion ();
+
+ if (pattern[ptr] == '?') {
+ ++ ptr;
+ if (!ParseAssertionType (expr_asn))
+ throw NewParseException ("Bad conditional.");
+ }
+ else {
+ expr_asn.Negate = false;
+ expr_asn.Reverse = false;
+ }
+
+ Group test = new Group ();
+ ParseGroup (test, options, null);
+ expr_asn.TestExpression = test;
+ asn = expr_asn;
+ }
+ else { // capture test
+ ++ ptr;
+ asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
+ refs.Add (asn, name);
+ }
+
+ Group group = new Group ();
+ ParseGroup (group, options, asn);
+ return group;
+ }
+
+ case '#': { // comment
+ ++ ptr;
+ while (pattern[ptr ++] != ')') {
+ if (ptr >= pattern.Length)
+ throw NewParseException ("Unterminated (?#...) comment.");
+ }
+ return null;
+ }
+
+ default: // error
+ throw NewParseException ("Bad grouping construct.");
+ }
+ }
+
+ private bool ParseAssertionType (ExpressionAssertion assertion) {
+ if (pattern[ptr] == '<') {
+ switch (pattern[ptr + 1]) {
+ case '=':
+ assertion.Negate = false;
+ break;
+ case '!':
+ assertion.Negate = true;
+ break;
+ default:
+ return false;
+ }
+
+ assertion.Reverse = true;
+ ptr += 2;
+ }
+ else {
+ switch (pattern[ptr]) {
+ case '=':
+ assertion.Negate = false;
+ break;
+ case '!':
+ assertion.Negate = true;
+ break;
+ default:
+ return false;
+ }
+
+ assertion.Reverse = false;
+ ptr += 1;
+ }
+
+ return true;
+ }
+
+ private void ParseOptions (ref RegexOptions options, bool negate) {
+ for (;;) {
+ switch (pattern[ptr]) {
+ case 'i':
+ if (negate)
+ options &= ~RegexOptions.IgnoreCase;
+ else
+ options |= RegexOptions.IgnoreCase;
+ break;
+
+ case 'm':
+ if (negate)
+ options &= ~RegexOptions.Multiline;
+ else
+ options |= RegexOptions.Multiline;
+ break;
+
+ case 'n':
+ if (negate)
+ options &= ~RegexOptions.ExplicitCapture;
+ else
+ options |= RegexOptions.ExplicitCapture;
+ break;
+
+ case 's':
+ if (negate)
+ options &= ~RegexOptions.Singleline;
+ else
+ options |= RegexOptions.Singleline;
+ break;
+
+ case 'x':
+ if (negate)
+ options &= ~RegexOptions.IgnorePatternWhitespace;
+ else
+ options |= RegexOptions.IgnorePatternWhitespace;
+ break;
+
+ default:
+ return;
+ }
+
+ ++ ptr;
+ }
+ }
+
+ private Expression ParseCharacterClass (RegexOptions options) {
+ bool negate = false;
+ if (pattern[ptr] == '^') {
+ negate = true;
+ ++ ptr;
+ }
+
+ bool ecma = IsECMAScript (options);
+ CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
+
+ if (pattern[ptr] == ']') {
+ cls.AddCharacter (']');
+ ++ ptr;
+ }
+
+ int c = -1;
+ int last = -1;
+ bool range = false;
+ bool closed = false;
+ while (ptr < pattern.Length) {
+ c = pattern[ptr ++];
+
+ if (c == ']') {
+ closed = true;
+ break;
+ }
+
+ if (c == '-' && last >= 0 && !range) {
+ range = true;
+ continue;
+ }
+
+ if (c == '\\') {
+ c = ParseEscape ();
+ if (c >= 0)
+ goto char_recognized;
+
+ // didn't recognize escape
+ c = pattern [ptr ++];
+ switch (c) {
+ case 'b':
+ c = '\b';
+ goto char_recognized;
+
+ case 'd': case 'D':
+ cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
+ break;
+
+ case 'w': case 'W':
+ cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
+ break;
+
+ case 's': case 'S':
+ cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
+ break;
+
+ case 'p': case 'P':
+ cls.AddCategory (ParseUnicodeCategory (), c == 'P'); // ignore ecma
+ break;
+
+ default: // add escaped character
+ goto char_recognized;
+ }
+
+ // if the pattern looks like [a-\s] ...
+ if (range)
+ throw NewParseException ("character range cannot have category \\" + c);
+
+ last = -1;
+ continue;
+ }
+
+ char_recognized:
+ if (range) {
+ // if 'range' is true, we know that 'last >= 0'
+ if (c < last)
+ throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
+ cls.AddRange ((char)last, (char)c);
+ last = -1;
+ range = false;
+ continue;
+ }
+
+ cls.AddCharacter ((char)c);
+ last = c;
+ }
+
+ if (!closed)
+ throw NewParseException ("Unterminated [] set.");
+
+ if (range)
+ cls.AddCharacter ('-');
+
+ return cls;
+ }
+
+ private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
+ int n, m;
+ min = max = 0;
+
+ /* check syntax */
+
+ ConsumeWhitespace (IsIgnorePatternWhitespace (options));
+
+ if (pattern[ptr] == ',') {
+ n = -1;
+ } else {
+ n = ParseNumber (10, 1, 0);
+ ConsumeWhitespace (IsIgnorePatternWhitespace (options));
+ }
+
+ switch (pattern[ptr ++]) {
+ case '}':
+ m = n;
+ break;
+ case ',':
+ ConsumeWhitespace (IsIgnorePatternWhitespace (options));
+ m = ParseNumber (10, 1, 0);
+ ConsumeWhitespace (IsIgnorePatternWhitespace (options));
+ if (pattern[ptr ++] != '}')
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* check bounds and ordering */
+
+ if (n > 0x7fffffff || m > 0x7fffffff)
+ throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
+ if (m >= 0 && m < n)
+ throw NewParseException ("Illegal {x, y} with x > y.");
+
+ /* assign min and max */
+
+ min = n;
+ if (m > 0)
+ max = m;
+ else
+ max = 0x7fffffff;
+
+ return true;
+ }
+
+ private Category ParseUnicodeCategory () {
+ if (pattern[ptr ++] != '{')
+ throw NewParseException ("Incomplete \\p{X} character escape.");
+
+ string name = ParseName (pattern, ref ptr);
+ if (name == null)
+ throw NewParseException ("Incomplete \\p{X} character escape.");
+
+ Category cat = CategoryUtils.CategoryFromName (name);
+ if (cat == Category.None)
+ throw NewParseException ("Unknown property '" + name + "'.");
+
+ if (pattern[ptr ++] != '}')
+ throw NewParseException ("Incomplete \\p{X} character escape.");
+
+ return cat;
+ }
+
+ private Expression ParseSpecial (RegexOptions options) {
+ int p = ptr;
+ bool ecma = IsECMAScript (options);
+ Expression expr = null;
+
+ switch (pattern[ptr ++]) {
+
+ // categories
+
+ case 'd':
+ expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
+ break;
+
+ case 'w':
+ expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
+ break;
+
+ case 's':
+ expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
+ break;
+
+ case 'p':
+ // this is odd - ECMAScript isn't supposed to support Unicode,
+ // yet \p{..} compiles and runs under the MS implementation
+ // identically to canonical mode. That's why I'm ignoring the
+ // value of ecma here.
+
+ expr = new CharacterClass (ParseUnicodeCategory (), false);
+ break;
+
+ case 'D':
+ expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
+ break;
+
+ case 'W':
+ expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
+ break;
+
+ case 'S':
+ expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
+ break;
+
+ case 'P':
+ expr = new CharacterClass (ParseUnicodeCategory (), true);
+ break;
+
+ // positions
+
+ case 'A': expr = new PositionAssertion (Position.StartOfString); break;
+ case 'Z': expr = new PositionAssertion (Position.End); break;
+ case 'z': expr = new PositionAssertion (Position.EndOfString); break;
+ case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
+ case 'b': expr = new PositionAssertion (Position.Boundary); break;
+ case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
+
+ // references
+
+ case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7': case '8': case '9': {
+ ptr --;
+ int n = ParseNumber (10, 1, 0);
+ if (n < 0) {
+ ptr = p;
+ return null;
+ }
+
+ // FIXME test if number is within number of assigned groups
+ // this may present a problem for right-to-left matching
+
+ Reference reference = new Reference (IsIgnoreCase (options));
+ refs.Add (reference, n.ToString ());
+ expr = reference;
+ break;
+ }
+
+ case 'k': {
+ char delim = pattern[ptr ++];
+ if (delim == '<')
+ delim = '>';
+ else if (delim != '\'')
+ throw NewParseException ("Malformed \\k<...> named backreference.");
+
+ string name = ParseName ();
+ if (name == null || pattern[ptr] != delim)
+ throw NewParseException ("Malformed \\k<...> named backreference.");
+
+ ++ ptr;
+ Reference reference = new Reference (IsIgnoreCase (options));
+ refs.Add (reference, name);
+ expr = reference;
+ break;
+ }
+
+ default:
+ expr = null;
+ break;
+ }
+
+ if (expr == null)
+ ptr = p;
+
+ return expr;
+ }
+
+ private int ParseEscape () {
+ int p = ptr;
+ int c;
+
+ if (p >= pattern.Length)
+ throw new ArgumentException (
+ String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
+ "pattern.", pattern), pattern);
+
+ switch (pattern[ptr ++]) {
+
+ // standard escapes (except \b)
+
+ case 'a': return '\u0007';
+ case 't': return '\u0009';
+ case 'r': return '\u000d';
+ case 'v': return '\u000b';
+ case 'f': return '\u000c';
+ case 'n': return '\u000a';
+ case 'e': return '\u001b';
+ case '\\': return '\\';
+
+ // character codes
+
+ case '0':
+ //
+ // Turns out that octal values can be specified
+ // without a leading zero. But also the limit
+ // of three character should include this first
+ // one.
+ //
+ ptr--;
+ int prevptr = ptr;
+ int result = ParseOctal (pattern, ref ptr);
+ if (result == -1 && prevptr == ptr)
+ return 0;
+
+ return result;
+
+ case 'x':
+ c = ParseHex (pattern, ref ptr, 2);
+ if (c < 0)
+ throw NewParseException ("Insufficient hex digits");
+
+ return c;
+
+ case 'u':
+ c = ParseHex (pattern, ref ptr, 4);
+ if (c < 0)
+ throw NewParseException ("Insufficient hex digits");
+
+ return c;
+
+ // control characters
+
+ case 'c':
+ c = pattern[ptr ++];
+ if (c >= '@' && c <= '_')
+ return c - '@';
+ else
+ throw NewParseException ("Unrecognized control character.");
+
+ // unknown escape
+
+ default:
+ ptr = p;
+ return -1;
+ }
+ }
+
+ private string ParseName () {
+ return Parser.ParseName (pattern, ref ptr);
+ }
+
+ private static bool IsNameChar (char c) {
+ UnicodeCategory cat = Char.GetUnicodeCategory (c);
+ if (cat == UnicodeCategory.ModifierLetter)
+ return false;
+ if (cat == UnicodeCategory.ConnectorPunctuation)
+ return true;
+ return Char.IsLetterOrDigit (c);
+ }
+
+ private int ParseNumber (int b, int min, int max) {
+ return Parser.ParseNumber (pattern, ref ptr, b, min, max);
+ }
+
+ private static int ParseDigit (char c, int b, int n) {
+ switch (b) {
+ case 8:
+ if (c >= '0' && c <= '7')
+ return c - '0';
+ else
+ return -1;
+ case 10:
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ else
+ return -1;
+ case 16:
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ else if (c >= 'a' && c <= 'f')
+ return 10 + c - 'a';
+ else if (c >= 'A' && c <= 'F')
+ return 10 + c - 'A';
+ else
+ return -1;
+ default:
+ return -1;
+ }
+ }
+
+ private void ConsumeWhitespace (bool ignore) {
+ while (ptr < pattern.Length) {
+ if (pattern[ptr] == '(') {
+ if (ptr + 3 >= pattern.Length)
+ return;
+
+ if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
+ return;
+
+ ptr += 3;
+ while (ptr < pattern.Length && pattern[ptr ++] != ')')
+ /* ignore */ ;
+ }
+ else if (ignore && pattern[ptr] == '#') {
+ while (ptr < pattern.Length && pattern[ptr ++] != '\n')
+ /* ignore */ ;
+ }
+ else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
+ while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
+ ++ ptr;
+ }
+ else
+ return;
+ }
+ }
+
+ private string ParseString (string pattern) {
+ this.pattern = pattern;
+ this.ptr = 0;
+
+ StringBuilder result = new StringBuilder (pattern.Length);
+ while (ptr < pattern.Length) {
+ int c = pattern[ptr ++];
+ if (c == '\\') {
+ c = ParseEscape ();
+
+ if(c < 0) {
+ c = pattern[ptr ++];
+ if(c == 'b')
+ c = '\b';
+ }
+ }
+ result.Append ((char) c);
+ }
+
+ return result.ToString ();
+ }
+
+ private void ResolveReferences () {
+ int gid = 1;
+ Hashtable dict = new Hashtable ();
+
+ // number unnamed groups
+
+ foreach (CapturingGroup group in caps) {
+ if (group.Name == null) {
+ dict.Add (gid.ToString (), group);
+ group.Number = gid ++;
+
+ ++ num_groups;
+ }
+ }
+
+ // number named groups
+
+ foreach (CapturingGroup group in caps) {
+ if (group.Name != null) {
+ if (!dict.Contains (group.Name)) {
+ dict.Add (group.Name, group);
+ group.Number = gid ++;
+
+ ++ num_groups;
+ }
+ else {
+ CapturingGroup prev = (CapturingGroup)dict[group.Name];
+ group.Number = prev.Number;
+ }
+ }
+ }
+
+ // resolve references
+
+ foreach (Expression expr in refs.Keys) {
+ string name = (string)refs[expr];
+ if (!dict.Contains (name)) {
+ if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
+ continue;
+ throw NewParseException ("Reference to undefined group " +
+ (Char.IsDigit (name[0]) ? "number " : "name ") +
+ name);
+ }
+
+ CapturingGroup group = (CapturingGroup)dict[name];
+ if (expr is Reference)
+ ((Reference)expr).CapturingGroup = group;
+ else if (expr is CaptureAssertion)
+ ((CaptureAssertion)expr).CapturingGroup = group;
+ else if (expr is BalancingGroup)
+ ((BalancingGroup)expr).Balance = group;
+ }
+ }
+
+ // flag helper functions
+
+ private static bool IsIgnoreCase (RegexOptions options) {
+ return (options & RegexOptions.IgnoreCase) != 0;
+ }
+
+ private static bool IsMultiline (RegexOptions options) {
+ return (options & RegexOptions.Multiline) != 0;
+ }
+
+ private static bool IsExplicitCapture (RegexOptions options) {
+ return (options & RegexOptions.ExplicitCapture) != 0;
+ }
+
+ private static bool IsSingleline (RegexOptions options) {
+ return (options & RegexOptions.Singleline) != 0;
+ }
+
+ private static bool IsIgnorePatternWhitespace (RegexOptions options) {
+ return (options & RegexOptions.IgnorePatternWhitespace) != 0;
+ }
+
+ private static bool IsECMAScript (RegexOptions options) {
+ return (options & RegexOptions.ECMAScript) != 0;
+ }
+
+ // exception creation
+
+ private ArgumentException NewParseException (string msg) {
+ msg = "parsing \"" + pattern + "\" - " + msg;
+ return new ArgumentException (msg, pattern);
+ }
+
+ private string pattern;
+ private int ptr;
+
+ private ArrayList caps;
+ private Hashtable refs;
+ private int num_groups;
+ }
+}