// author: Dan Lewis (dlewis@gmx.co.uk)
// (c) 2002
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
using System;
using System.Collections;
using System.Globalization;
}
// Returns 'str' with its backslash escapes resolved.
// A string that contains no backslash is handed back unchanged (the same
// instance); anything else is passed to ParseString on a fresh Parser.
// NOTE(review): ParseString is presumably what decodes the escapes -- its
// full body is not visible from this hunk, confirm there.
// NOTE(review): 'str' is dereferenced without a null check, so a null
// argument fails on the IndexOf call -- confirm callers validate first.
public static string Unescape (string str) {
// fast path: without a backslash there is nothing to unescape
+ if (str.IndexOf ('\\') == -1)
+ return str;
return new Parser ().ParseString (str);
}
}
}
// Builds the table that maps group identifiers (string keys) to group
// numbers. The description below follows the '+' side of this hunk.
//
// Key "0" always maps to 0. Each entry of 'caps' then contributes one key:
// its Name when it has one, otherwise its Number rendered as a string.
// A name that is already present is accepted only when it maps to the same
// number; a mismatch raises SystemException ("invalid state").
// Returns the populated Hashtable.
//
// NOTE(review): Hashtable.Add throws ArgumentException on a duplicate key,
// so the unnamed-group branch relies on numeric keys being unique and not
// colliding with an explicitly given name -- confirm against how 'caps'
// is filled.
- public IDictionary GetMapping () {
+ public IDictionary GetMapping ()
+ {
Hashtable mapping = new Hashtable ();
- Hashtable numbers = new Hashtable ();
int end = caps.Count;
// entry for group number 0 is registered unconditionally
mapping.Add ("0", 0);
for (int i = 0; i < end; i++) {
CapturingGroup group = (CapturingGroup) caps [i];
- if (group.Name != null && !mapping.Contains (group.Name)) {
+ if (group.Name != null) {
// repeated name: tolerated only when it refers to the same group number
+ if (mapping.Contains (group.Name)) {
+ if ((int) mapping [group.Name] != group.Number)
+ throw new SystemException ("invalid state");
+ continue;
+ }
mapping.Add (group.Name, group.Number);
- numbers.Add (group.Number, group.Number);
// unnamed group: keyed by the decimal text of its number
+ } else {
+ mapping.Add (group.Number.ToString (), group.Number);
}
}
- for (int i = 1; i < end; i++) {
- if (numbers [i] == null)
- mapping.Add (i.ToString (), i);
- }
-
return mapping;
}
if (ptr < pattern.Length) {
char k = pattern[ptr];
+ int min = 0, max = 0;
+ bool lazy = false;
+ bool haveRep = false;
- if (k == '?' || k == '*' || k == '+' || k == '{') {
- ++ ptr;
- int min = 0, max = 0;
- bool lazy = false;
+ if (k == '?' || k == '*' || k == '+') {
+ ++ ptr;
+ haveRep = true;
switch (k) {
case '?': min = 0; max = 1; break;
- case '*': min = 0; max = 0xffff; break;
- case '+': min = 1; max = 0xffff; break;
- case '{': ParseRepetitionBounds (out min, out max, options); break;
+ case '*': min = 0; max = 0x7fffffff; break;
+ case '+': min = 1; max = 0x7fffffff; break;
}
+ } else if (k == '{' && ptr + 1 < pattern.Length) {
+ int saved_ptr = ptr;
+ ++ptr;
+ haveRep = ParseRepetitionBounds (out min, out max, options);
+ if (!haveRep)
+ ptr = saved_ptr;
+ }
+ if (haveRep) {
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
if (ptr < pattern.Length && pattern[ptr] == '?') {
++ ptr;
}
else { // capture test
++ ptr;
- asn = new CaptureAssertion ();
+ asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
refs.Add (asn, name);
}
}
private Expression ParseCharacterClass (RegexOptions options) {
- bool negate, ecma;
+ bool negate = false;
if (pattern[ptr] == '^') {
negate = true;
++ ptr;
}
- else
- negate = false;
- ecma = IsECMAScript (options);
+ bool ecma = IsECMAScript (options);
CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
if (pattern[ptr] == ']') {
closed = true;
break;
}
-
- if (c == '-') {
+
+ if (c == '-' && last >= 0 && !range) {
range = true;
continue;
}
if (c == '\\') {
c = ParseEscape ();
- if (c < 0) {
- // didn't recognize escape
-
- c = pattern[ptr ++];
- switch (c) {
- case 'b': c = '\b'; break;
-
- case 'd':
- cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
- last = -1;
- continue;
-
- case 'w':
- cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
- last = -1;
- continue;
-
- case 's':
- cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
- last = -1;
- continue;
-
- case 'p':
- cls.AddCategory (ParseUnicodeCategory (), false); // ignore ecma
- last = -1;
- continue;
-
- case 'D':
- cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
- last = -1;
- continue;
-
- case 'W':
- cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
- last = -1;
- continue;
-
- case 'S':
- cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
- last = -1;
- continue;
-
- case 'P':
- cls.AddCategory (ParseUnicodeCategory (), true);
- last = -1;
- continue;
-
- default: break; // add escaped character
- }
+ if (c >= 0)
+ goto char_recognized;
+
+ // didn't recognize escape
+ c = pattern [ptr ++];
+ switch (c) {
+ case 'b':
+ c = '\b';
+ goto char_recognized;
+
+ case 'd': case 'D':
+ cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
+ break;
+
+ case 'w': case 'W':
+ cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
+ break;
+
+ case 's': case 'S':
+ cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
+ break;
+
+ case 'p': case 'P':
+ cls.AddCategory (ParseUnicodeCategory (), c == 'P'); // ignore ecma
+ break;
+
+ default: // add escaped character
+ goto char_recognized;
}
+
+ // if the pattern looks like [a-\s] ...
+ if (range)
+ throw NewParseException ("character range cannot have category \\" + c);
+
+ last = -1;
+ continue;
}
+ char_recognized:
if (range) {
+ // if 'range' is true, we know that 'last >= 0'
if (c < last)
- throw NewParseException ("[x-y] range in reverse order.");
-
- if (last >=0 )
- cls.AddRange ((char)last, (char)c);
- else {
- cls.AddCharacter ((char)c);
- cls.AddCharacter ('-');
- }
-
- range = false;
+ throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
+ cls.AddRange ((char)last, (char)c);
last = -1;
+ range = false;
+ continue;
}
- else {
- cls.AddCharacter ((char)c);
- last = c;
- }
+
+ cls.AddCharacter ((char)c);
+ last = c;
}
if (!closed)
return cls;
}
- private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
+ private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
int n, m;
+ min = max = 0;
/* check syntax */
m = ParseNumber (10, 1, 0);
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
if (pattern[ptr ++] != '}')
- throw NewParseException ("Illegal {x,y} - bad value of y.");
+ return false;
break;
default:
- throw NewParseException ("Illegal {x,y}");
+ return false;
}
/* check bounds and ordering */
- if (n >= 0xffff || m >= 0xffff)
- throw NewParseException ("Illegal {x, y} - maximum of 65535.");
+ if (n > 0x7fffffff || m > 0x7fffffff)
+ throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
if (m >= 0 && m < n)
throw NewParseException ("Illegal {x, y} with x > y.");
if (m > 0)
max = m;
else
- max = 0xffff;
+ max = 0x7fffffff;
+
+ return true;
}
private Category ParseUnicodeCategory () {
// character codes
case '0':
+ //
+ // Turns out that octal values can be specified
+ // without a leading zero. The limit of three
+ // characters must also count this first one,
+ // so step back onto it before parsing.
+ //
+ ptr--;
int prevptr = ptr;
int result = ParseOctal (pattern, ref ptr);
if (result == -1 && prevptr == ptr)
// control characters
case 'c':
- c = pattern[p ++];
- if (c >= 'A' && c <= 'Z')
- return c - 'A';
- else if (c >= '@' && c <= '_')
+ c = pattern[ptr ++];
+ if (c >= '@' && c <= '_')
return c - '@';
else
throw NewParseException ("Unrecognized control character.");
return Parser.ParseNumber (pattern, ref ptr, b, min, max);
}
- private int ParseDecimal () {
- return Parser.ParseDecimal (pattern, ref ptr);
- }
-
private static int ParseDigit (char c, int b, int n) {
switch (b) {
case 8:
}
private void ConsumeWhitespace (bool ignore) {
- while (true) {
- if (ptr >= pattern.Length)
- break;
-
+ while (ptr < pattern.Length) {
if (pattern[ptr] == '(') {
if (ptr + 3 >= pattern.Length)
return;
return;
ptr += 3;
- while (pattern[ptr ++] != ')')
+ while (ptr < pattern.Length && pattern[ptr ++] != ')')
/* ignore */ ;
}
else if (ignore && pattern[ptr] == '#') {
this.pattern = pattern;
this.ptr = 0;
- string result = "";
+ StringBuilder result = new StringBuilder (pattern.Length);
while (ptr < pattern.Length) {
- int c = pattern[ptr];
- if (c == '\\')
+ int c = pattern[ptr ++];
+ if (c == '\\') {
c = ParseEscape ();
- ptr ++;
- result += (char)c;
+
+ if(c < 0) {
+ c = pattern[ptr ++];
+ if(c == 'b')
+ c = '\b';
+ }
+ }
+ result.Append ((char) c);
}
- return result;
+ return result.ToString ();
}
private void ResolveReferences () {
foreach (Expression expr in refs.Keys) {
string name = (string)refs[expr];
if (!dict.Contains (name)) {
+ if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
+ continue;
throw NewParseException ("Reference to undefined group " +
(Char.IsDigit (name[0]) ? "number " : "name ") +
name);
return (options & RegexOptions.IgnorePatternWhitespace) != 0;
}
- private static bool IsRightToLeft (RegexOptions options) {
- return (options & RegexOptions.RightToLeft) != 0;
- }
-
// Tells whether the ECMAScript flag is present in the given option mask.
private static bool IsECMAScript (RegexOptions options) {
	RegexOptions flag = options & RegexOptions.ECMAScript;
	return flag != RegexOptions.None;
}