Bug 10670 fix.

[mono.git] / mcs / class / System / System.Text.RegularExpressions / parser.cs
diff --git a/mcs/class/System/System.Text.RegularExpressions/parser.cs b/mcs/class/System/System.Text.RegularExpressions/parser.cs

index 694babda2afcd6a07b40b0afcf95cc812aa209c5..f3ddf196730bb8879cbdc0dd8fc5a82829c91486 100644 (file)
--- a/mcs/class/System/System.Text.RegularExpressions/parser.cs
+++ b/mcs/class/System/System.Text.RegularExpressions/parser.cs
@@ -6,6 +6,27 @@
  // author:     Dan Lewis (dlewis@gmx.co.uk)
  //             (c) 2002
  
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
  using System;
  using System.Collections;
  using System.Globalization;
@@ -94,6 +115,8 @@ namespace System.Text.RegularExpressions.Syntax {
                 }
  
                 public static string Unescape (string str) {
+                       if (str.IndexOf ('\\') == -1)
+                               return str;
                         return new Parser ().ParseString (str);
                 }
  
@@ -126,24 +149,25 @@ namespace System.Text.RegularExpressions.Syntax {
                         }
                 }
  
-               public IDictionary GetMapping () {
+               public IDictionary GetMapping ()
+               {
                         Hashtable mapping = new Hashtable ();
-                       Hashtable numbers = new Hashtable ();
                         int end = caps.Count;
                         mapping.Add ("0", 0);
                         for (int i = 0; i < end; i++) {
                                 CapturingGroup group = (CapturingGroup) caps [i];
-                               if (group.Name != null && !mapping.Contains (group.Name)) {
+                               if (group.Name != null) {
+                                       if (mapping.Contains (group.Name)) {
+                                               if ((int) mapping [group.Name] != group.Number)
+                                                       throw new SystemException ("invalid state");
+                                               continue;
+                                       }
                                         mapping.Add (group.Name, group.Number);
-                                       numbers.Add (group.Number, group.Number);
+                               } else {
+                                       mapping.Add (group.Number.ToString (), group.Number);
                                 }
                         }
  
-                       for (int i = 1; i < end; i++) {
-                               if (numbers [i] == null)
-                                       mapping.Add (i.ToString (), i);
-                       }
-
                         return mapping;
                 }
  
@@ -266,20 +290,29 @@ namespace System.Text.RegularExpressions.Syntax {
                                 
                                 if (ptr < pattern.Length) {
                                         char k = pattern[ptr];
+                                       int min = 0, max = 0;
+                                       bool lazy = false;
+                                       bool haveRep = false;
  
-                                       if (k == '?' || k == '*' || k == '+' || k == '{') {
-                                               ++ ptr;
  
-                                               int min = 0, max = 0;
-                                               bool lazy = false;
+                                       if (k == '?' || k == '*' || k == '+') {
+                                               ++ ptr;
+                                               haveRep = true;
  
                                                 switch (k) {
                                                 case '?': min = 0; max = 1; break;
-                                               case '*': min = 0; max = 0xffff; break;
-                                               case '+': min = 1; max = 0xffff; break;
-                                               case '{': ParseRepetitionBounds (out min, out max, options); break;
+                                               case '*': min = 0; max = 0x7fffffff; break;
+                                               case '+': min = 1; max = 0x7fffffff; break;
                                                 }
+                                       } else if (k == '{' && ptr + 1 < pattern.Length) {
+                                               int saved_ptr = ptr;
+                                               ++ptr;
+                                               haveRep = ParseRepetitionBounds (out min, out max, options);
+                                               if (!haveRep)
+                                                       ptr = saved_ptr;
+                                       }
  
+                                       if (haveRep) {
                                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
                                                 if (ptr < pattern.Length && pattern[ptr] == '?') {
                                                         ++ ptr;
@@ -496,7 +529,7 @@ namespace System.Text.RegularExpressions.Syntax {
                                 }
                                 else {                                          // capture test
                                         ++ ptr;
-                                       asn = new CaptureAssertion ();
+                                       asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
                                         refs.Add (asn, name);
                                 }
  
@@ -601,15 +634,13 @@ namespace System.Text.RegularExpressions.Syntax {
                 }
  
                 private Expression ParseCharacterClass (RegexOptions options) {
-                       bool negate, ecma;
+                       bool negate = false;
                         if (pattern[ptr] == '^') {
                                 negate = true;
                                 ++ ptr;
                         }
-                       else
-                               negate = false;
                         
-                       ecma = IsECMAScript (options);
+                       bool ecma = IsECMAScript (options);
                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
  
                         if (pattern[ptr] == ']') {
@@ -628,84 +659,65 @@ namespace System.Text.RegularExpressions.Syntax {
                                         closed = true;
                                         break;
                                 }
-                               
-                               if (c == '-') {
+
+                               if (c == '-' && last >= 0 && !range) {
                                         range = true;
                                         continue;
                                 }
  
                                 if (c == '\\') {
                                         c = ParseEscape ();
-                                       if (c < 0) {
-                                               // didn't recognize escape
-
-                                               c = pattern[ptr ++];
-                                               switch (c) {
-                                               case 'b': c = '\b'; break;
-
-                                               case 'd':
-                                                       cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
-                                                       last = -1;
-                                                       continue;
-                                                       
-                                               case 'w':
-                                                       cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
-                                                       last = -1;
-                                                       continue;
-                                                       
-                                               case 's':
-                                                       cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
-                                                       last = -1;
-                                                       continue;
-                                                       
-                                               case 'p':
-                                                       cls.AddCategory (ParseUnicodeCategory (), false);       // ignore ecma
-                                                       last = -1;
-                                                       continue;
-                                                       
-                                               case 'D':
-                                                       cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
-                                                       last = -1;
-                                                       continue;
-                                                       
-                                               case 'W':
-                                                       cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
-                                                       last = -1;
-                                                       continue;
-                                                       
-                                               case 'S':
-                                                       cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
-                                                       last = -1;
-                                                       continue;
-                                                       
-                                               case 'P':
-                                                       cls.AddCategory (ParseUnicodeCategory (), true);
-                                                       last = -1;
-                                                       continue;
-
-                                               default: break;         // add escaped character
-                                               }
+                                       if (c >= 0)
+                                               goto char_recognized;
+
+                                       // didn't recognize escape
+                                       c = pattern [ptr ++];
+                                       switch (c) {
+                                       case 'b':
+                                               c = '\b';
+                                               goto char_recognized;
+
+                                       case 'd': case 'D':
+                                               cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
+                                               break;
+                                               
+                                       case 'w': case 'W':
+                                               cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
+                                               break;
+                                               
+                                       case 's': case 'S':
+                                               cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
+                                               break;
+                                               
+                                       case 'p': case 'P':
+                                               cls.AddCategory (ParseUnicodeCategory (), c == 'P');    // ignore ecma
+                                               break;
+
+                                       default:                // add escaped character
+                                               goto char_recognized;
                                         }
+
+                                       // a category escape cannot be a range endpoint, e.g. [a-\s]
+                                       if (range)
+                                               throw NewParseException ("character range cannot have category \\" + c);
+
+                                       last = -1;
+                                       continue;
                                 }
  
+                       char_recognized:
                                 if (range) {
+                                       // if 'range' is true, we know that 'last >= 0'
                                         if (c < last)
-                                               throw NewParseException ("[x-y] range in reverse order.");
-
-                                       if (last >=0 )
-                                               cls.AddRange ((char)last, (char)c);
-                                       else {
-                                               cls.AddCharacter ((char)c);
-                                               cls.AddCharacter ('-');
-                                       }
-
-                                       range = false;
+                                               throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
+                                       cls.AddRange ((char)last, (char)c);
                                         last = -1;
+                                       range = false;
+                                       continue;
                                 }
-                               else {
-                                       cls.AddCharacter ((char)c);
-                                       last = c;
-                               }
+
+                               cls.AddCharacter ((char)c);
+                               last = c;
                         }
  
                         if (!closed)
@@ -717,8 +729,9 @@ namespace System.Text.RegularExpressions.Syntax {
                         return cls;
                 }
  
-               private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
+               private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
                         int n, m;
+                       min = max = 0;
  
                         /* check syntax */
  
@@ -740,16 +753,16 @@ namespace System.Text.RegularExpressions.Syntax {
                                 m = ParseNumber (10, 1, 0);
                                 ConsumeWhitespace (IsIgnorePatternWhitespace (options));
                                 if (pattern[ptr ++] != '}')
-                                       throw NewParseException ("Illegal {x,y} - bad value of y.");
+                                       return false;
                                 break;
                         default:
-                               throw NewParseException ("Illegal {x,y}");
+                               return false;
                         }
  
                         /* check bounds and ordering */
  
-                       if (n >= 0xffff || m >= 0xffff)
-                               throw NewParseException ("Illegal {x, y} - maximum of 65535.");
+                       if (n > 0x7fffffff || m > 0x7fffffff)
+                               throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
                         if (m >= 0 && m < n)
                                 throw NewParseException ("Illegal {x, y} with x > y.");
  
@@ -759,7 +772,9 @@ namespace System.Text.RegularExpressions.Syntax {
                         if (m > 0)
                                 max = m;
                         else
-                               max = 0xffff;
+                               max = 0x7fffffff;
+
+                       return true;
                 }
  
                 private Category ParseUnicodeCategory () {
@@ -909,6 +924,13 @@ namespace System.Text.RegularExpressions.Syntax {
                         // character codes
  
                         case '0':
+                               //
+                               // Turns out that octal values can be specified
+                               // without a leading zero.  Also, the limit of
+                               // three characters should include this first
+                               // one.
+                               //
+                               ptr--;
                                 int prevptr = ptr;
                                 int result = ParseOctal (pattern, ref ptr);
                                 if (result == -1 && prevptr == ptr)
@@ -933,10 +955,8 @@ namespace System.Text.RegularExpressions.Syntax {
                         // control characters
  
                         case 'c':
-                               c = pattern[p ++];
-                               if (c >= 'A' && c <= 'Z')
-                                       return c - 'A';
-                               else if (c >= '@' && c <= '_')
+                               c = pattern[ptr ++];
+                               if (c >= '@' && c <= '_')
                                         return c - '@';
                                 else
                                         throw NewParseException ("Unrecognized control character.");
@@ -966,10 +986,6 @@ namespace System.Text.RegularExpressions.Syntax {
                         return Parser.ParseNumber (pattern, ref ptr, b, min, max);
                 }
  
-               private int ParseDecimal () {
-                       return Parser.ParseDecimal (pattern, ref ptr);
-               }
-
                 private static int ParseDigit (char c, int b, int n) {
                         switch (b) {
                         case 8:
@@ -997,10 +1013,7 @@ namespace System.Text.RegularExpressions.Syntax {
                 }
  
                 private void ConsumeWhitespace (bool ignore) {
-                       while (true) {
-                               if (ptr >= pattern.Length)
-                                       break;
-                       
+                       while (ptr < pattern.Length) {
                                 if (pattern[ptr] == '(') {
                                         if (ptr + 3 >= pattern.Length)
                                                 return;
@@ -1009,7 +1022,7 @@ namespace System.Text.RegularExpressions.Syntax {
                                                 return;
  
                                         ptr += 3;
-                                       while (pattern[ptr ++] != ')')
+                                       while (ptr < pattern.Length && pattern[ptr ++] != ')')
                                                 /* ignore */ ;
                                 }
                                 else if (ignore && pattern[ptr] == '#') {
@@ -1029,16 +1042,22 @@ namespace System.Text.RegularExpressions.Syntax {
                         this.pattern = pattern;
                         this.ptr = 0;
  
-                       string result = "";
+                       StringBuilder result = new StringBuilder (pattern.Length);
                         while (ptr < pattern.Length) {
-                               int c = pattern[ptr];
-                               if (c == '\\')
+                               int c = pattern[ptr ++];
+                               if (c == '\\') {
                                         c = ParseEscape ();
-                               ptr ++; 
-                               result += (char)c;
+
+                                       if(c < 0) {
+                                               c = pattern[ptr ++];
+                                               if(c == 'b')
+                                                       c = '\b';
+                                       }
+                               }
+                               result.Append ((char) c);
                         }
  
-                       return result;
+                       return result.ToString ();
                 }
  
                 private void ResolveReferences () {
@@ -1078,6 +1097,8 @@ namespace System.Text.RegularExpressions.Syntax {
                         foreach (Expression expr in refs.Keys) {
                                 string name = (string)refs[expr];
                                 if (!dict.Contains (name)) {
+                                       if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
+                                               continue;
                                         throw NewParseException ("Reference to undefined group " +
                                                 (Char.IsDigit (name[0]) ? "number " : "name ") +
                                                 name);
@@ -1115,10 +1136,6 @@ namespace System.Text.RegularExpressions.Syntax {
                         return (options & RegexOptions.IgnorePatternWhitespace) != 0;
                 }
  
-               private static bool IsRightToLeft (RegexOptions options) {
-                       return (options & RegexOptions.RightToLeft) != 0;
-               }
-
                 private static bool IsECMAScript (RegexOptions options) {
                         return (options & RegexOptions.ECMAScript) != 0;
                 }