Added ECMA character categories to parser.

author Dan Lewis <dan@mono-cvs.ximian.com>

Tue, 5 Feb 2002 20:48:52 +0000 (20:48 -0000)

committer Dan Lewis <dan@mono-cvs.ximian.com>

Tue, 5 Feb 2002 20:48:52 +0000 (20:48 -0000)
author Dan Lewis <dan@mono-cvs.ximian.com>
Tue, 5 Feb 2002 20:48:52 +0000 (20:48 -0000)
committer Dan Lewis <dan@mono-cvs.ximian.com>
Tue, 5 Feb 2002 20:48:52 +0000 (20:48 -0000)
diff --git a/mcs/class/System/System.Text.RegularExpressions/notes.txt b/mcs/class/System/System.Text.RegularExpressions/notes.txt

index a3753c5f4c5d8f08a25eaa77ef0a1e9deabc66f2..56b047ec76e5f35c396915ae9e0f3cf4a5a260bf 100644 (file)
--- a/mcs/class/System/System.Text.RegularExpressions/notes.txt
+++ b/mcs/class/System/System.Text.RegularExpressions/notes.txt
@@ -12,13 +12,13 @@ TODO:
    reverse. There may be other stuff.... work through the code.
  
  * Add ECMAScript support to the parser. For example, [.\w\s\d] map to ECMA
-  categories instead of canonical ones. There's different behaviour on
+  categories instead of canonical ones [DONE]. There's different behaviour on
    backreference/octal disambiguation. Find out what the runtime behavioural
    difference is for cyclic backreferences eg (?(1)abc\1) - this is only briefly 
    mentioned in the spec. I couldn't find much on this in the ECMAScript
    specification either.
  
-* Check the octal disambiguation for canonical syntax works as specced.
+* Octal/backreference parsing needs a big fix. The rules are ridiculously complex.
  
  * Add a check in QuickSearch for single character substrings. This is likely to
    be a common case. There's no need to go through a shift table. Also, have a
diff --git a/mcs/class/System/System.Text.RegularExpressions/parser.cs b/mcs/class/System/System.Text.RegularExpressions/parser.cs

index 8bece9206581f2f1b3aeff25129bd39032e0f741..3327cf3024ab7498936c49502efd920dae9c6bcf 100644 (file)
--- a/mcs/class/System/System.Text.RegularExpressions/parser.cs
+++ b/mcs/class/System/System.Text.RegularExpressions/parser.cs
@@ -584,7 +584,7 @@ namespace System.Text.RegularExpressions.Syntax {
                 }\r
  \r
                 private Expression ParseCharacterClass (RegexOptions options) {\r
-                       bool negate;\r
+                       bool negate, ecma;\r
                         if (pattern[ptr] == '^') {\r
                                 negate = true;\r
                                 ++ ptr;\r
@@ -592,6 +592,7 @@ namespace System.Text.RegularExpressions.Syntax {
                         else\r
                                 negate = false;\r
                         \r
+                       ecma = IsECMAScript (options);\r
                         CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));\r
  \r
                         if (pattern[ptr] == ']') {\r
@@ -625,14 +626,45 @@ namespace System.Text.RegularExpressions.Syntax {
                                                 switch (c) {\r
                                                 case 'b': c = '\b'; break;\r
  \r
-                                               case 'd': cls.AddCategory (Category.Digit, false); last = -1; continue;\r
-                                               case 'w': cls.AddCategory (Category.Word, false); last = -1; continue;\r
-                                               case 's': cls.AddCategory (Category.WhiteSpace, false); last = -1; continue;\r
-                                               case 'p': cls.AddCategory (ParseUnicodeCategory (), true); last = -1; continue;\r
-                                               case 'D': cls.AddCategory (Category.Digit, true); last = -1; continue;\r
-                                               case 'W': cls.AddCategory (Category.Word, true); last = -1; continue;\r
-                                               case 'S': cls.AddCategory (Category.WhiteSpace, true); last = -1; continue;\r
-                                               case 'P': cls.AddCategory (ParseUnicodeCategory (), true); last = -1; continue;\r
+                                               case 'd':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'w':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 's':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'p':\r
+                                                       cls.AddCategory (ParseUnicodeCategory (), false);       // ignore ecma\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'D':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'W':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'S':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'P':\r
+                                                       cls.AddCategory (ParseUnicodeCategory (), true);\r
+                                                       last = -1;\r
+                                                       continue;\r
  \r
                                                 default: break;         // add escaped character\r
                                                 }\r
@@ -730,20 +762,49 @@ namespace System.Text.RegularExpressions.Syntax {
  \r
                 private Expression ParseSpecial (RegexOptions options) {\r
                         int p = ptr;\r
+                       bool ecma = IsECMAScript (options);\r
                         Expression expr = null;\r
                         \r
                         switch (pattern[ptr ++]) {\r
  \r
                         // categories\r
  \r
-                       case 'd': expr = new CharacterClass (Category.Digit, false); break;\r
-                       case 'w': expr = new CharacterClass (Category.Word, false); break;\r
-                       case 's': expr = new CharacterClass (Category.WhiteSpace, false); break;\r
-                       case 'D': expr = new CharacterClass (Category.Digit, true); break;\r
-                       case 'W': expr = new CharacterClass (Category.Word, true); break;\r
-                       case 'S': expr = new CharacterClass (Category.WhiteSpace, true); break;\r
-                       case 'p': expr = new CharacterClass (ParseUnicodeCategory (), true); break;\r
-                       case 'P': expr = new CharacterClass (ParseUnicodeCategory (), false); break;\r
+                       case 'd':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);\r
+                               break;\r
+                               \r
+                       case 'w':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);\r
+                               break;\r
+                               \r
+                       case 's':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);\r
+                               break;\r
+                               \r
+                       case 'p':\r
+                               // this is odd - ECMAScript isn't supposed to support Unicode,\r
+                               // yet \p{..} compiles and runs under the MS implementation\r
+                               // identically to canonical mode. That's why I'm ignoring the\r
+                               // value of ecma here.\r
+                       \r
+                               expr = new CharacterClass (ParseUnicodeCategory (), false);\r
+                               break;\r
+                               \r
+                       case 'D':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);\r
+                               break;\r
+                               \r
+                       case 'W':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);\r
+                               break;\r
+                               \r
+                       case 'S':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);\r
+                               break;\r
+                               \r
+                       case 'P':\r
+                               expr = new CharacterClass (ParseUnicodeCategory (), true);\r
+                               break;\r
  \r
                         // positions\r
  \r
@@ -1027,6 +1088,10 @@ namespace System.Text.RegularExpressions.Syntax {
                         return (options & RegexOptions.RightToLeft) != 0;\r
                 }\r
  \r
+               private static bool IsECMAScript (RegexOptions options) {\r
+                       return (options & RegexOptions.ECMAScript) != 0;\r
+               }\r
+\r
                 // exception creation\r
  \r
                 private ArgumentException NewParseException (string msg) {\r
author	Dan Lewis <dan@mono-cvs.ximian.com>
	Tue, 5 Feb 2002 20:48:52 +0000 (20:48 -0000)
committer	Dan Lewis <dan@mono-cvs.ximian.com>
	Tue, 5 Feb 2002 20:48:52 +0000 (20:48 -0000)
mcs/class/System/System.Text.RegularExpressions/notes.txt		patch | blob | history
mcs/class/System/System.Text.RegularExpressions/parser.cs		patch | blob | history