added ecma character categories to parser.
author Dan Lewis <dan@mono-cvs.ximian.com>
Tue, 5 Feb 2002 20:48:52 +0000 (20:48 -0000)
committer Dan Lewis <dan@mono-cvs.ximian.com>
Tue, 5 Feb 2002 20:48:52 +0000 (20:48 -0000)
svn path=/trunk/mcs/; revision=2243

mcs/class/System/System.Text.RegularExpressions/notes.txt
mcs/class/System/System.Text.RegularExpressions/parser.cs

index a3753c5f4c5d8f08a25eaa77ef0a1e9deabc66f2..56b047ec76e5f35c396915ae9e0f3cf4a5a260bf 100644 (file)
@@ -12,13 +12,13 @@ TODO:
   reverse. There may be other stuff.... work through the code.
 
 * Add ECMAScript support to the parser. For example, [.\w\s\d] map to ECMA
-  categories instead of canonical ones. There's different behaviour on
+  categories instead of canonical ones [DONE]. There's different behaviour on
   backreference/octal disambiguation. Find out what the runtime behavioural
   difference is for cyclic backreferences eg (?(1)abc\1) - this is only briefly 
   mentioned in the spec. I couldn't find much on this in the ECMAScript
   specification either.
 
-* Check the octal disambiguation for canonical syntax works as specced.
+* Octal/backreference parsing needs a big fix. The rules are ridiculously complex.
 
 * Add a check in QuickSearch for single character substrings. This is likely to
   be a common case. There's no need to go through a shift table. Also, have a
index 8bece9206581f2f1b3aeff25129bd39032e0f741..3327cf3024ab7498936c49502efd920dae9c6bcf 100644 (file)
@@ -584,7 +584,7 @@ namespace System.Text.RegularExpressions.Syntax {
                }\r
 \r
                private Expression ParseCharacterClass (RegexOptions options) {\r
-                       bool negate;\r
+                       bool negate, ecma;\r
                        if (pattern[ptr] == '^') {\r
                                negate = true;\r
                                ++ ptr;\r
@@ -592,6 +592,7 @@ namespace System.Text.RegularExpressions.Syntax {
                        else\r
                                negate = false;\r
                        \r
+                       ecma = IsECMAScript (options);\r
                        CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));\r
 \r
                        if (pattern[ptr] == ']') {\r
@@ -625,14 +626,45 @@ namespace System.Text.RegularExpressions.Syntax {
                                                switch (c) {\r
                                                case 'b': c = '\b'; break;\r
 \r
-                                               case 'd': cls.AddCategory (Category.Digit, false); last = -1; continue;\r
-                                               case 'w': cls.AddCategory (Category.Word, false); last = -1; continue;\r
-                                               case 's': cls.AddCategory (Category.WhiteSpace, false); last = -1; continue;\r
-                                               case 'p': cls.AddCategory (ParseUnicodeCategory (), true); last = -1; continue;\r
-                                               case 'D': cls.AddCategory (Category.Digit, true); last = -1; continue;\r
-                                               case 'W': cls.AddCategory (Category.Word, true); last = -1; continue;\r
-                                               case 'S': cls.AddCategory (Category.WhiteSpace, true); last = -1; continue;\r
-                                               case 'P': cls.AddCategory (ParseUnicodeCategory (), true); last = -1; continue;\r
+                                               case 'd':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'w':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 's':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'p':\r
+                                                       cls.AddCategory (ParseUnicodeCategory (), false);       // ignore ecma\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'D':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'W':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'S':\r
+                                                       cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);\r
+                                                       last = -1;\r
+                                                       continue;\r
+                                                       \r
+                                               case 'P':\r
+                                                       cls.AddCategory (ParseUnicodeCategory (), true);\r
+                                                       last = -1;\r
+                                                       continue;\r
 \r
                                                default: break;         // add escaped character\r
                                                }\r
@@ -730,20 +762,49 @@ namespace System.Text.RegularExpressions.Syntax {
 \r
                private Expression ParseSpecial (RegexOptions options) {\r
                        int p = ptr;\r
+                       bool ecma = IsECMAScript (options);\r
                        Expression expr = null;\r
                        \r
                        switch (pattern[ptr ++]) {\r
 \r
                        // categories\r
 \r
-                       case 'd': expr = new CharacterClass (Category.Digit, false); break;\r
-                       case 'w': expr = new CharacterClass (Category.Word, false); break;\r
-                       case 's': expr = new CharacterClass (Category.WhiteSpace, false); break;\r
-                       case 'D': expr = new CharacterClass (Category.Digit, true); break;\r
-                       case 'W': expr = new CharacterClass (Category.Word, true); break;\r
-                       case 'S': expr = new CharacterClass (Category.WhiteSpace, true); break;\r
-                       case 'p': expr = new CharacterClass (ParseUnicodeCategory (), true); break;\r
-                       case 'P': expr = new CharacterClass (ParseUnicodeCategory (), false); break;\r
+                       case 'd':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);\r
+                               break;\r
+                               \r
+                       case 'w':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);\r
+                               break;\r
+                               \r
+                       case 's':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);\r
+                               break;\r
+                               \r
+                       case 'p':\r
+                               // this is odd - ECMAScript isn't supposed to support Unicode,\r
+                               // yet \p{..} compiles and runs under the MS implementation\r
+                               // identically to canonical mode. That's why I'm ignoring the\r
+                               // value of ecma here.\r
+                       \r
+                               expr = new CharacterClass (ParseUnicodeCategory (), false);\r
+                               break;\r
+                               \r
+                       case 'D':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);\r
+                               break;\r
+                               \r
+                       case 'W':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);\r
+                               break;\r
+                               \r
+                       case 'S':\r
+                               expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);\r
+                               break;\r
+                               \r
+                       case 'P':\r
+                               expr = new CharacterClass (ParseUnicodeCategory (), true);\r
+                               break;\r
 \r
                        // positions\r
 \r
@@ -1027,6 +1088,10 @@ namespace System.Text.RegularExpressions.Syntax {
                        return (options & RegexOptions.RightToLeft) != 0;\r
                }\r
 \r
+               private static bool IsECMAScript (RegexOptions options) {\r
+                       return (options & RegexOptions.ECMAScript) != 0;\r
+               }\r
+\r
                // exception creation\r
 \r
                private ArgumentException NewParseException (string msg) {\r