Merge pull request #1225 from strawd/bug22307
[mono.git] / mcs / class / Commons.Xml.Relaxng / Commons.Xml.Relaxng.Rnc / RncTokenizer.cs
old mode 100755 (executable)
new mode 100644 (file)
index 8103a21..53048bc
@@ -7,27 +7,27 @@
 // (C)2003 Atsushi Enomoto\r
 // (C)2004 Novell Inc.\r
 //\r
-
-//
-// Permission is hereby granted, free of charge, to any person obtaining
-// a copy of this software and associated documentation files (the
-// "Software"), to deal in the Software without restriction, including
-// without limitation the rights to use, copy, modify, merge, publish,
-// distribute, sublicense, and/or sell copies of the Software, and to
-// permit persons to whom the Software is furnished to do so, subject to
-// the following conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-//
+\r
+//\r
+// Permission is hereby granted, free of charge, to any person obtaining\r
+// a copy of this software and associated documentation files (the\r
+// "Software"), to deal in the Software without restriction, including\r
+// without limitation the rights to use, copy, modify, merge, publish,\r
+// distribute, sublicense, and/or sell copies of the Software, and to\r
+// permit persons to whom the Software is furnished to do so, subject to\r
+// the following conditions:\r
+// \r
+// The above copyright notice and this permission notice shall be\r
+// included in all copies or substantial portions of the Software.\r
+// \r
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE\r
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION\r
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION\r
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\r
+//\r
 \r
 using System;\r
 using System.Collections;\r
@@ -44,17 +44,21 @@ namespace Commons.Xml.Relaxng.Rnc
                int currentToken;\r
                object tokenValue;\r
                int peekChar;\r
+               string peekString;\r
                bool isElement;\r
                bool isLiteralNsUri;\r
 \r
                int line = 1;\r
                int column;\r
+               int savedLineNumber = 1;\r
+               int savedLinePosition;\r
                bool nextIncrementLine;\r
-               string prefixName;\r
+               string baseUri;\r
 \r
-               public RncTokenizer (TextReader source)\r
+               public RncTokenizer (TextReader source, string baseUri)\r
                {\r
                        this.source = source;\r
+                       this.baseUri = baseUri;\r
                }\r
 \r
                public bool IsElement {\r
@@ -62,11 +66,15 @@ namespace Commons.Xml.Relaxng.Rnc
                }\r
 \r
                public int Line {\r
-                       get { return line; }\r
+                       get { return savedLineNumber; }\r
                }\r
 \r
                public int Column {\r
-                       get { return column; }\r
+                       get { return savedLinePosition; }\r
+               }\r
+\r
+               public string BaseUri {\r
+                       get { return baseUri; }\r
                }\r
 \r
                // jay interface implementation\r
@@ -78,10 +86,10 @@ namespace Commons.Xml.Relaxng.Rnc
 \r
                public bool advance ()\r
                {\r
-                       if (prefixName != null)\r
-                               throw new RelaxngException ("Invalid prefix was found.");\r
                        tokenValue = null;\r
-                       currentToken = ParseToken ();\r
+                       currentToken = ParseToken (false);\r
+                       savedLineNumber = line;\r
+                       savedLinePosition = column;\r
                        return currentToken != Token.EOF;\r
                }\r
 \r
@@ -92,10 +100,85 @@ namespace Commons.Xml.Relaxng.Rnc
 \r
                // private methods\r
 \r
+               // Accumulates hexadecimal digits read from 'source' onto 'current'\r
+               // (most significant digit first) and returns the resulting value.\r
+               // The first value read that is not a hex digit (including -1 at\r
+               // end of input) is stored in 'peekChar' so that the caller can\r
+               // inspect the terminator.\r
+               private int ReadEscapedHexNumber (int current)\r
+               {\r
+                       for (;;) {\r
+                               int ch = source.Read ();\r
+                               int digit;\r
+                               if (ch >= '0' && ch <= '9')\r
+                                       digit = ch - '0';\r
+                               else if (ch >= 'A' && ch <= 'F')\r
+                                       digit = ch - 'A' + 10;\r
+                               else if (ch >= 'a' && ch <= 'f')\r
+                                       digit = ch - 'a' + 10;\r
+                               else {\r
+                                       // Not a hex digit: hand it back via peekChar.\r
+                                       peekChar = ch;\r
+                                       return current;\r
+                               }\r
+                               current = current * 16 + digit;\r
+                       }\r
+               }\r
+\r
+               // Reads the next character from 'source', decoding an escape of\r
+               // the form \x{HEX} (one or more 'x' are accepted before '{') into\r
+               // the number it denotes. Any other text following a backslash is\r
+               // not consumed logically: the backslash itself is returned and the\r
+               // characters already read are queued in 'peekString'.\r
+               private int ReadFromStream ()\r
+               {\r
+                       int first = source.Read ();\r
+                       if (first != '\\')\r
+                               return first;\r
+\r
+                       int second = source.Read ();\r
+                       if (second == 'x') {\r
+                               // Count the run of 'x' and fetch the character after it.\r
+                               int run = 1;\r
+                               int after = source.Read ();\r
+                               while (after == 'x') {\r
+                                       run++;\r
+                                       after = source.Read ();\r
+                               }\r
+                               if (after != '{') {\r
+                                       // Not an escape: replay the x's (and the following\r
+                                       // character, unless it was end of input).\r
+                                       string pending = new string ('x', run);\r
+                                       if (after >= 0)\r
+                                               pending += (char) after;\r
+                                       peekString = pending;\r
+                                       return '\\';\r
+                               }\r
+                               int number = ReadEscapedHexNumber (0);\r
+                               if (peekChar == '}') {\r
+                                       // Well-formed escape: drop the closing brace.\r
+                                       peekChar = 0;\r
+                                       return number;\r
+                               }\r
+                               // NOTE(review): for an unterminated \x{... the value read so\r
+                               // far is queued as a single char and the terminator stays in\r
+                               // peekChar; confirm this fallback is intended.\r
+                               second = number;\r
+                       }\r
+                       peekString = new string ((char) second, 1);\r
+                       return '\\';\r
+               }\r
+\r
                private int PeekChar ()\r
                {\r
-                       if (peekChar == 0)\r
-                               peekChar = source.Read ();\r
+                       if (peekChar == 0) {\r
+                               if (peekString != null) {\r
+                                       peekChar = peekString [0];\r
+                                       peekString = peekString.Length == 1 ?\r
+                                               null : peekString.Substring (1);\r
+                               }\r
+                               else\r
+                                       peekChar = ReadFromStream ();\r
+                       }\r
+\r
                        return peekChar;\r
                }\r
 \r
@@ -106,8 +189,13 @@ namespace Commons.Xml.Relaxng.Rnc
                                ret = peekChar;\r
                                peekChar = 0;\r
                        }\r
+                       else if (peekString != null) {\r
+                               ret = peekString [0];\r
+                               peekString = peekString.Length == 1 ?\r
+                                       null : peekString.Substring (1);\r
+                       }\r
                        else\r
-                               ret = source.Read ();\r
+                               ret = ReadFromStream ();\r
 \r
                        if (nextIncrementLine) {\r
                                line++;\r
@@ -146,25 +234,82 @@ namespace Commons.Xml.Relaxng.Rnc
 \r
                char [] nameBuffer = new char [30];\r
 \r
-               // TODO: parse three quoted\r
                private string ReadQuoted (char quoteChar)\r
                {\r
                        int index = 0;\r
                        bool loop = true;\r
-                       do {\r
+                       while (loop) {\r
                                int c = ReadChar ();\r
                                switch (c) {\r
                                case -1:\r
+                               case '\'':\r
                                case '\"':\r
+                                       if (quoteChar != c)\r
+                                               goto default;\r
                                        loop = false;\r
                                        break;\r
                                default:\r
-                                       if (nameBuffer.Length == index) {\r
-                                               char [] arr = new char [index * 2];\r
-                                               Array.Copy (nameBuffer, arr, index);\r
-                                               nameBuffer = arr;\r
+                                       if (c < 0)\r
+                                               throw new RelaxngException ("Unterminated quoted literal.");\r
+                                       if (XmlChar.IsInvalid (c))\r
+                                               throw new RelaxngException ("Invalid character in literal.");\r
+                                       AppendNameChar (c, ref index);\r
+                                       break;\r
+                               }\r
+                       }\r
+\r
+                       return new string (nameBuffer, 0, index);\r
+               }\r
+\r
+               // Appends the character 'c' to 'nameBuffer' at position 'index',\r
+               // advancing 'index' and doubling the buffer when it is full.\r
+               // Values of 0x10000 and above do not fit in one UTF-16 code unit\r
+               // and are appended as a high/low surrogate pair.\r
+               private void AppendNameChar (int c, ref int index)\r
+               {\r
+                       // The boundary is inclusive: U+10000 itself needs a surrogate\r
+                       // pair, casting it to char would truncate it to U+0000.\r
+                       if (c >= 0x10000) {\r
+                               AppendNameChar ((c - 0x10000) / 0x400 + 0xD800, ref index);\r
+                               AppendNameChar ((c - 0x10000) % 0x400 + 0xDC00, ref index);\r
+                               return;\r
+                       }\r
+                       if (nameBuffer.Length == index) {\r
+                               char [] arr = new char [index * 2];\r
+                               Array.Copy (nameBuffer, arr, index);\r
+                               nameBuffer = arr;\r
+                       }\r
+                       nameBuffer [index++] = (char) c;\r
+               }\r
+\r
+               private string ReadTripleQuoted (char quoteChar)\r
+               {\r
+                       int index = 0;\r
+                       bool loop = true;\r
+                       do {\r
+                               int c = ReadChar ();\r
+                               switch (c) {\r
+                               case -1:\r
+                               case '\'':\r
+                               case '\"':\r
+                                       // 1\r
+                                       if (quoteChar != c)\r
+                                               goto default;\r
+                                       // 2\r
+                                       if ((c = PeekChar ()) != quoteChar) {\r
+                                               AppendNameChar (quoteChar, ref index);\r
+                                               goto default;\r
                                        }\r
-                                       nameBuffer [index++] = (char) c;\r
+                                       ReadChar ();\r
+                                       // 3\r
+                                       if ((c = PeekChar ()) == quoteChar) {\r
+                                               ReadChar ();\r
+                                               loop = false;\r
+                                               break;\r
+                                       }\r
+                                       AppendNameChar (quoteChar, ref index);\r
+                                       AppendNameChar (quoteChar, ref index);\r
+                                       break;\r
+                               default:\r
+                                       if (c < 0)\r
+                                               throw new RelaxngException ("Unterminated triple-quoted literal.");\r
+                                       if (XmlChar.IsInvalid (c))\r
+                                               throw new RelaxngException ("Invalid character in literal.");\r
+                                       AppendNameChar (c, ref index);\r
                                        break;\r
                                }\r
                        } while (loop);\r
@@ -172,12 +317,15 @@ namespace Commons.Xml.Relaxng.Rnc
                        return new string (nameBuffer, 0, index);\r
                }\r
 \r
-               private string ReadOneToken ()\r
+               private string ReadOneName ()\r
                {\r
                        int index = 0;\r
                        bool loop = true;\r
+                       int c = PeekChar ();\r
+                       if (!XmlChar.IsFirstNameChar (c) || !XmlChar.IsNCNameChar (c))\r
+                               throw new RelaxngException (String.Format ("Invalid NCName start character: {0}", c));\r
                        do {\r
-                               int c = PeekChar ();\r
+                               c = PeekChar ();\r
                                switch (c) {\r
                                case -1:\r
                                case ' ':\r
@@ -188,12 +336,7 @@ namespace Commons.Xml.Relaxng.Rnc
                                        loop = false;\r
                                        break;\r
                                default:\r
-                                       if (!IsTokenContinuable (c)) {\r
-                                               if (c == ':') {\r
-                                                       if (prefixName != null)\r
-                                                               throw new RelaxngException ("Invalid colon was found.");\r
-                                                       prefixName = new string (nameBuffer, 0, index);\r
-                                               }\r
+                                       if (!XmlChar.IsNCNameChar (c)) {\r
                                                loop = false;\r
                                                break;\r
                                        }\r
@@ -220,35 +363,7 @@ namespace Commons.Xml.Relaxng.Rnc
                        return s;\r
                }\r
 \r
-               private bool IsTokenContinuable (int c)\r
-               {\r
-                       switch (c) {\r
-                       case '=':\r
-                       case ':':\r
-                       case ',':\r
-                       case '{':\r
-                       case '}':\r
-                       case '(':\r
-                       case ')':\r
-                       case '[':\r
-                       case ']':\r
-                       case '&':\r
-                       case '|':\r
-                       case '?':\r
-                       case '*':\r
-                       case '\\':\r
-                       case '+':\r
-                       case '-':\r
-                       case '>':\r
-                       case '#':\r
-                       case '\'':\r
-                       case '\"':\r
-                               return false;\r
-                       }\r
-                       return true;\r
-               }\r
-\r
-               private int ParseToken ()\r
+               private int ParseToken (bool backslashed)\r
                {\r
                        SkipWhitespaces ();\r
                        int c = ReadChar ();\r
@@ -258,19 +373,8 @@ namespace Commons.Xml.Relaxng.Rnc
                                return Token.EOF;\r
                        case '=':\r
                                return Token.Equal;\r
-                       case ':':\r
-                               // return CName\r
-                               if (prefixName == null)\r
-                                       throw new RelaxngException ("Invalid character ':' was found.");\r
-                               if (PeekChar () == '*') {\r
-                                       ReadChar ();\r
-                                       tokenValue = prefixName;\r
-                                       prefixName = null;\r
-                                       return Token.NsName;\r
-                               }\r
-                               tokenValue = prefixName + ":" + ReadOneToken ();\r
-                               prefixName = null;\r
-                               return Token.CName;\r
+                       case '~':\r
+                               return Token.Tilde;\r
                        case ',':\r
                                return Token.Comma;\r
                        case '{':\r
@@ -301,7 +405,9 @@ namespace Commons.Xml.Relaxng.Rnc
                                // See also ':' for NsName\r
                                return Token.Asterisk;\r
                        case '\\':\r
-                               return Token.BackSlash;\r
+                               if (backslashed)\r
+                                       return Token.ERROR;\r
+                               return ParseToken (true);\r
                        case '+':\r
                                return Token.Plus;\r
                        case '-':\r
@@ -314,22 +420,46 @@ namespace Commons.Xml.Relaxng.Rnc
                                peekChar = '>';\r
                                goto default;\r
                        case '#':\r
-                               // NOTE: This interpretation is expanded against the spec\r
-//                             if (ReadChar () != '#')\r
-//                                     throw new RelaxngException ("Invalid character after '#'.");\r
-                               tokenValue = ReadLine ();\r
-                               return Token.Documentation;\r
+//                             tokenValue = ReadLine ();\r
+//                             return Token.Documentation;\r
+                               ReadLine ();\r
+                               return ParseToken (false);\r
                        case '\'':\r
                        case '\"':\r
-                               name = ReadQuoted ((char) c);\r
+                               if (PeekChar () != c)\r
+                                       name = ReadQuoted ((char) c);\r
+                               else {\r
+                                       ReadChar ();\r
+                                       if (PeekChar () == c) {\r
+                                               ReadChar ();\r
+                                               name = ReadTripleQuoted ((char) c);\r
+                                       } // else '' or ""\r
+                                       name = String.Empty;\r
+                               }\r
+                               int invidx = XmlChar.IndexOfInvalid (name, true) ;\r
+                               if (invidx >= 0)\r
+                                       throw new RelaxngException (String.Format ("Invalid XML character in compact syntax literal segment at {0:X}", (int) name [invidx]));\r
                                tokenValue = name;\r
                                return Token.LiteralSegment;\r
                        default:\r
+                               if (!XmlChar.IsNCNameChar (c))\r
+                                       throw new RelaxngException ("Invalid NCName character.");\r
                                peekChar = c;\r
-                               name = ReadOneToken ();\r
-                               if (prefixName != null)\r
-                                       return ParseToken ();\r
+                               name = ReadOneName ();\r
+                               if (PeekChar () == ':') {\r
+                                       ReadChar ();\r
+                                       if (PeekChar () == '*') {\r
+                                               ReadChar ();\r
+                                               tokenValue = name;\r
+                                               return Token.NsName;\r
+                                       }\r
+                                       tokenValue = name + ":" + ReadOneName ();\r
+                                       return Token.CName;\r
+\r
+                               }\r
                                tokenValue = name;\r
+                               if (backslashed)\r
+                                       return Token.QuotedIdentifier;\r
                                switch (name) {\r
                                case "attribute":\r
                                        isElement = false;\r
@@ -372,10 +502,10 @@ namespace Commons.Xml.Relaxng.Rnc
                                case "token":\r
                                        return Token.KeywordToken;\r
                                default:\r
-                                       return Token.NCNameButKeyword;\r
+                                       return Token.NCName;\r
                                }\r
                        }\r
                }\r
 \r
        }\r
-}
+}\r