2004-12-01 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / System.XML / System.Xml.XPath / Tokenizer.cs
index df94f5703e87ecf107d76c4bd366d8e665ad0220..88fe0a9ca2b06b7b00a4aedb9b0dfe9487dd9e48 100644 (file)
@@ -6,7 +6,29 @@
 //
 // (C) 2002 Piers Haken
 //
+
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
 using System;
+using System.Globalization;
 using System.IO;
 using System.Text;
 using System.Collections;
@@ -17,14 +39,16 @@ namespace System.Xml.XPath
 {
        internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
        {
-               private char [] m_rgchInput;
+               private string m_rgchInput;
                private int m_ich;
                private int m_cch;
-//             private System.IO.StreamReader m_input;
                private int m_iToken;
+               private int m_iTokenPrev = Token.EOF;
                private Object m_objToken;
-               private static Hashtable m_mapTokens = new Hashtable ();
-               private static readonly Object [] rgTokenMap =
+               private bool m_fPrevWasOperator = false;
+               private bool m_fThisIsOperator = false;
+               private static readonly Hashtable s_mapTokens = new Hashtable ();
+               private static readonly Object [] s_rgTokenMap =
                {
                   Token.AND, "and",
                   Token.OR, "or",
@@ -48,49 +72,60 @@ namespace System.Xml.XPath
                   Token.PROCESSING_INSTRUCTION, "processing-instruction",
                   Token.NODE, "node",
                };
+               private const char EOL = '\0';
 
                static Tokenizer ()
                {
-                       for (int i = 0; i < rgTokenMap.Length; i += 2)
-                               m_mapTokens.Add (rgTokenMap [i + 1], rgTokenMap [i]);
+                       for (int i = 0; i < s_rgTokenMap.Length; i += 2)  // entries are laid out as (token, name) pairs
+                               s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);  // key = name string, value = token
                }
 
                public Tokenizer (string strInput)
                {
-                       m_rgchInput = strInput.ToCharArray ();
+                       //Console.WriteLine ("Tokenizing: " + strInput);
+                       m_rgchInput = strInput;  // kept as a string; Peek/GetChar/PutBack index into it directly
                        m_ich = 0;
                        m_cch = strInput.Length;
                        SkipWhitespace ();
                }
 
-               private int Peek ()
+               private char Peek (int iOffset)  // character at m_ich + iOffset; the read position is not moved
                {
-                       if (m_ich >= m_cch)
-                               return -1;
-                       return m_rgchInput [m_ich];
+                       if (m_ich + iOffset>= m_cch)  // NOTE(review): only the upper bound is checked; a negative iOffset is not validated
+                               return EOL;  // EOL ('\0') is the end-of-input sentinel
+                       return m_rgchInput [m_ich + iOffset];
                }
 
-               private int GetChar ()
+               private char Peek ()  // character at the read position (EOL at end of input); nothing is consumed
+               {
+                       return Peek (0);
+               }
+
+               private char GetChar ()  // returns the character at the read position and advances past it
                {
                        if (m_ich >= m_cch)
-                               return -1;
+                               return EOL;  // end of input: the read position is left unchanged
                        return m_rgchInput [m_ich++];
                }
 
-               private int PutBack ()
+               private char PutBack ()  // moves the read position back one character and returns that character
                {
                        if (m_ich == 0)
-                               throw new Exception (); // TODO: better description
+                               throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");  // already at the start: nothing to put back
                        return m_rgchInput [--m_ich];
                }
 
-               private void SkipWhitespace ()
+               private bool SkipWhitespace ()  // returns true if any whitespace was skipped
                {
+                       if (!IsWhitespace (Peek ()))  // nothing to skip at the read position
+                               return false;
+                                       
                        while (IsWhitespace (Peek ()))
                                GetChar ();
+
+                       return true;
                }
 
-               [MonoTODO]
                private int ParseNumber ()
                {
                        StringBuilder sb = new StringBuilder ();
@@ -98,14 +133,14 @@ namespace System.Xml.XPath
                        while (IsDigit (Peek ()))
                                sb.Append ((char) GetChar ());
 
-                       // TODO: doesn't handle '3.' error case
+                       // '3.' is deliberately not treated as an error: XPath 1.0 section 3.7, production [30] (Number), allows digits followed by a bare '.'
                        if (Peek () == '.')
                        {
                                sb.Append ((char) GetChar ());
                                while (IsDigit (Peek ()))
                                        sb.Append ((char) GetChar ());
                        }
-                       m_objToken = Double.Parse (sb.ToString ());
+                       m_objToken = Double.Parse (sb.ToString (), NumberFormatInfo.InvariantInfo);
                        return Token.NUMBER;
                }
 
@@ -113,12 +148,12 @@ namespace System.Xml.XPath
                {
                        StringBuilder sb = new StringBuilder ();
 
-                       int chInit = GetChar ();
-                       int ch;
+                       char chInit = GetChar ();
+                       char ch;
                        while ((ch = Peek ()) != chInit)
                        {
-                               if (ch == -1)
-                                       return Token.ERROR;
+                               if (ch == EOL)
+                                       throw new XPathException ("unmatched "+chInit+" in expression");
                                sb.Append ((char) GetChar ());
                        }
                        GetChar ();
@@ -126,59 +161,118 @@ namespace System.Xml.XPath
                        return Token.LITERAL;
                }
 
-               private int ParseIdentifier ()
+               private string ReadIdentifier ()  // reads a name at the read position; returns null (consuming nothing) if none starts here
                {
                        StringBuilder sb = new StringBuilder ();
 
-                       while (true)
+                       char ch = Peek ();
+                       if (!Char.IsLetter (ch) && ch != '_')  // the first character must be a letter or '_'
+                               return null;
+
+                       sb.Append ((char) GetChar ());
+
+                       while ((ch = Peek ()) == '_' || ch == '-' || ch == '.' || Char.IsLetterOrDigit (ch))  // later characters may also be digits, '-' or '.'
+                               sb.Append ((char) GetChar ());
+
+                       SkipWhitespace ();  // trailing whitespace is consumed too, so the caller's next Peek () sees what follows the name
+                       return sb.ToString ();
+               }
+
+               private int ParseIdentifier ()  // classifies the name at the read position: axis name, operator name, node type, function name or QName
+               {
+                       string strToken = ReadIdentifier ();
+                       Object objToken = s_mapTokens [strToken];  // null unless strToken is one of the names registered from s_rgTokenMap
+
+                       int iToken = (objToken != null) ? (int) objToken : Token.QName;
+                       m_objToken = strToken;  // default token value; the QName and function-name paths below replace it with an XmlQualifiedName
+
+                       char ch = Peek ();
+                       if (ch == ':')
                        {
-                               int ch = Peek ();
-                               if (ch == '_' || ch == '-' ||
-                                               (ch >= 'a' && ch <= 'z') ||
-                                               (ch >= 'A' && ch <= 'Z'))
+                               if (Peek (1) == ':')
                                {
-                                       sb.Append ((char) GetChar ());
+                                       // If the two characters following an NCName (possibly
+                                       // after intervening ExprWhitespace) are ::, then the
+                                       // token must be recognized as an AxisName.
+                                       if (objToken == null || !IsAxisName (iToken))
+                                               throw new XPathException ("invalid axis name: '"+strToken+"'");
+                                       return iToken;  // the '::' itself is not consumed here
                                }
-                               else
-                                       break;
+
+                               GetChar ();  // consume the single ':' between prefix and local part
+                               SkipWhitespace ();
+                               ch = Peek ();
+
+                               if (ch == '*')
+                               {
+                                       GetChar ();
+                                       m_objToken = new XmlQualifiedName ("", strToken);  // "prefix:*": empty name, prefix passed as the second argument
+                                       return Token.QName;
+                               }
+                               string strToken2 = ReadIdentifier ();
+                               if (strToken2 == null)
+                                       throw new XPathException ("invalid QName: "+strToken+":"+(char)ch);
+
+                               ch = Peek ();
+                               m_objToken = new XmlQualifiedName (strToken2, strToken);  // (local part, prefix)
+                               if (ch == '(')
+                                       return Token.FUNCTION_NAME;
+                               return Token.QName;
                        }
-                       String strToken = sb.ToString ();
-                       Object objToken = m_mapTokens [strToken];
-                       if (objToken != null)
+
+                       // If there is a preceding token and the preceding
+                       // token is not one of @, ::, (, [, , or an Operator,
+                       // then a * must be recognized as a MultiplyOperator
+                       // and an NCName must be recognized as an OperatorName.
+                       if (!IsFirstToken && !m_fPrevWasOperator)
                        {
-                               return (int) objToken;
+                               if (objToken == null || !IsOperatorName (iToken))
+                                       throw new XPathException ("invalid operator name: '"+strToken+"'");
+                               return iToken;
                        }
-                       else
-                       {
-                               m_objToken = strToken;
 
-                               SkipWhitespace ();
-                               if (Peek () == '(')                                     
+                       if (ch == '(')
+                       {
+                               // If the character following an NCName (possibly
+                               // after intervening ExprWhitespace) is (, then the
+                               // token must be recognized as a NodeType or a FunctionName.
+                               if (objToken == null)
+                               {
+                                       m_objToken = new XmlQualifiedName (strToken, "");  // unprefixed function name: second argument is empty
                                        return Token.FUNCTION_NAME;
-                               return Token.NCName;
+                               }
+                               if (IsNodeType (iToken))
+                                       return iToken;
+                               throw new XPathException ("invalid function name: '"+strToken+"'");
                        }
+
+                       m_objToken = new XmlQualifiedName (strToken, "");  // plain unprefixed name
+                       return Token.QName;
                }
 
-               private static bool IsWhitespace (int ch)
+               private static bool IsWhitespace (char ch)
                {
+                       // Char.IsWhiteSpace (ch) is not used: only space, tab, LF and CR count as whitespace here.
                        return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
                }
 
-               private static bool IsDigit (int ch)
+               private static bool IsDigit (char ch)
                {
+                       // Char.IsDigit (ch) is not used: only the ASCII digits '0'..'9' count as digits here.
                        return ch >= '0' && ch <= '9';
                }
 
 
                int ParseToken ()
                {
-                       int ch = Peek ();
+                       char ch = Peek ();
                        switch (ch)
                        {
-                               case -1:
+                               case EOL:
                                        return Token.EOF;
 
                                case '/':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '/')
                                        {
@@ -205,20 +299,24 @@ namespace System.Xml.XPath
                                        GetChar ();
                                        if (Peek () == ':')
                                        {
+                                               m_fThisIsOperator = true;
                                                GetChar ();
                                                return Token.COLON2;
                                        }
-                                       return Token.COLON;
+                                       return Token.ERROR;
 
                                case ',':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.COMMA;
 
                                case '@':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.AT;
 
                                case '[':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.BRACKET_OPEN;
 
@@ -227,6 +325,7 @@ namespace System.Xml.XPath
                                        return Token.BRACKET_CLOSE;
 
                                case '(':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.PAREN_OPEN;
 
@@ -235,26 +334,36 @@ namespace System.Xml.XPath
                                        return Token.PAREN_CLOSE;
 
                                case '+':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.PLUS;
 
                                case '-':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.MINUS;
 
                                case '*':
                                        GetChar ();
+                                       if (!IsFirstToken && !m_fPrevWasOperator)
+                                       {
+                                               m_fThisIsOperator = true;
+                                               return Token.MULTIPLY;
+                                       }
                                        return Token.ASTERISK;
 
                                case '$':
                                        GetChar ();
+                                       m_fThisIsOperator = true;
                                        return Token.DOLLAR;
 
                                case '|':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.BAR;
 
                                case '=':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.EQ;
 
@@ -262,12 +371,14 @@ namespace System.Xml.XPath
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
+                                               m_fThisIsOperator = true;
                                                GetChar ();
                                                return Token.NE;
                                        }
                                        break;
 
                                case '>':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
@@ -277,6 +388,7 @@ namespace System.Xml.XPath
                                        return Token.GT;
 
                                case '<':
+                                       m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
@@ -292,18 +404,20 @@ namespace System.Xml.XPath
                                        return ParseLiteral ();
 
                                default:
+                                       if (IsDigit (ch))
+                                       {
+                                               return ParseNumber ();
+                                       }
+                                       else if (Char.IsLetter (ch) || ch == '_')        // NCName
                                        {
-                                               if (IsDigit (ch))
-                                               {
-                                                       return ParseNumber ();
-                                               }
-                                               else
-                                               {
-                                                       return ParseIdentifier ();
-                                               }
+                                               int iToken = ParseIdentifier ();
+                                               if (IsOperatorName (iToken))
+                                                       m_fThisIsOperator = true;
+                                               return iToken;
                                        }
+                                       break;
                        }
-                       return Token.ERROR;
+                       throw new XPathException ("invalid token: '"+ch+"'");
                }
 
                ///////////////////////////
@@ -316,9 +430,12 @@ namespace System.Xml.XPath
                  */
                public bool advance ()
                {
+                       m_fThisIsOperator = false;  // ParseToken sets this for the tokens it treats as operators
                        m_objToken = null;
                        m_iToken = ParseToken ();
                        SkipWhitespace ();
+                       m_iTokenPrev = m_iToken;  // IsFirstToken reads this to tell whether any token has been produced yet
+                       m_fPrevWasOperator = m_fThisIsOperator;  // read on the next call to disambiguate '*' and operator names
                        return (m_iToken != Token.EOF);
                }
 
@@ -339,5 +456,55 @@ namespace System.Xml.XPath
                {
                        return m_objToken;
                }
+               private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; } }  // m_iTokenPrev starts as Token.EOF and is overwritten by each advance ()
+
+               private bool IsNodeType (int iToken)  // true for COMMENT, TEXT, PROCESSING_INSTRUCTION and NODE; ParseIdentifier accepts these before '('
+               {
+                       switch (iToken)
+                       {
+                               case Token.COMMENT:
+                               case Token.TEXT:
+                               case Token.PROCESSING_INSTRUCTION:
+                               case Token.NODE:
+                                       return true;
+                               default:
+                                       return false;
+                       }
+               }
+               private bool IsOperatorName (int iToken)  // true for the keyword-operator tokens AND, OR, MOD and DIV
+               {
+                       switch (iToken)
+                       {
+                               case Token.AND:
+                               case Token.OR:
+                               case Token.MOD:
+                               case Token.DIV:
+                                       return true;
+                               default:
+                                       return false;
+                       }
+               }
+               private bool IsAxisName (int iToken)  // true for the axis tokens listed below; ParseIdentifier requires one of these before '::'
+               {
+                       switch (iToken)
+                       {
+                               case Token.ATTRIBUTE:
+                               case Token.ANCESTOR:
+                               case Token.ANCESTOR_OR_SELF:
+                               case Token.CHILD:
+                               case Token.DESCENDANT:
+                               case Token.DESCENDANT_OR_SELF:
+                               case Token.FOLLOWING:
+                               case Token.FOLLOWING_SIBLING:
+                               case Token.NAMESPACE:
+                               case Token.PARENT:
+                               case Token.PRECEDING:
+                               case Token.PRECEDING_SIBLING:
+                               case Token.SELF:
+                                       return true;
+                               default:
+                                       return false;
+                       }
+               }
        }
 }