//
// (C) 2002 Piers Haken
//
+
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
using System;
+using System.Globalization;
using System.IO;
using System.Text;
using System.Collections;
{
internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
{
- private char [] m_rgchInput;
+ private string m_rgchInput; // expression text being tokenized; assigned from the constructor argument
private int m_ich;
private int m_cch;
-// private System.IO.StreamReader m_input;
private int m_iToken;
+ private int m_iTokenPrev = Token.EOF; // token produced by the previous advance(); Token.EOF until one has run
private Object m_objToken;
- private static Hashtable m_mapTokens = new Hashtable ();
- private static readonly Object [] rgTokenMap =
+ private bool m_fPrevWasOperator = false; // value m_fThisIsOperator had at the end of the previous advance()
+ private bool m_fThisIsOperator = false; // set by ParseToken when the current token counts as an operator
+ private static readonly Hashtable s_mapTokens = new Hashtable (); // keyword text -> token id; filled by the static constructor
+ private static readonly Object [] s_rgTokenMap = // flat list of (token id, keyword text) pairs
{
Token.AND, "and",
Token.OR, "or",
Token.PROCESSING_INSTRUCTION, "processing-instruction",
Token.NODE, "node",
};
+ private const char EOL = '\0'; // sentinel returned by Peek/GetChar once the input is exhausted
static Tokenizer ()
{
- for (int i = 0; i < rgTokenMap.Length; i += 2)
- m_mapTokens.Add (rgTokenMap [i + 1], rgTokenMap [i]);
+ for (int i = 0; i < s_rgTokenMap.Length; i += 2) // entries are consumed two at a time: token id, then keyword text
+ s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]); // key = keyword text, value = token id
}
public Tokenizer (string strInput)
{
- m_rgchInput = strInput.ToCharArray ();
+ //Console.WriteLine ("Tokenizing: " + strInput);
+ m_rgchInput = strInput; // the string is kept as-is; characters are read through its indexer
m_ich = 0;
m_cch = strInput.Length;
SkipWhitespace ();
}
- private int Peek ()
+ private char Peek (int iOffset) // returns the character iOffset positions past the cursor without consuming anything
{
- if (m_ich >= m_cch)
- return -1;
- return m_rgchInput [m_ich];
+ if (m_ich + iOffset>= m_cch)
+ return EOL; // requested position is at or past the end of the input
+ return m_rgchInput [m_ich + iOffset];
}
- private int GetChar ()
+ private char Peek () // character at the cursor, not consumed; shorthand for Peek (0)
+ {
+ return Peek (0);
+ }
+
+ private char GetChar () // returns the character at the cursor and moves the cursor past it
{
if (m_ich >= m_cch)
- return -1;
+ return EOL; // input exhausted: the cursor is not moved
return m_rgchInput [m_ich++];
}
- private int PutBack ()
+ private char PutBack () // moves the cursor back one character and returns that character
{
if (m_ich == 0)
- throw new Exception (); // TODO: better description
+ throw new XPathException ("XPath parser returned an error status: invalid tokenizer state."); // cursor is at the start, so there is nothing to put back
return m_rgchInput [--m_ich];
}
- private void SkipWhitespace ()
+ private bool SkipWhitespace () // returns true if any whitespace was skipped
{
+ if (!IsWhitespace (Peek ()))
+ return false;
+
while (IsWhitespace (Peek ()))
GetChar ();
+
+ return true;
}
- [MonoTODO]
private int ParseNumber ()
{
StringBuilder sb = new StringBuilder ();
while (IsDigit (Peek ()))
sb.Append ((char) GetChar ());
- // TODO: doesn't handle '3.' error case
+ // '3.' is deliberately not rejected: it is not an error (XPath section 3.7, production [30])
if (Peek () == '.')
{
sb.Append ((char) GetChar ());
while (IsDigit (Peek ()))
sb.Append ((char) GetChar ());
}
- m_objToken = Double.Parse (sb.ToString ());
+ m_objToken = Double.Parse (sb.ToString (), NumberFormatInfo.InvariantInfo); // culture-independent format, so parsing does not depend on the current culture's separators
return Token.NUMBER;
}
{
StringBuilder sb = new StringBuilder ();
- int chInit = GetChar ();
- int ch;
+ char chInit = GetChar (); // first character is the delimiter; the loop below runs until the same character appears again
+ char ch;
while ((ch = Peek ()) != chInit)
{
- if (ch == -1)
- return Token.ERROR;
+ if (ch == EOL)
+ throw new XPathException ("unmatched "+chInit+" in expression"); // input ended before the closing delimiter was seen
sb.Append ((char) GetChar ());
}
GetChar ();
return Token.LITERAL;
}
- private int ParseIdentifier ()
+ private string ReadIdentifier () // reads one name at the cursor; returns null, consuming nothing, if the next character cannot start a name
{
StringBuilder sb = new StringBuilder ();
- while (true)
+ char ch = Peek ();
+ if (!Char.IsLetter (ch) && ch != '_')
+ return null;
+
+ sb.Append ((char) GetChar ());
+
+ while ((ch = Peek ()) == '_' || ch == '-' || ch == '.' || Char.IsLetterOrDigit (ch)) // after the first character, '-', '.' and digits are accepted too
+ sb.Append ((char) GetChar ());
+
+ SkipWhitespace (); // whitespace after the name is consumed, so a following Peek () sees the next non-blank character
+ return sb.ToString ();
+ }
+
+ private int ParseIdentifier () // reads a name at the cursor, decides which token it is, and leaves the token value in m_objToken
+ {
+ string strToken = ReadIdentifier ();
+ Object objToken = s_mapTokens [strToken]; // non-null only when the name is a keyword registered in the token map
+
+ int iToken = (objToken != null) ? (int) objToken : Token.QName;
+ m_objToken = strToken;
+
+ char ch = Peek ();
+ if (ch == ':')
{
- int ch = Peek ();
- if (ch == '_' || ch == '-' ||
- (ch >= 'a' && ch <= 'z') ||
- (ch >= 'A' && ch <= 'Z'))
+ if (Peek (1) == ':')
{
- sb.Append ((char) GetChar ());
+ // If the two characters following an NCName (possibly
+ // after intervening ExprWhitespace) are ::, then the
+ // token must be recognized as an AxisName.
+ if (objToken == null || !IsAxisName (iToken))
+ throw new XPathException ("invalid axis name: '"+strToken+"'");
+ return iToken; // the '::' itself is not consumed here
}
- else
- break;
+
+ GetChar (); // a single ':' follows: consume it and treat strToken as the prefix part
+ SkipWhitespace ();
+ ch = Peek ();
+
+ if (ch == '*')
+ {
+ GetChar ();
+ m_objToken = new XmlQualifiedName ("", strToken); // 'prefix:*' form: empty first argument, prefix as the second
+ return Token.QName;
+ }
+ string strToken2 = ReadIdentifier ();
+ if (strToken2 == null)
+ throw new XPathException ("invalid QName: "+strToken+":"+(char)ch);
+
+ ch = Peek ();
+ m_objToken = new XmlQualifiedName (strToken2, strToken);
+ if (ch == '(')
+ return Token.FUNCTION_NAME;
+ return Token.QName;
}
- String strToken = sb.ToString ();
- Object objToken = m_mapTokens [strToken];
- if (objToken != null)
+
+ // If there is a preceding token and the preceding
+ // token is not one of @, ::, (, [, , or an Operator,
+ // then a * must be recognized as a MultiplyOperator
+ // and an NCName must be recognized as an OperatorName.
+ if (!IsFirstToken && !m_fPrevWasOperator)
{
- return (int) objToken;
+ if (objToken == null || !IsOperatorName (iToken))
+ throw new XPathException ("invalid operator name: '"+strToken+"'");
+ return iToken;
}
- else
- {
- m_objToken = strToken;
- SkipWhitespace ();
- if (Peek () == '(')
+ if (ch == '(')
+ {
+ // If the character following an NCName (possibly
+ // after intervening ExprWhitespace) is (, then the
+ // token must be recognized as a NodeType or a FunctionName.
+ if (objToken == null)
+ {
+ m_objToken = new XmlQualifiedName (strToken, "");
return Token.FUNCTION_NAME;
- return Token.NCName;
+ }
+ if (IsNodeType (iToken))
+ return iToken;
+ throw new XPathException ("invalid function name: '"+strToken+"'"); // a keyword that is not a node type was followed by '('
}
+
+ m_objToken = new XmlQualifiedName (strToken, ""); // no ':' or '(' follows and no operator is expected: token is a QName
+ return Token.QName;
}
- private static bool IsWhitespace (int ch)
+ private static bool IsWhitespace (char ch) // true only for space, tab, line feed and carriage return
{
+ // return Char.IsWhiteSpace (ch);
return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
}
- private static bool IsDigit (int ch)
+ private static bool IsDigit (char ch) // true only for the characters '0' through '9'
{
+ // return Char.IsDigit (ch);
return ch >= '0' && ch <= '9';
}
int ParseToken ()
{
- int ch = Peek ();
+ char ch = Peek ();
switch (ch)
{
- case -1:
+ case EOL: // Peek returns EOL once the input is exhausted
return Token.EOF;
case '/':
+ m_fThisIsOperator = true;
GetChar ();
if (Peek () == '/')
{
GetChar ();
if (Peek () == ':')
{
+ m_fThisIsOperator = true;
GetChar ();
return Token.COLON2;
}
- return Token.COLON;
+ return Token.ERROR;
case ',':
+ m_fThisIsOperator = true;
GetChar ();
return Token.COMMA;
case '@':
+ m_fThisIsOperator = true;
GetChar ();
return Token.AT;
case '[':
+ m_fThisIsOperator = true;
GetChar ();
return Token.BRACKET_OPEN;
return Token.BRACKET_CLOSE;
case '(':
+ m_fThisIsOperator = true;
GetChar ();
return Token.PAREN_OPEN;
return Token.PAREN_CLOSE;
case '+':
+ m_fThisIsOperator = true;
GetChar ();
return Token.PLUS;
case '-':
+ m_fThisIsOperator = true;
GetChar ();
return Token.MINUS;
case '*':
GetChar ();
+ if (!IsFirstToken && !m_fPrevWasOperator) // not the first token and the previous token was not an operator: '*' is multiplication
+ {
+ m_fThisIsOperator = true;
+ return Token.MULTIPLY;
+ }
return Token.ASTERISK;
case '$':
GetChar ();
+ m_fThisIsOperator = true;
return Token.DOLLAR;
case '|':
+ m_fThisIsOperator = true;
GetChar ();
return Token.BAR;
case '=':
+ m_fThisIsOperator = true;
GetChar ();
return Token.EQ;
GetChar ();
if (Peek () == '=')
{
+ m_fThisIsOperator = true;
GetChar ();
return Token.NE;
}
break;
case '>':
+ m_fThisIsOperator = true;
GetChar ();
if (Peek () == '=')
{
return Token.GT;
case '<':
+ m_fThisIsOperator = true;
GetChar ();
if (Peek () == '=')
{
return ParseLiteral ();
default:
+ if (IsDigit (ch))
+ {
+ return ParseNumber ();
+ }
+ else if (Char.IsLetter (ch) || ch == '_') // NCName
{
- if (IsDigit (ch))
- {
- return ParseNumber ();
- }
- else
- {
- return ParseIdentifier ();
- }
+ int iToken = ParseIdentifier ();
+ if (IsOperatorName (iToken)) // Token.AND/OR/MOD/DIV are also flagged as operators for the next token's classification
+ m_fThisIsOperator = true;
+ return iToken;
}
+ break;
}
- return Token.ERROR;
+ throw new XPathException ("invalid token: '"+ch+"'"); // reached via 'break' from the switch: no rule accepted this character
}
///////////////////////////
*/
public bool advance ()
{
+ m_fThisIsOperator = false; // reset before ParseToken, which sets it for tokens that count as operators
m_objToken = null;
m_iToken = ParseToken ();
SkipWhitespace ();
+ m_iTokenPrev = m_iToken; // remembered so the next call can tell (via IsFirstToken) whether a token precedes it
+ m_fPrevWasOperator = m_fThisIsOperator; // carried over for the '*' and operator-name checks on the next token
return (m_iToken != Token.EOF);
}
{
return m_objToken;
}
+ private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; } } // true while m_iTokenPrev is Token.EOF, which is its initial value
+
+ private bool IsNodeType (int iToken) // true when iToken is Token.COMMENT, TEXT, PROCESSING_INSTRUCTION or NODE
+ {
+ switch (iToken)
+ {
+ case Token.COMMENT:
+ case Token.TEXT:
+ case Token.PROCESSING_INSTRUCTION:
+ case Token.NODE:
+ return true;
+ default:
+ return false;
+ }
+ }
+ private bool IsOperatorName (int iToken) // true when iToken is Token.AND, OR, MOD or DIV
+ {
+ switch (iToken)
+ {
+ case Token.AND:
+ case Token.OR:
+ case Token.MOD:
+ case Token.DIV:
+ return true;
+ default:
+ return false;
+ }
+ }
+ private bool IsAxisName (int iToken) // true when iToken is one of the thirteen axis tokens listed below
+ {
+ switch (iToken)
+ {
+ case Token.ATTRIBUTE:
+ case Token.ANCESTOR:
+ case Token.ANCESTOR_OR_SELF:
+ case Token.CHILD:
+ case Token.DESCENDANT:
+ case Token.DESCENDANT_OR_SELF:
+ case Token.FOLLOWING:
+ case Token.FOLLOWING_SIBLING:
+ case Token.NAMESPACE:
+ case Token.PARENT:
+ case Token.PRECEDING:
+ case Token.PRECEDING_SIBLING:
+ case Token.SELF:
+ return true;
+ default:
+ return false;
+ }
+ }
}
}