2 // System.Xml.XPath.Tokenizer
5 // Piers Haken (piersh@friskit.com)
7 // (C) 2002 Piers Haken
10 using System.Globalization;
13 using System.Collections;
15 using Mono.Xml.XPath.yyParser;
17 namespace System.Xml.XPath
19 internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
// Characters of the expression being tokenized (the constructor fills this from its input string).
21 private char [] m_rgchInput;
// Id of the previously returned token, recorded by advance (). It starts as Token.EOF,
// which IsFirstToken interprets as "no token has been seen yet".
25 private int m_iTokenPrev = Token.EOF;
// Value attached to the current token; the parse methods store a double, a string
// or an XmlQualifiedName here.
26 private Object m_objToken;
// Operator tracking: advance () clears m_fThisIsOperator before parsing a token and
// copies it into m_fPrevWasOperator afterwards, so the next token can be disambiguated.
27 private bool m_fPrevWasOperator = false;
28 private bool m_fThisIsOperator = false;
// Keyword lookup: keyword text -> Token id. Filled from s_rgTokenMap by the loop below.
29 private static readonly Hashtable s_mapTokens = new Hashtable ();
// Flat list of (Token id, keyword text) pairs: even slots hold the id, odd slots the text.
30 private static readonly Object [] s_rgTokenMap =
36 Token.ANCESTOR, "ancestor",
37 Token.ANCESTOR_OR_SELF, "ancestor-or-self",
38 Token.ATTRIBUTE, "attribute",
40 Token.DESCENDANT, "descendant",
41 Token.DESCENDANT_OR_SELF, "descendant-or-self",
42 Token.FOLLOWING, "following",
43 Token.FOLLOWING_SIBLING, "following-sibling",
44 Token.NAMESPACE, "namespace",
45 Token.PARENT, "parent",
46 Token.PRECEDING, "preceding",
47 Token.PRECEDING_SIBLING, "preceding-sibling",
49 Token.COMMENT, "comment",
51 Token.PROCESSING_INSTRUCTION, "processing-instruction",
// NOTE(review): presumably the sentinel character handed back at end of input — confirm against Peek/GetChar.
54 private const char EOL = '\0';
// Walk the table two slots at a time, registering each keyword (slot i + 1) under its Token id (slot i).
58 for (int i = 0; i < s_rgTokenMap.Length; i += 2)
59 s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);
// Creates a tokenizer over the expression text strInput.
// The text is copied into the m_rgchInput character array and its length is cached in m_cch.
62 public Tokenizer (string strInput)
64 //Console.WriteLine ("Tokenizing: " + strInput);
65 m_rgchInput = strInput.ToCharArray ();
67 m_cch = strInput.Length;
// Looks ahead without consuming input: returns the character iOffset positions past
// the current read index m_ich. The comparison with m_cch guards the array read
// against running off the end of the input.
71 private char Peek (int iOffset)
73 if (m_ich + iOffset>= m_cch)
75 return m_rgchInput [m_ich + iOffset];
// Consumes one character: returns the character at the read index m_ich and then moves m_ich forward by one.
83 private char GetChar ()
87 return m_rgchInput [m_ich++];
// Un-reads one character: moves the read index m_ich back by one and returns the character now at that index.
// NOTE(review): the XPathException presumably guards against backing up past the start of the input — confirm the condition.
90 private char PutBack ()
93 throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
94 return m_rgchInput [--m_ich];
// Moves past a run of whitespace characters (as classified by IsWhitespace) at the current position.
97 private bool SkipWhitespace () // returns true if any whitespace was skipped
// Fast path: the next character is not whitespace.
99 if (!IsWhitespace (Peek ()))
102 while (IsWhitespace (Peek ()))
// Scans a numeric literal starting at the current position and stores its value,
// as a double, in m_objToken. The return value is the token id for the literal.
108 private int ParseNumber ()
110 StringBuilder sb = new StringBuilder ();
// Integer part: collect the leading run of digits.
112 while (IsDigit (Peek ()))
113 sb.Append ((char) GetChar ());
115 // a trailing '.' with no digits after it (e.g. '3.') is not an error: see the Number production [30] in section 3.7 of the XPath specification
118 sb.Append ((char) GetChar ());
// Digits after the separator, if there are any.
119 while (IsDigit (Peek ()))
120 sb.Append ((char) GetChar ());
// Parse with the invariant number format so the result does not depend on the current culture.
122 m_objToken = Double.Parse (sb.ToString (), NumberFormatInfo.InvariantInfo);
// Scans a quoted string literal. The first character read is the opening quote;
// characters are collected until that same quote character comes up again.
// The collected text (quotes excluded) is stored in m_objToken.
// Returns Token.LITERAL; throws XPathException when the opening quote is never matched.
126 private int ParseLiteral ()
128 StringBuilder sb = new StringBuilder ();
// Remember which quote character opened the literal; it is consumed but not stored.
130 char chInit = GetChar ();
132 while ((ch = Peek ()) != chInit)
135 throw new XPathException ("unmatched "+chInit+" in expression");
136 sb.Append ((char) GetChar ());
139 m_objToken = sb.ToString ();
140 return Token.LITERAL;
// Reads a name at the current position and returns its text.
// The first character has to be a letter or '_'; subsequent characters may be
// letters, digits, '_', '-' or '.'.
// NOTE(review): ParseIdentifier compares the result with null, so the failed
// first-character check presumably returns null — confirm.
143 private string ReadIdentifier ()
145 StringBuilder sb = new StringBuilder ();
// Reject anything that cannot start a name.
148 if (!Char.IsLetter (ch) && ch != '_')
151 sb.Append ((char) GetChar ());
// Consume the remaining name characters.
153 while ((ch = Peek ()) == '_' || ch == '-' || ch == '.' || Char.IsLetterOrDigit (ch))
154 sb.Append ((char) GetChar ());
157 return sb.ToString ();
// Reads an identifier and decides which kind of token it is.
// The name is looked up in the keyword table s_mapTokens; names that are not
// keywords default to Token.QName. The disambiguation rules quoted in the comments
// below then decide between axis name, prefixed name, operator name, node type
// and function name. m_objToken receives the name, either as a plain string or
// as an XmlQualifiedName.
// Throws XPathException for an invalid axis name, QName, operator name or function name.
160 private int ParseIdentifier ()
162 string strToken = ReadIdentifier ();
163 Object objToken = s_mapTokens [strToken];
// Keyword hit -> its Token id; anything else starts out as a QName.
165 int iToken = (objToken != null) ? (int) objToken : Token.QName;
166 m_objToken = strToken;
173 // If the two characters following an NCName (possibly
174 // after intervening ExprWhitespace) are ::, then the
175 // token must be recognized as an AxisName.
176 if (objToken == null || !IsAxisName (iToken))
177 throw new XPathException ("invalid axis name: '"+strToken+"'");
// The first identifier is passed as the second XmlQualifiedName argument and the local name is left empty.
188 m_objToken = new XmlQualifiedName ("", strToken);
// Second identifier of a two-part name: it becomes the first XmlQualifiedName argument, strToken the second.
191 string strToken2 = ReadIdentifier ();
192 if (strToken2 == null)
193 throw new XPathException ("invalid QName: "+strToken+":"+(char)ch);
196 m_objToken = new XmlQualifiedName (strToken2, strToken);
198 return Token.FUNCTION_NAME;
202 // If there is a preceding token and the preceding
203 // token is not one of @, ::, (, [, , or an Operator,
204 // then a * must be recognized as a MultiplyOperator
205 // and an NCName must be recognized as an OperatorName.
206 if (!IsFirstToken && !m_fPrevWasOperator)
208 if (objToken == null || !IsOperatorName (iToken))
209 throw new XPathException ("invalid operator name: '"+strToken+"'");
215 // If the character following an NCName (possibly
216 // after intervening ExprWhitespace) is (, then the
217 // token must be recognized as a NodeType or a FunctionName.
// Not a keyword at all: treat the name as a function name.
218 if (objToken == null)
220 m_objToken = new XmlQualifiedName (strToken, "");
221 return Token.FUNCTION_NAME;
223 if (IsNodeType (iToken))
225 throw new XPathException ("invalid function name: '"+strToken+"'");
228 m_objToken = new XmlQualifiedName (strToken, "");
// True when ch is a space, tab, line feed or carriage return.
// Only these four characters count; the broader Char.IsWhiteSpace test is left commented out.
232 private static bool IsWhitespace (char ch)
234 // return Char.IsWhiteSpace (ch);
235 return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
// True when ch is one of the ASCII digits '0' through '9'.
// The range test is used in place of Char.IsDigit, which is left commented out.
238 private static bool IsDigit (char ch)
240 // return Char.IsDigit (ch);
241 return ch >= '0' && ch <= '9';
// NOTE(review): these statements appear to be the body of the token scanner that
// advance () calls (ParseToken) — confirm against the method header.
// Throughout, m_fThisIsOperator is set to true before a token is returned when that
// token should count as an operator for classifying the token that follows it.
254 m_fThisIsOperator = true;
// A digit in the lookahead position starts a number.
270 else if (IsDigit (Peek ()))
273 return ParseNumber ();
281 m_fThisIsOperator = true;
288 m_fThisIsOperator = true;
293 m_fThisIsOperator = true;
298 m_fThisIsOperator = true;
300 return Token.BRACKET_OPEN;
304 return Token.BRACKET_CLOSE;
307 m_fThisIsOperator = true;
309 return Token.PAREN_OPEN;
313 return Token.PAREN_CLOSE;
316 m_fThisIsOperator = true;
321 m_fThisIsOperator = true;
// Same rule as for operator names: when there is a preceding token and it was not
// an operator the result is Token.MULTIPLY, otherwise it is Token.ASTERISK.
327 if (!IsFirstToken && !m_fPrevWasOperator)
329 m_fThisIsOperator = true;
330 return Token.MULTIPLY;
332 return Token.ASTERISK;
336 m_fThisIsOperator = true;
340 m_fThisIsOperator = true;
345 m_fThisIsOperator = true;
353 m_fThisIsOperator = true;
360 m_fThisIsOperator = true;
370 m_fThisIsOperator = true;
380 return ParseLiteral ();
383 return ParseLiteral ();
388 return ParseNumber ();
390 else if (Char.IsLetter (ch) || ch == '_') // NCName
392 int iToken = ParseIdentifier ();
// Operator keywords count as operators for the next token's classification.
393 if (IsOperatorName (iToken))
394 m_fThisIsOperator = true;
// No rule matched the current character.
399 throw new XPathException ("invalid token: '"+ch+"'");
402 ///////////////////////////
403 // yyParser.yyInput methods
404 ///////////////////////////
406 /** move on to next token.
407 @return false if positioned beyond tokens.
408 @throws IOException on input error.
// Scans the next token via ParseToken (), skips the whitespace that follows it, and
// records the token id and operator flag so the following token can be classified.
// Returns false once the scanned token is Token.EOF.
410 public bool advance ()
// Start each token with a clean operator flag; the scanner sets it when appropriate.
412 m_fThisIsOperator = false;
414 m_iToken = ParseToken ();
415 bool fWhitespace = SkipWhitespace ();
// Remember this token for the context-dependent rules applied to the next one.
416 m_iTokenPrev = m_iToken;
417 m_fPrevWasOperator = m_fThisIsOperator;
418 return (m_iToken != Token.EOF);
421 /** classifies current token.
422 Should not be called if advance() returned false.
423 @return current %token or single character.
430 /** associated with current token.
431 Should not be called if advance() returned false.
432 @return value for token().
434 public Object value ()
// True while m_iTokenPrev is Token.EOF, which is its initial value before advance () has recorded any token.
438 private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; } }
440 private bool IsNodeType (int iToken)
446 case Token.PROCESSING_INSTRUCTION:
453 private bool IsOperatorName (int iToken)
466 private bool IsAxisName (int iToken)
470 case Token.ATTRIBUTE:
472 case Token.ANCESTOR_OR_SELF:
474 case Token.DESCENDANT:
475 case Token.DESCENDANT_OR_SELF:
476 case Token.FOLLOWING:
477 case Token.FOLLOWING_SIBLING:
478 case Token.NAMESPACE:
480 case Token.PRECEDING:
481 case Token.PRECEDING_SIBLING: