// 2004-03-01 Atsushi Enomoto <atsushi@ximian.com>
// [mono.git] / mcs / class / System.XML / System.Xml.XPath / Tokenizer.cs
//
// System.Xml.XPath.Tokenizer
//
// Author:
//   Piers Haken (piersh@friskit.com)
//
// (C) 2002 Piers Haken
//
9 using System;
10 using System.Globalization;
11 using System.IO;
12 using System.Text;
13 using System.Collections;
14 using Mono.Xml.XPath;
15 using Mono.Xml.XPath.yyParser;
16
namespace System.Xml.XPath
{
        // Lexer that feeds the generated XPath parser: it walks the characters
        // of an expression string and hands out one token (plus an optional
        // value) per call to advance ().
        //
        // The lexical disambiguation rules quoted in the comments below come
        // from the XPath 1.0 recommendation, section 3.7 "Lexical Structure".
        internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
        {
                // Characters of the expression being tokenized.
                private readonly char [] m_rgchInput;
                // Index of the next unread character in m_rgchInput.
                private int m_ich;
                // Number of characters in the expression.
                private readonly int m_cch;
                // Token produced by the most recent advance ().
                private int m_iToken;
                // Token produced by the previous advance (); stays Token.EOF
                // until the first token has been read (see IsFirstToken).
                private int m_iTokenPrev = Token.EOF;
                // Value attached to the current token (Double, String or
                // XmlQualifiedName), or null when the token carries none.
                private Object m_objToken;
                // Whether the previous / current token is one after which a
                // name or '*' must NOT be read as an operator.
                private bool m_fPrevWasOperator = false;
                private bool m_fThisIsOperator = false;
                // Reserved word -> token id, filled once by the static constructor.
                private static readonly Hashtable s_mapTokens = new Hashtable ();
                // Flat list of (token id, reserved word) pairs used to build s_mapTokens.
                private static readonly Object [] s_rgTokenMap =
                {
                   Token.AND, "and",
                   Token.OR, "or",
                   Token.DIV, "div",
                   Token.MOD, "mod",
                   Token.ANCESTOR, "ancestor",
                   Token.ANCESTOR_OR_SELF, "ancestor-or-self",
                   Token.ATTRIBUTE, "attribute",
                   Token.CHILD, "child",
                   Token.DESCENDANT, "descendant",
                   Token.DESCENDANT_OR_SELF, "descendant-or-self",
                   Token.FOLLOWING, "following",
                   Token.FOLLOWING_SIBLING, "following-sibling",
                   Token.NAMESPACE, "namespace",
                   Token.PARENT, "parent",
                   Token.PRECEDING, "preceding",
                   Token.PRECEDING_SIBLING, "preceding-sibling",
                   Token.SELF, "self",
                   Token.COMMENT, "comment",
                   Token.TEXT, "text",
                   Token.PROCESSING_INSTRUCTION, "processing-instruction",
                   Token.NODE, "node",
                };
                // Sentinel returned by Peek () / GetChar () once the input is exhausted.
                private const char EOL = '\0';

                static Tokenizer ()
                {
                        // The table is laid out as (id, word) pairs; index it by word.
                        for (int i = 0; i < s_rgTokenMap.Length; i += 2)
                                s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);
                }

                // Starts tokenizing strInput; leading whitespace is skipped
                // immediately so the first advance () begins on a token.
                public Tokenizer (string strInput)
                {
                        //Console.WriteLine ("Tokenizing: " + strInput);
                        m_rgchInput = strInput.ToCharArray ();
                        m_ich = 0;
                        m_cch = strInput.Length;
                        SkipWhitespace ();
                }

                // Returns the character iOffset positions ahead of the read
                // position without consuming it, or EOL past the end of input.
                private char Peek (int iOffset)
                {
                        if (m_ich + iOffset >= m_cch)
                                return EOL;
                        return m_rgchInput [m_ich + iOffset];
                }

                // Returns the next unread character without consuming it.
                private char Peek ()
                {
                        return Peek (0);
                }

                // Consumes and returns the next character, or returns EOL
                // (without moving) when the input is exhausted.
                private char GetChar ()
                {
                        if (m_ich >= m_cch)
                                return EOL;
                        return m_rgchInput [m_ich++];
                }

                // Steps the read position back by one character and returns
                // that character. Throws XPathException at the start of input.
                private char PutBack ()
                {
                        if (m_ich == 0)
                                throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
                        return m_rgchInput [--m_ich];
                }

                // Consumes a run of whitespace; returns true if any whitespace was skipped.
                private bool SkipWhitespace ()
                {
                        if (!IsWhitespace (Peek ()))
                                return false;

                        while (IsWhitespace (Peek ()))
                                GetChar ();

                        return true;
                }

                // Reads Digits ('.' Digits?)? or '.' Digits, stores the parsed
                // Double in m_objToken and returns Token.NUMBER.
                private int ParseNumber ()
                {
                        StringBuilder sb = new StringBuilder ();

                        while (IsDigit (Peek ()))
                                sb.Append (GetChar ());

                        // don't handle '3.' as an error case (it is not. XPath 3.7 syntax [30])
                        if (Peek () == '.')
                        {
                                sb.Append (GetChar ());
                                while (IsDigit (Peek ()))
                                        sb.Append (GetChar ());
                        }
                        // Invariant format: the decimal separator is always '.',
                        // whatever the current culture is.
                        m_objToken = Double.Parse (sb.ToString (), NumberFormatInfo.InvariantInfo);
                        return Token.NUMBER;
                }

                // Reads a quoted literal. The opening quote character decides
                // which character closes it; the text between the quotes is
                // stored in m_objToken. Throws XPathException if the input
                // ends before the closing quote.
                private int ParseLiteral ()
                {
                        StringBuilder sb = new StringBuilder ();

                        char chInit = GetChar ();
                        char ch;
                        while ((ch = Peek ()) != chInit)
                        {
                                if (ch == EOL)
                                        throw new XPathException ("unmatched "+chInit+" in expression");
                                sb.Append (GetChar ());
                        }
                        GetChar ();     // consume the closing quote
                        m_objToken = sb.ToString ();
                        return Token.LITERAL;
                }

                // Reads a name: a letter or '_' followed by any run of letters,
                // digits, '_', '-' and '.'. Trailing whitespace is skipped.
                // Returns null (consuming nothing) if the next character
                // cannot start a name.
                private string ReadIdentifier ()
                {
                        StringBuilder sb = new StringBuilder ();

                        char ch = Peek ();
                        if (!Char.IsLetter (ch) && ch != '_')
                                return null;

                        sb.Append (GetChar ());

                        while ((ch = Peek ()) == '_' || ch == '-' || ch == '.' || Char.IsLetterOrDigit (ch))
                                sb.Append (GetChar ());

                        SkipWhitespace ();
                        return sb.ToString ();
                }

                // Reads a name and classifies it from its context as an axis
                // name, operator name, node type, function name or QName.
                // m_objToken is set to the raw name for reserved words and to
                // an XmlQualifiedName for QName / FUNCTION_NAME tokens.
                // Throws XPathException when the name is not valid in the
                // position where it appears.
                private int ParseIdentifier ()
                {
                        string strToken = ReadIdentifier ();
                        Object objToken = s_mapTokens [strToken];

                        int iToken = (objToken != null) ? (int) objToken : Token.QName;
                        m_objToken = strToken;

                        char ch = Peek ();
                        if (ch == ':')
                        {
                                if (Peek (1) == ':')
                                {
                                        // If the two characters following an NCName (possibly
                                        // after intervening ExprWhitespace) are ::, then the
                                        // token must be recognized as an AxisName.
                                        if (objToken == null || !IsAxisName (iToken))
                                                throw new XPathException ("invalid axis name: '"+strToken+"'");
                                        // The "::" itself is left for the next ParseToken () call.
                                        return iToken;
                                }

                                // Single ':' - strToken is a prefix.
                                GetChar ();
                                SkipWhitespace ();
                                ch = Peek ();

                                if (ch == '*')
                                {
                                        // "prefix:*" - empty local name, prefix passed
                                        // as the second constructor argument.
                                        GetChar ();
                                        m_objToken = new XmlQualifiedName ("", strToken);
                                        return Token.QName;
                                }
                                string strToken2 = ReadIdentifier ();
                                if (strToken2 == null)
                                        throw new XPathException ("invalid QName: "+strToken+":"+ch);

                                ch = Peek ();
                                m_objToken = new XmlQualifiedName (strToken2, strToken);
                                if (ch == '(')
                                        return Token.FUNCTION_NAME;
                                return Token.QName;
                        }

                        // If there is a preceding token and the preceding
                        // token is not one of @, ::, (, [, , or an Operator,
                        // then a * must be recognized as a MultiplyOperator
                        // and an NCName must be recognized as an OperatorName.
                        if (!IsFirstToken && !m_fPrevWasOperator)
                        {
                                if (objToken == null || !IsOperatorName (iToken))
                                        throw new XPathException ("invalid operator name: '"+strToken+"'");
                                return iToken;
                        }

                        if (ch == '(')
                        {
                                // If the character following an NCName (possibly
                                // after intervening ExprWhitespace) is (, then the
                                // token must be recognized as a NodeType or a FunctionName.
                                if (objToken == null)
                                {
                                        m_objToken = new XmlQualifiedName (strToken, "");
                                        return Token.FUNCTION_NAME;
                                }
                                if (IsNodeType (iToken))
                                        return iToken;
                                throw new XPathException ("invalid function name: '"+strToken+"'");
                        }

                        // Anything else is a plain, unprefixed name test.
                        m_objToken = new XmlQualifiedName (strToken, "");
                        return Token.QName;
                }

                private static bool IsWhitespace (char ch)
                {
                        // return Char.IsWhiteSpace (ch);
                        return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
                }

                private static bool IsDigit (char ch)
                {
                        // return Char.IsDigit (ch);
                        return ch >= '0' && ch <= '9';
                }


                // Reads one token starting at the current position and returns
                // its id. Sets m_fThisIsOperator for the tokens after which a
                // following name or '*' is not an operator, and m_objToken for
                // tokens that carry a value. Throws XPathException on a
                // character that cannot start a token.
                private int ParseToken ()
                {
                        char ch = Peek ();
                        switch (ch)
                        {
                                case EOL:
                                        return Token.EOF;

                                case '/':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '/')
                                        {
                                                GetChar ();
                                                return Token.SLASH2;
                                        }
                                        return Token.SLASH;

                                case '.':
                                        GetChar ();
                                        if (Peek () == '.')
                                        {
                                                GetChar ();
                                                return Token.DOT2;
                                        }
                                        else if (IsDigit (Peek ()))
                                        {
                                                // ".5" style number: hand the '.' back so
                                                // ParseNumber () sees the whole text.
                                                PutBack ();
                                                return ParseNumber ();
                                        }
                                        return Token.DOT;

                                case ':':
                                        GetChar ();
                                        if (Peek () == ':')
                                        {
                                                m_fThisIsOperator = true;
                                                GetChar ();
                                                return Token.COLON2;
                                        }
                                        // A lone ':' is reported to the parser as an error token.
                                        return Token.ERROR;

                                case ',':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.COMMA;

                                case '@':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.AT;

                                case '[':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.BRACKET_OPEN;

                                case ']':
                                        GetChar ();
                                        return Token.BRACKET_CLOSE;

                                case '(':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.PAREN_OPEN;

                                case ')':
                                        GetChar ();
                                        return Token.PAREN_CLOSE;

                                case '+':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.PLUS;

                                case '-':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.MINUS;

                                case '*':
                                        GetChar ();
                                        // Same context rule as for operator names: after an
                                        // operand '*' multiplies, otherwise it is a wildcard.
                                        if (!IsFirstToken && !m_fPrevWasOperator)
                                        {
                                                m_fThisIsOperator = true;
                                                return Token.MULTIPLY;
                                        }
                                        return Token.ASTERISK;

                                case '$':
                                        GetChar ();
                                        m_fThisIsOperator = true;
                                        return Token.DOLLAR;

                                case '|':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.BAR;

                                case '=':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.EQ;

                                case '!':
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                m_fThisIsOperator = true;
                                                GetChar ();
                                                return Token.NE;
                                        }
                                        // '!' on its own is not a token: fall out to the throw below.
                                        break;

                                case '>':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                GetChar ();
                                                return Token.GE;
                                        }
                                        return Token.GT;

                                case '<':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                GetChar ();
                                                return Token.LE;
                                        }
                                        return Token.LT;

                                case '\'':
                                case '\"':
                                        return ParseLiteral ();

                                default:
                                        if (IsDigit (ch))
                                        {
                                                return ParseNumber ();
                                        }
                                        else if (Char.IsLetter (ch) || ch == '_')        // NCName
                                        {
                                                int iToken = ParseIdentifier ();
                                                if (IsOperatorName (iToken))
                                                        m_fThisIsOperator = true;
                                                return iToken;
                                        }
                                        break;
                        }
                        throw new XPathException ("invalid token: '"+ch+"'");
                }

                ///////////////////////////
                // yyParser.yyInput methods
                ///////////////////////////

                /** move on to next token.
                  @return false if positioned beyond tokens.
                  @throws XPathException on a lexical error in the expression.
                  */
                public bool advance ()
                {
                        m_fThisIsOperator = false;
                        m_objToken = null;
                        m_iToken = ParseToken ();
                        SkipWhitespace ();
                        // Remember this token's context for classifying the next one.
                        m_iTokenPrev = m_iToken;
                        m_fPrevWasOperator = m_fThisIsOperator;
                        return (m_iToken != Token.EOF);
                }

                /** classifies current token.
                  Should not be called if advance() returned false.
                  @return current %token or single character.
                  */
                public int token ()
                {
                        return m_iToken;
                }

                /** associated with current token.
                  Should not be called if advance() returned false.
                  @return value for token().
                  */
                public Object value ()
                {
                        return m_objToken;
                }

                // True while no token has been recorded as "previous" yet.
                private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; } }

                // True for the node-type tokens (comment, text, processing-instruction, node).
                private static bool IsNodeType (int iToken)
                {
                        switch (iToken)
                        {
                                case Token.COMMENT:
                                case Token.TEXT:
                                case Token.PROCESSING_INSTRUCTION:
                                case Token.NODE:
                                        return true;
                                default:
                                        return false;
                        }
                }

                // True for the operator-name tokens (and, or, mod, div).
                private static bool IsOperatorName (int iToken)
                {
                        switch (iToken)
                        {
                                case Token.AND:
                                case Token.OR:
                                case Token.MOD:
                                case Token.DIV:
                                        return true;
                                default:
                                        return false;
                        }
                }

                // True for the axis-name tokens.
                private static bool IsAxisName (int iToken)
                {
                        switch (iToken)
                        {
                                case Token.ATTRIBUTE:
                                case Token.ANCESTOR:
                                case Token.ANCESTOR_OR_SELF:
                                case Token.CHILD:
                                case Token.DESCENDANT:
                                case Token.DESCENDANT_OR_SELF:
                                case Token.FOLLOWING:
                                case Token.FOLLOWING_SIBLING:
                                case Token.NAMESPACE:
                                case Token.PARENT:
                                case Token.PRECEDING:
                                case Token.PRECEDING_SIBLING:
                                case Token.SELF:
                                        return true;
                                default:
                                        return false;
                        }
                }
        }
}