2005-01-31 Zoltan Varga <vargaz@freemail.hu>
[mono.git] / mcs / class / System.XML / System.Xml.XPath / Tokenizer.cs
1 //
2 // System.Xml.XPath.Tokenizer
3 //
4 // Author:
5 //   Piers Haken (piersh@friskit.com)
6 //
7 // (C) 2002 Piers Haken
8 //
9
10 //
11 // Permission is hereby granted, free of charge, to any person obtaining
12 // a copy of this software and associated documentation files (the
13 // "Software"), to deal in the Software without restriction, including
14 // without limitation the rights to use, copy, modify, merge, publish,
15 // distribute, sublicense, and/or sell copies of the Software, and to
16 // permit persons to whom the Software is furnished to do so, subject to
17 // the following conditions:
18 // 
19 // The above copyright notice and this permission notice shall be
20 // included in all copies or substantial portions of the Software.
21 // 
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
26 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
27 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
28 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 //
30 using System;
31 using System.Globalization;
32 using System.IO;
33 using System.Text;
34 using System.Collections;
35 using Mono.Xml.XPath;
36 using Mono.Xml.XPath.yyParser;
37
38 namespace System.Xml.XPath
39 {
40         internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
41         {
42                 private string m_rgchInput;
43                 private int m_ich;
44                 private int m_cch;
45                 private int m_iToken;
46                 private int m_iTokenPrev = Token.EOF;
47                 private Object m_objToken;
48                 private bool m_fPrevWasOperator = false;
49                 private bool m_fThisIsOperator = false;
50                 private static readonly Hashtable s_mapTokens = new Hashtable ();
51                 private static readonly Object [] s_rgTokenMap =
52                 {
53                    Token.AND, "and",
54                    Token.OR, "or",
55                    Token.DIV, "div",
56                    Token.MOD, "mod",
57                    Token.ANCESTOR, "ancestor",
58                    Token.ANCESTOR_OR_SELF, "ancestor-or-self",
59                    Token.ATTRIBUTE, "attribute",
60                    Token.CHILD, "child",
61                    Token.DESCENDANT, "descendant",
62                    Token.DESCENDANT_OR_SELF, "descendant-or-self",
63                    Token.FOLLOWING, "following",
64                    Token.FOLLOWING_SIBLING, "following-sibling",
65                    Token.NAMESPACE, "namespace",
66                    Token.PARENT, "parent",
67                    Token.PRECEDING, "preceding",
68                    Token.PRECEDING_SIBLING, "preceding-sibling",
69                    Token.SELF, "self",
70                    Token.COMMENT, "comment",
71                    Token.TEXT, "text",
72                    Token.PROCESSING_INSTRUCTION, "processing-instruction",
73                    Token.NODE, "node",
74                 };
75                 private const char EOL = '\0';
76
77                 static Tokenizer ()
78                 {
79                         for (int i = 0; i < s_rgTokenMap.Length; i += 2)
80                                 s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);
81                 }
82
83                 public Tokenizer (string strInput)
84                 {
85                         //Console.WriteLine ("Tokenizing: " + strInput);
86                         m_rgchInput = strInput;
87                         m_ich = 0;
88                         m_cch = strInput.Length;
89                         SkipWhitespace ();
90                 }
91
92                 private char Peek (int iOffset)
93                 {
94                         if (m_ich + iOffset>= m_cch)
95                                 return EOL;
96                         return m_rgchInput [m_ich + iOffset];
97                 }
98
99                 private char Peek ()
100                 {
101                         return Peek (0);
102                 }
103
104                 private char GetChar ()
105                 {
106                         if (m_ich >= m_cch)
107                                 return EOL;
108                         return m_rgchInput [m_ich++];
109                 }
110
111                 private char PutBack ()
112                 {
113                         if (m_ich == 0)
114                                 throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
115                         return m_rgchInput [--m_ich];
116                 }
117
118                 private bool SkipWhitespace ()  // returns trus if any whitespace was skipped
119                 {
120                         if (!IsWhitespace (Peek ()))
121                                 return false;
122                                         
123                         while (IsWhitespace (Peek ()))
124                                 GetChar ();
125
126                         return true;
127                 }
128
129                 private int ParseNumber ()
130                 {
131                         StringBuilder sb = new StringBuilder ();
132
133                         while (IsDigit (Peek ()))
134                                 sb.Append ((char) GetChar ());
135
136                         // don't handle '3.' as an error case (it is not. XPath 3.7 syntax [30])
137                         if (Peek () == '.')
138                         {
139                                 sb.Append ((char) GetChar ());
140                                 while (IsDigit (Peek ()))
141                                         sb.Append ((char) GetChar ());
142                         }
143                         m_objToken = Double.Parse (sb.ToString (), NumberFormatInfo.InvariantInfo);
144                         return Token.NUMBER;
145                 }
146
147                 private int ParseLiteral ()
148                 {
149                         StringBuilder sb = new StringBuilder ();
150
151                         char chInit = GetChar ();
152                         char ch;
153                         while ((ch = Peek ()) != chInit)
154                         {
155                                 if (ch == EOL)
156                                         throw new XPathException ("unmatched "+chInit+" in expression");
157                                 sb.Append ((char) GetChar ());
158                         }
159                         GetChar ();
160                         m_objToken = sb.ToString ();
161                         return Token.LITERAL;
162                 }
163
164                 private string ReadIdentifier ()
165                 {
166                         StringBuilder sb = new StringBuilder ();
167
168                         char ch = Peek ();
169                         if (!Char.IsLetter (ch) && ch != '_')
170                                 return null;
171
172                         sb.Append ((char) GetChar ());
173
174                         while ((ch = Peek ()) == '_' || ch == '-' || ch == '.' || Char.IsLetterOrDigit (ch))
175                                 sb.Append ((char) GetChar ());
176
177                         SkipWhitespace ();
178                         return sb.ToString ();
179                 }
180
181                 private int ParseIdentifier ()
182                 {
183                         string strToken = ReadIdentifier ();
184                         Object objToken = s_mapTokens [strToken];
185
186                         int iToken = (objToken != null) ? (int) objToken : Token.QName;
187                         m_objToken = strToken;
188
189                         char ch = Peek ();
190                         if (ch == ':')
191                         {
192                                 if (Peek (1) == ':')
193                                 {
194                                         // If the two characters following an NCName (possibly
195                                         // after intervening ExprWhitespace) are ::, then the
196                                         // token must be recognized as an AxisName.
197                                         if (objToken == null || !IsAxisName (iToken))
198                                                 throw new XPathException ("invalid axis name: '"+strToken+"'");
199                                         return iToken;
200                                 }
201
202                                 GetChar ();
203                                 SkipWhitespace ();
204                                 ch = Peek ();
205
206                                 if (ch == '*')
207                                 {
208                                         GetChar ();
209                                         m_objToken = new XmlQualifiedName ("", strToken);
210                                         return Token.QName;
211                                 }
212                                 string strToken2 = ReadIdentifier ();
213                                 if (strToken2 == null)
214                                         throw new XPathException ("invalid QName: "+strToken+":"+(char)ch);
215
216                                 ch = Peek ();
217                                 m_objToken = new XmlQualifiedName (strToken2, strToken);
218                                 if (ch == '(')
219                                         return Token.FUNCTION_NAME;
220                                 return Token.QName;
221                         }
222
223                         // If there is a preceding token and the preceding
224                         // token is not one of @, ::, (, [, , or an Operator,
225                         // then a * must be recognized as a MultiplyOperator
226                         // and an NCName must be recognized as an OperatorName.
227                         if (!IsFirstToken && !m_fPrevWasOperator)
228                         {
229                                 if (objToken == null || !IsOperatorName (iToken))
230                                         throw new XPathException ("invalid operator name: '"+strToken+"'");
231                                 return iToken;
232                         }
233
234                         if (ch == '(')
235                         {
236                                 // If the character following an NCName (possibly
237                                 // after intervening ExprWhitespace) is (, then the
238                                 // token must be recognized as a NodeType or a FunctionName.
239                                 if (objToken == null)
240                                 {
241                                         m_objToken = new XmlQualifiedName (strToken, "");
242                                         return Token.FUNCTION_NAME;
243                                 }
244                                 if (IsNodeType (iToken))
245                                         return iToken;
246                                 throw new XPathException ("invalid function name: '"+strToken+"'");
247                         }
248
249                         m_objToken = new XmlQualifiedName (strToken, "");
250                         return Token.QName;
251                 }
252
253                 private static bool IsWhitespace (char ch)
254                 {
255                         // return Char.IsWhiteSpace (ch);
256                         return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
257                 }
258
259                 private static bool IsDigit (char ch)
260                 {
261                         // return Char.IsDigit (ch);
262                         return ch >= '0' && ch <= '9';
263                 }
264
265
266                 int ParseToken ()
267                 {
268                         char ch = Peek ();
269                         switch (ch)
270                         {
271                                 case EOL:
272                                         return Token.EOF;
273
274                                 case '/':
275                                         m_fThisIsOperator = true;
276                                         GetChar ();
277                                         if (Peek () == '/')
278                                         {
279                                                 GetChar ();
280                                                 return Token.SLASH2;
281                                         }
282                                         return Token.SLASH;
283
284                                 case '.':
285                                         GetChar ();
286                                         if (Peek () == '.')
287                                         {
288                                                 GetChar ();
289                                                 return Token.DOT2;
290                                         }
291                                         else if (IsDigit (Peek ()))
292                                         {
293                                                 PutBack ();
294                                                 return ParseNumber ();
295                                         }
296                                         return Token.DOT;
297
298                                 case ':':
299                                         GetChar ();
300                                         if (Peek () == ':')
301                                         {
302                                                 m_fThisIsOperator = true;
303                                                 GetChar ();
304                                                 return Token.COLON2;
305                                         }
306                                         return Token.ERROR;
307
308                                 case ',':
309                                         m_fThisIsOperator = true;
310                                         GetChar ();
311                                         return Token.COMMA;
312
313                                 case '@':
314                                         m_fThisIsOperator = true;
315                                         GetChar ();
316                                         return Token.AT;
317
318                                 case '[':
319                                         m_fThisIsOperator = true;
320                                         GetChar ();
321                                         return Token.BRACKET_OPEN;
322
323                                 case ']':
324                                         GetChar ();
325                                         return Token.BRACKET_CLOSE;
326
327                                 case '(':
328                                         m_fThisIsOperator = true;
329                                         GetChar ();
330                                         return Token.PAREN_OPEN;
331
332                                 case ')':
333                                         GetChar ();
334                                         return Token.PAREN_CLOSE;
335
336                                 case '+':
337                                         m_fThisIsOperator = true;
338                                         GetChar ();
339                                         return Token.PLUS;
340
341                                 case '-':
342                                         m_fThisIsOperator = true;
343                                         GetChar ();
344                                         return Token.MINUS;
345
346                                 case '*':
347                                         GetChar ();
348                                         if (!IsFirstToken && !m_fPrevWasOperator)
349                                         {
350                                                 m_fThisIsOperator = true;
351                                                 return Token.MULTIPLY;
352                                         }
353                                         return Token.ASTERISK;
354
355                                 case '$':
356                                         GetChar ();
357                                         m_fThisIsOperator = true;
358                                         return Token.DOLLAR;
359
360                                 case '|':
361                                         m_fThisIsOperator = true;
362                                         GetChar ();
363                                         return Token.BAR;
364
365                                 case '=':
366                                         m_fThisIsOperator = true;
367                                         GetChar ();
368                                         return Token.EQ;
369
370                                 case '!':
371                                         GetChar ();
372                                         if (Peek () == '=')
373                                         {
374                                                 m_fThisIsOperator = true;
375                                                 GetChar ();
376                                                 return Token.NE;
377                                         }
378                                         break;
379
380                                 case '>':
381                                         m_fThisIsOperator = true;
382                                         GetChar ();
383                                         if (Peek () == '=')
384                                         {
385                                                 GetChar ();
386                                                 return Token.GE;
387                                         }
388                                         return Token.GT;
389
390                                 case '<':
391                                         m_fThisIsOperator = true;
392                                         GetChar ();
393                                         if (Peek () == '=')
394                                         {
395                                                 GetChar ();
396                                                 return Token.LE;
397                                         }
398                                         return Token.LT;
399
400                                 case '\'':
401                                         return ParseLiteral ();
402
403                                 case '\"':
404                                         return ParseLiteral ();
405
406                                 default:
407                                         if (IsDigit (ch))
408                                         {
409                                                 return ParseNumber ();
410                                         }
411                                         else if (Char.IsLetter (ch) || ch == '_')        // NCName
412                                         {
413                                                 int iToken = ParseIdentifier ();
414                                                 if (IsOperatorName (iToken))
415                                                         m_fThisIsOperator = true;
416                                                 return iToken;
417                                         }
418                                         break;
419                         }
420                         throw new XPathException ("invalid token: '"+ch+"'");
421                 }
422
423                 ///////////////////////////
424                 // yyParser.yyInput methods
425                 ///////////////////////////
426
427                 /** move on to next token.
428                   @return false if positioned beyond tokens.
429                   @throws IOException on input error.
430                   */
431                 public bool advance ()
432                 {
433                         m_fThisIsOperator = false;
434                         m_objToken = null;
435                         m_iToken = ParseToken ();
436                         SkipWhitespace ();
437                         m_iTokenPrev = m_iToken;
438                         m_fPrevWasOperator = m_fThisIsOperator;
439                         return (m_iToken != Token.EOF);
440                 }
441
442                 /** classifies current token.
443                   Should not be called if advance() returned false.
444                   @return current %token or single character.
445                   */
446                 public int token ()
447                 {
448                         return m_iToken;
449                 }
450
451                 /** associated with current token.
452                   Should not be called if advance() returned false.
453                   @return value for token().
454                   */
455                 public Object value ()
456                 {
457                         return m_objToken;
458                 }
459                 private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; } }
460
461                 private bool IsNodeType (int iToken)
462                 {
463                         switch (iToken)
464                         {
465                                 case Token.COMMENT:
466                                 case Token.TEXT:
467                                 case Token.PROCESSING_INSTRUCTION:
468                                 case Token.NODE:
469                                         return true;
470                                 default:
471                                         return false;
472                         }
473                 }
474                 private bool IsOperatorName (int iToken)
475                 {
476                         switch (iToken)
477                         {
478                                 case Token.AND:
479                                 case Token.OR:
480                                 case Token.MOD:
481                                 case Token.DIV:
482                                         return true;
483                                 default:
484                                         return false;
485                         }
486                 }
487                 private bool IsAxisName (int iToken)
488                 {
489                         switch (iToken)
490                         {
491                                 case Token.ATTRIBUTE:
492                                 case Token.ANCESTOR:
493                                 case Token.ANCESTOR_OR_SELF:
494                                 case Token.CHILD:
495                                 case Token.DESCENDANT:
496                                 case Token.DESCENDANT_OR_SELF:
497                                 case Token.FOLLOWING:
498                                 case Token.FOLLOWING_SIBLING:
499                                 case Token.NAMESPACE:
500                                 case Token.PARENT:
501                                 case Token.PRECEDING:
502                                 case Token.PRECEDING_SIBLING:
503                                 case Token.SELF:
504                                         return true;
505                                 default:
506                                         return false;
507                         }
508                 }
509         }
510 }