TARGET_JVM: refer nunit from mono tree
[mono.git] / mcs / class / System.XML / System.Xml.XPath / Tokenizer.cs
1 //
2 // System.Xml.XPath.Tokenizer.cs / Mono.Xml.Xsl/PatternTokenizer.cs
3 //
4 // Author:
5 //   Piers Haken (piersh@friskit.com)
6 //   Atsushi Enomoto (atsushi@ximian.com)
7 //
8 // (C) 2002 Piers Haken
9 // (C) 2005 Novell Inc,
10 //
11 // IMPORTANT:
12 //
13 // Do not edit PatternTokenizer.cs. It is autogenerated.
14 //
15
16 //
17 // Permission is hereby granted, free of charge, to any person obtaining
18 // a copy of this software and associated documentation files (the
19 // "Software"), to deal in the Software without restriction, including
20 // without limitation the rights to use, copy, modify, merge, publish,
21 // distribute, sublicense, and/or sell copies of the Software, and to
22 // permit persons to whom the Software is furnished to do so, subject to
23 // the following conditions:
24 // 
25 // The above copyright notice and this permission notice shall be
26 // included in all copies or substantial portions of the Software.
27 // 
28 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
29 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
30 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
31 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
32 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
33 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
34 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35 //
36 using System;
37 using System.Globalization;
38 using System.IO;
39 using System.Text;
40 using System.Collections;
41 using System.Xml;
42 using System.Xml.XPath;
43 using Mono.Xml.XPath;
44
#if XSLT_PATTERN
namespace Mono.Xml.Xsl
#else
namespace Mono.Xml.XPath
#endif
{
        /// <summary>
        /// Lexer that splits an XPath expression string into tokens and hands
        /// them to the parser through the yyParser.yyInput interface
        /// (advance / token / value).
        /// </summary>
        /// <remarks>
        /// Names and '*' are disambiguated with a one-token look-behind
        /// (see m_fPrevWasOperator), following the lexical rules quoted in the
        /// comments of ParseIdentifier. The character '\0' doubles as the
        /// end-of-input sentinel, so a NUL embedded in the input is
        /// indistinguishable from the end of the expression.
        /// </remarks>
        internal class Tokenizer : yyParser.yyInput
        {
                // Expression text, read cursor and total length.
                private string m_rgchInput;
                private int m_ich;
                private int m_cch;

                // Current token, the token returned before it (Token.EOF until
                // the first advance() completes) and the current token's value.
                private int m_iToken;
                private int m_iTokenPrev = Token.EOF;
                private Object m_objToken;

                // m_fThisIsOperator is raised by ParseToken for tokens after which
                // a name or '*' must NOT be read as an operator; advance() copies
                // it into m_fPrevWasOperator for the next token.
                private bool m_fPrevWasOperator = false;
                private bool m_fThisIsOperator = false;

                // Reserved word -> token id, filled once by the static constructor
                // from the (token, text) pairs below.
                private static readonly Hashtable s_mapTokens = new Hashtable ();
                private static readonly Object [] s_rgTokenMap =
                {
                   Token.AND, "and",
                   Token.OR, "or",
                   Token.DIV, "div",
                   Token.MOD, "mod",
                   Token.ANCESTOR, "ancestor",
                   Token.ANCESTOR_OR_SELF, "ancestor-or-self",
                   Token.ATTRIBUTE, "attribute",
                   Token.CHILD, "child",
                   Token.DESCENDANT, "descendant",
                   Token.DESCENDANT_OR_SELF, "descendant-or-self",
                   Token.FOLLOWING, "following",
                   Token.FOLLOWING_SIBLING, "following-sibling",
                   Token.NAMESPACE, "namespace",
                   Token.PARENT, "parent",
                   Token.PRECEDING, "preceding",
                   Token.PRECEDING_SIBLING, "preceding-sibling",
                   Token.SELF, "self",
                   Token.COMMENT, "comment",
                   Token.TEXT, "text",
                   Token.PROCESSING_INSTRUCTION, "processing-instruction",
                   Token.NODE, "node",
                };

                // Sentinel returned by Peek/GetChar once the input is exhausted.
                private const char EOL = '\0';

                static Tokenizer ()
                {
                        // s_rgTokenMap is laid out as (token, text) pairs; index by text.
                        for (int i = 0; i < s_rgTokenMap.Length; i += 2)
                                s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);
                }

                /// <summary>
                /// Creates a tokenizer positioned on the first non-whitespace
                /// character of <paramref name="strInput"/>.
                /// </summary>
                /// <param name="strInput">The expression text to tokenize.</param>
                /// <exception cref="ArgumentNullException">
                /// <paramref name="strInput"/> is null.
                /// </exception>
                public Tokenizer (string strInput)
                {
                        // Fail with a descriptive exception instead of the
                        // NullReferenceException that strInput.Length would raise.
                        if (strInput == null)
                                throw new ArgumentNullException ("strInput");

                        m_rgchInput = strInput;
                        m_ich = 0;
                        m_cch = strInput.Length;
                        SkipWhitespace ();
                }

                /// <summary>
                /// Returns the character <paramref name="iOffset"/> positions away
                /// from the cursor without consuming it, or EOL when that position
                /// lies outside the input.
                /// </summary>
                private char Peek (int iOffset)
                {
                        int ich = m_ich + iOffset;
                        if (ich < 0 || ich >= m_cch)
                                return EOL;
                        return m_rgchInput [ich];
                }

                /// <summary>Returns the character under the cursor, or EOL at end of input.</summary>
                private char Peek ()
                {
                        return Peek (0);
                }

                /// <summary>
                /// Consumes and returns the character under the cursor; returns EOL
                /// (without moving) at end of input.
                /// </summary>
                private char GetChar ()
                {
                        if (m_ich >= m_cch)
                                return EOL;
                        return m_rgchInput [m_ich++];
                }

                /// <summary>
                /// Moves the cursor back by one character and returns that character.
                /// </summary>
                /// <exception cref="XPathException">The cursor is already at the start.</exception>
                private char PutBack ()
                {
                        if (m_ich == 0)
                                throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
                        return m_rgchInput [--m_ich];
                }

                /// <summary>
                /// Advances the cursor past any run of whitespace.
                /// </summary>
                /// <returns>true if any whitespace was skipped.</returns>
                private bool SkipWhitespace ()
                {
                        int ichStart = m_ich;
                        while (IsWhitespace (Peek ()))
                                GetChar ();
                        return m_ich != ichStart;
                }

                /// <summary>
                /// Reads digits with an optional fractional part and stores the
                /// value as a boxed double in the token value.
                /// </summary>
                /// <returns>Token.NUMBER.</returns>
                private int ParseNumber ()
                {
                        StringBuilder sb = new StringBuilder ();

                        while (IsDigit (Peek ()))
                                sb.Append (GetChar ());

                        // '3.' is not an error case: XPath 3.7, production [30] allows
                        // a trailing dot with no fraction digits.
                        if (Peek () == '.')
                        {
                                sb.Append (GetChar ());
                                while (IsDigit (Peek ()))
                                        sb.Append (GetChar ());
                        }

                        try
                        {
                                m_objToken = Double.Parse (sb.ToString (), NumberFormatInfo.InvariantInfo);
                        }
                        catch (OverflowException)
                        {
                                // Some runtimes throw for digit runs beyond the double
                                // range while others already return infinity; normalise
                                // to infinity so a long literal never leaks a
                                // non-XPath exception to the caller.
                                m_objToken = Double.PositiveInfinity;
                        }
                        return Token.NUMBER;
                }

                /// <summary>
                /// Reads a quoted string. The cursor must be on the opening quote;
                /// the same character terminates the literal. The text between the
                /// quotes becomes the token value.
                /// </summary>
                /// <returns>Token.LITERAL.</returns>
                /// <exception cref="XPathException">The closing quote is missing.</exception>
                private int ParseLiteral ()
                {
                        StringBuilder sb = new StringBuilder ();

                        char chInit = GetChar ();
                        char ch;
                        while ((ch = Peek ()) != chInit)
                        {
                                if (ch == EOL)
                                        throw new XPathException ("unmatched "+chInit+" in expression");
                                sb.Append (GetChar ());
                        }
                        GetChar ();     // closing quote
                        m_objToken = sb.ToString ();
                        return Token.LITERAL;
                }

                /// <summary>
                /// Reads a name that starts with a letter or '_' and continues with
                /// letters, digits, '_', '-' or '.', then skips trailing whitespace.
                /// </summary>
                /// <returns>
                /// The name, or null (cursor untouched) if the current character
                /// cannot start a name.
                /// </returns>
                private string ReadIdentifier ()
                {
                        StringBuilder sb = new StringBuilder ();

                        char ch = Peek ();
                        if (!Char.IsLetter (ch) && ch != '_')
                                return null;

                        sb.Append (GetChar ());

                        while ((ch = Peek ()) == '_' || ch == '-' || ch == '.' || Char.IsLetterOrDigit (ch))
                                sb.Append (GetChar ());

                        // Callers inspect the character after the name "possibly after
                        // intervening ExprWhitespace", so consume that whitespace here.
                        SkipWhitespace ();
                        return sb.ToString ();
                }

                /// <summary>
                /// Reads a name and classifies it as an axis name, operator name,
                /// node type, function name or QName, setting the token value
                /// accordingly.
                /// </summary>
                /// <returns>The token id for the name.</returns>
                /// <exception cref="XPathException">
                /// The name is not valid in the position where it appears.
                /// </exception>
                private int ParseIdentifier ()
                {
                        string strToken = ReadIdentifier ();
                        Object objToken = s_mapTokens [strToken];

                        // Non-reserved names default to QName.
                        int iToken = (objToken != null) ? (int) objToken : Token.QName;
                        m_objToken = strToken;

                        char ch = Peek ();
                        if (ch == ':')
                        {
                                if (Peek (1) == ':')
                                {
                                        // If the two characters following an NCName (possibly
                                        // after intervening ExprWhitespace) are ::, then the
                                        // token must be recognized as an AxisName.
                                        // The '::' itself is left for the next ParseToken call.
                                        if (objToken == null || !IsAxisName (iToken))
                                                throw new XPathException ("invalid axis name: '"+strToken+"'");
                                        return iToken;
                                }

                                // Single ':' => strToken is a prefix; read the local part.
                                GetChar ();
                                SkipWhitespace ();
                                ch = Peek ();

                                if (ch == '*')
                                {
                                        // prefix:* -- empty local name, the prefix goes into
                                        // the namespace slot of the XmlQualifiedName.
                                        GetChar ();
                                        m_objToken = new XmlQualifiedName ("", strToken);
                                        return Token.QName;
                                }
                                string strToken2 = ReadIdentifier ();
                                if (strToken2 == null)
                                        throw new XPathException ("invalid QName: "+strToken+":"+ch);

                                ch = Peek ();
                                // As above: (local name, prefix).
                                m_objToken = new XmlQualifiedName (strToken2, strToken);
                                if (ch == '(')
                                        return Token.FUNCTION_NAME;
                                return Token.QName;
                        }

                        // If there is a preceding token and the preceding
                        // token is not one of @, ::, (, [, , or an Operator,
                        // then a * must be recognized as a MultiplyOperator
                        // and an NCName must be recognized as an OperatorName.
                        if (!IsFirstToken && !m_fPrevWasOperator)
                        {
                                if (objToken == null || !IsOperatorName (iToken))
                                        throw new XPathException ("invalid operator name: '"+strToken+"'");
                                return iToken;
                        }

                        if (ch == '(')
                        {
                                // If the character following an NCName (possibly
                                // after intervening ExprWhitespace) is (, then the
                                // token must be recognized as a NodeType or a FunctionName.
                                if (objToken == null)
                                {
                                        m_objToken = new XmlQualifiedName (strToken, "");
                                        return Token.FUNCTION_NAME;
                                }
                                if (IsNodeType (iToken))
                                        return iToken;
                                // Reserved words other than node types are rejected here.
                                throw new XPathException ("invalid function name: '"+strToken+"'");
                        }

                        // Anything else, reserved words included, is a plain name.
                        m_objToken = new XmlQualifiedName (strToken, "");
                        return Token.QName;
                }

                /// <summary>True for space, tab, line feed and carriage return.</summary>
                private static bool IsWhitespace (char ch)
                {
                        return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
                }

                /// <summary>True for the ASCII digits '0' through '9' only.</summary>
                private static bool IsDigit (char ch)
                {
                        return ch >= '0' && ch <= '9';
                }

                /// <summary>
                /// Reads the token that starts at the cursor. Raises
                /// m_fThisIsOperator for tokens after which a name or '*' is not
                /// an operator.
                /// </summary>
                /// <returns>The token id; Token.EOF at end of input.</returns>
                /// <exception cref="XPathException">
                /// The character at the cursor cannot start a token.
                /// </exception>
                int ParseToken ()
                {
                        char ch = Peek ();
                        switch (ch)
                        {
                                case EOL:
                                        return Token.EOF;

                                case '/':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '/')
                                        {
                                                GetChar ();
                                                return Token.SLASH2;
                                        }
                                        return Token.SLASH;

                                case '.':
                                        GetChar ();
                                        if (Peek () == '.')
                                        {
                                                GetChar ();
                                                return Token.DOT2;
                                        }
                                        else if (IsDigit (Peek ()))
                                        {
                                                // '.5' style number: give the dot back so
                                                // ParseNumber sees the whole literal.
                                                PutBack ();
                                                return ParseNumber ();
                                        }
                                        return Token.DOT;

                                case ':':
                                        GetChar ();
                                        if (Peek () == ':')
                                        {
                                                m_fThisIsOperator = true;
                                                GetChar ();
                                                return Token.COLON2;
                                        }
                                        // A lone ':' is reported as an error token, not thrown.
                                        return Token.ERROR;

                                case ',':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.COMMA;

                                case '@':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.AT;

                                case '[':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.BRACKET_OPEN;

                                case ']':
                                        GetChar ();
                                        return Token.BRACKET_CLOSE;

                                case '(':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.PAREN_OPEN;

                                case ')':
                                        GetChar ();
                                        return Token.PAREN_CLOSE;

                                case '+':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.PLUS;

                                case '-':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.MINUS;

                                case '*':
                                        GetChar ();
                                        // Same look-behind rule as for operator names:
                                        // after an operand '*' multiplies, otherwise it
                                        // is the wildcard name test.
                                        if (!IsFirstToken && !m_fPrevWasOperator)
                                        {
                                                m_fThisIsOperator = true;
                                                return Token.MULTIPLY;
                                        }
                                        return Token.ASTERISK;

                                case '$':
                                        GetChar ();
                                        m_fThisIsOperator = true;
                                        return Token.DOLLAR;

                                case '|':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.BAR;

                                case '=':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        return Token.EQ;

                                case '!':
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                m_fThisIsOperator = true;
                                                GetChar ();
                                                return Token.NE;
                                        }
                                        // '!' on its own falls through to the error below.
                                        break;

                                case '>':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                GetChar ();
                                                return Token.GE;
                                        }
                                        return Token.GT;

                                case '<':
                                        m_fThisIsOperator = true;
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                GetChar ();
                                                return Token.LE;
                                        }
                                        return Token.LT;

                                case '\'':
                                case '\"':
                                        return ParseLiteral ();

                                default:
                                        if (IsDigit (ch))
                                        {
                                                return ParseNumber ();
                                        }
                                        else if (Char.IsLetter (ch) || ch == '_')        // NCName
                                        {
                                                int iToken = ParseIdentifier ();
                                                if (IsOperatorName (iToken))
                                                        m_fThisIsOperator = true;
                                                return iToken;
                                        }
                                        break;
                        }
                        throw new XPathException ("invalid token: '"+ch+"'");
                }

                ///////////////////////////
                // yyParser.yyInput methods
                ///////////////////////////

                /** move on to next token.
                  @return false if positioned beyond tokens.
                  @throws XPathException if the input cannot be tokenized.
                  */
                public bool advance ()
                {
                        m_fThisIsOperator = false;
                        m_objToken = null;
                        m_iToken = ParseToken ();
                        SkipWhitespace ();
                        // Remember this token for the look-behind rules applied to
                        // the next one.
                        m_iTokenPrev = m_iToken;
                        m_fPrevWasOperator = m_fThisIsOperator;
                        return (m_iToken != Token.EOF);
                }

                /** classifies current token.
                  Should not be called if advance() returned false.
                  @return current %token or single character.
                  */
                public int token ()
                {
                        return m_iToken;
                }

                /** associated with current token.
                  Should not be called if advance() returned false.
                  @return value for token().
                  */
                public Object value ()
                {
                        return m_objToken;
                }

                // True until the first advance() has recorded a token. Note that it
                // is also true again once Token.EOF itself has been recorded.
                private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; } }

                /// <summary>True for the comment, text, processing-instruction and node tokens.</summary>
                private static bool IsNodeType (int iToken)
                {
                        switch (iToken)
                        {
                                case Token.COMMENT:
                                case Token.TEXT:
                                case Token.PROCESSING_INSTRUCTION:
                                case Token.NODE:
                                        return true;
                                default:
                                        return false;
                        }
                }

                /// <summary>True for the and, or, mod and div tokens.</summary>
                private static bool IsOperatorName (int iToken)
                {
                        switch (iToken)
                        {
                                case Token.AND:
                                case Token.OR:
                                case Token.MOD:
                                case Token.DIV:
                                        return true;
                                default:
                                        return false;
                        }
                }

                /// <summary>True for the thirteen axis tokens (attribute, ancestor, ..., self).</summary>
                private static bool IsAxisName (int iToken)
                {
                        switch (iToken)
                        {
                                case Token.ATTRIBUTE:
                                case Token.ANCESTOR:
                                case Token.ANCESTOR_OR_SELF:
                                case Token.CHILD:
                                case Token.DESCENDANT:
                                case Token.DESCENDANT_OR_SELF:
                                case Token.FOLLOWING:
                                case Token.FOLLOWING_SIBLING:
                                case Token.NAMESPACE:
                                case Token.PARENT:
                                case Token.PRECEDING:
                                case Token.PRECEDING_SIBLING:
                                case Token.SELF:
                                        return true;
                                default:
                                        return false;
                        }
                }
        }
}