2003-02-16 Atsushi Enomoto <ginga@kit.hi-ho.ne.jp>
[mono.git] / mcs / class / System.XML / System.Xml.XPath / Tokenizer.cs
1 //
2 // System.Xml.XPath.Tokenizer
3 //
4 // Author:
5 //   Piers Haken (piersh@friskit.com)
6 //
7 // (C) 2002 Piers Haken
8 //
9 using System;
10 using System.IO;
11 using System.Text;
12 using System.Collections;
13 using Mono.Xml.XPath;
14 using Mono.Xml.XPath.yyParser;
15
namespace System.Xml.XPath
{
        // Hand-written lexer for XPath expressions.  It walks the expression
        // character by character and hands tokens to the generated parser
        // through the yyParser.yyInput interface (advance / token / value).
        internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
        {
                private readonly char [] m_rgchInput;   // characters of the expression being scanned
                private int m_ich;                      // index of the next unread character
                private readonly int m_cch;             // number of characters in the expression
                private int m_iToken;                   // token code produced by the last advance ()
                private Object m_objToken;              // value attached to the current token, or null
                private bool m_fPrevWasSpecial = false; // computed by advance (), read by ParseIdentifier ()

                // Keyword text -> token code, filled from s_rgTokenMap by the static constructor.
                private static readonly Hashtable s_mapTokens = new Hashtable ();

                // Flat list of (token code, keyword text) pairs.
                private static readonly Object [] s_rgTokenMap =
                {
                        Token.AND, "and",
                        Token.OR, "or",
                        Token.DIV, "div",
                        Token.MOD, "mod",
                        Token.ANCESTOR, "ancestor",
                        Token.ANCESTOR_OR_SELF, "ancestor-or-self",
                        Token.ATTRIBUTE, "attribute",
                        Token.CHILD, "child",
                        Token.DESCENDANT, "descendant",
                        Token.DESCENDANT_OR_SELF, "descendant-or-self",
                        Token.FOLLOWING, "following",
                        Token.FOLLOWING_SIBLING, "following-sibling",
                        Token.NAMESPACE, "namespace",
                        Token.PARENT, "parent",
                        Token.PRECEDING, "preceding",
                        Token.PRECEDING_SIBLING, "preceding-sibling",
                        Token.SELF, "self",
                        Token.COMMENT, "comment",
                        Token.TEXT, "text",
                        Token.PROCESSING_INSTRUCTION, "processing-instruction",
                        Token.NODE, "node",
                };

                // Set of token codes listed in s_rgfPrevWasSpecial (keys only; values are null).
                private static readonly Hashtable s_mapfPrevWasSpecial = new Hashtable ();

                // Tokens after which a keyword-looking identifier is not
                // immediately returned as that keyword (see ParseIdentifier).
                private static readonly int [] s_rgfPrevWasSpecial =
                {
                        Token.AT,
                        Token.COLON2,
                        Token.PAREN_OPEN,
                        Token.BRACKET_OPEN,
                        Token.COMMA,

                        Token.AND,
                        Token.OR,
                        Token.DIV,
                        Token.MOD,

                        Token.SLASH,
                        Token.SLASH2,
                        Token.BAR,
                        Token.PLUS,
                        Token.MINUS,
                        Token.EQ,
                        Token.NE,
                        Token.LE,
                        Token.LT,
                        Token.GE,
                        Token.GT,

                        Token.ASTERISK,
                };

                // Sentinel returned by Peek / GetChar once the input is exhausted.
                // A literal NUL character inside the expression is therefore
                // indistinguishable from end of input.
                private const char EOL = '\0';

                // Builds the two lookup tables from their flat array form.
                static Tokenizer ()
                {
                        for (int i = 0; i < s_rgTokenMap.Length; i += 2)
                                s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);

                        for (int i = 0; i < s_rgfPrevWasSpecial.Length; i++)
                                s_mapfPrevWasSpecial.Add (s_rgfPrevWasSpecial [i], null);
                }

                // Creates a tokenizer over strInput and positions it on the
                // first non-whitespace character.
                // Throws ArgumentNullException when strInput is null.
                public Tokenizer (string strInput)
                {
                        if (strInput == null)
                                throw new ArgumentNullException ("strInput");

                        m_rgchInput = strInput.ToCharArray ();
                        m_ich = 0;
                        m_cch = strInput.Length;
                        SkipWhitespace ();
                }

                // Returns the character iOffset positions ahead of the read
                // position without consuming it, or EOL past the end of input.
                private char Peek (int iOffset)
                {
                        int ichTarget = m_ich + iOffset;
                        if (ichTarget >= m_cch)
                                return EOL;
                        return m_rgchInput [ichTarget];
                }

                // Returns the next unread character without consuming it.
                private char Peek ()
                {
                        return Peek (0);
                }

                // Consumes and returns the next character, or returns EOL
                // (consuming nothing) at end of input.
                private char GetChar ()
                {
                        if (m_ich >= m_cch)
                                return EOL;
                        return m_rgchInput [m_ich++];
                }

                // Steps the read position back by one character and returns
                // that character.  Throws XPathException at the start of input.
                private char PutBack ()
                {
                        if (m_ich == 0)
                                throw new XPathException ("invalid tokenizer state");   // TODO: better description
                        return m_rgchInput [--m_ich];
                }

                // Consumes a run of whitespace; returns true if any was skipped.
                private bool SkipWhitespace ()
                {
                        if (!IsWhitespace (Peek ()))
                                return false;

                        while (IsWhitespace (Peek ()))
                                GetChar ();

                        return true;
                }

                // Scans digits with an optional fractional part, stores the
                // value as a double in m_objToken and returns Token.NUMBER.
                // Callers only enter here when the input starts with a digit
                // or with '.' followed by a digit.
                [MonoTODO]
                private int ParseNumber ()
                {
                        StringBuilder sb = new StringBuilder ();

                        while (IsDigit (Peek ()))
                                sb.Append (GetChar ());

                        // TODO: doesn't handle '3.' error case
                        // NOTE(review): the XPath 1.0 Number production appears to
                        // allow a trailing '.', so this may not be an error -- confirm.
                        if (Peek () == '.')
                        {
                                sb.Append (GetChar ());
                                while (IsDigit (Peek ()))
                                        sb.Append (GetChar ());
                        }

                        // XPath numbers always use '.' as the decimal separator, so the
                        // text must not be parsed with the current thread culture.
                        m_objToken = Double.Parse (sb.ToString (),
                                System.Globalization.NumberFormatInfo.InvariantInfo);
                        return Token.NUMBER;
                }

                // Scans a quoted string.  The opening quote character decides
                // which character terminates it.  On success the text between
                // the quotes is stored in m_objToken and Token.LITERAL is
                // returned; an unterminated literal yields Token.ERROR.
                private int ParseLiteral ()
                {
                        StringBuilder sb = new StringBuilder ();

                        char chQuote = GetChar ();
                        char ch;
                        while ((ch = Peek ()) != chQuote)
                        {
                                if (ch == EOL)
                                        return Token.ERROR;
                                sb.Append (GetChar ());
                        }
                        GetChar ();     // consume the closing quote
                        m_objToken = sb.ToString ();
                        return Token.LITERAL;
                }

                // Scans a name and classifies it as a keyword token, a
                // function name or a plain NCName.
                private int ParseIdentifier ()
                {
                        StringBuilder sb = new StringBuilder ();

                        while (IsIdentifierChar (Peek ()))
                                sb.Append (GetChar ());

                        String strToken = sb.ToString ();
                        Object objToken = s_mapTokens [strToken];

                        // A keyword is returned straight away unless the previous
                        // token was flagged as special by advance ().
                        if (!m_fPrevWasSpecial && objToken != null)
                                return (int) objToken;

                        SkipWhitespace ();

                        char ch = Peek ();
                        if (ch == '(')
                        {
                                // keyword followed by '(' keeps its keyword token,
                                // any other name followed by '(' is a function name
                                if (objToken != null)
                                        return (int) objToken;
                                m_objToken = strToken;
                                return Token.FUNCTION_NAME;
                        }
                        else if (ch == ':' && Peek (1) == ':')
                        {
                                // keyword followed by '::' keeps its keyword token
                                if (objToken != null)
                                        return (int) objToken;
                        }

                        m_objToken = strToken;
                        return Token.NCName;
                }

                // True for the characters ParseIdentifier accepts inside a name.
                private static bool IsIdentifierChar (char ch)
                {
                        return ch == '_' || ch == '-' || Char.IsLetterOrDigit (ch);
                }

                // True for space, tab, line feed and carriage return.
                private static bool IsWhitespace (char ch)
                {
                        // return Char.IsWhiteSpace (ch);
                        return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
                }

                // True for the ASCII digits '0'..'9' only.
                private static bool IsDigit (char ch)
                {
                        // return Char.IsDigit (ch);
                        return ch >= '0' && ch <= '9';
                }

                // Recognizes the token starting at the current position,
                // consumes it and returns its token code.  Returns Token.EOF
                // at end of input and Token.ERROR for input that cannot start
                // any token.
                int ParseToken ()
                {
                        char ch = Peek ();
                        switch (ch)
                        {
                                case EOL:
                                        return Token.EOF;

                                case '/':
                                        GetChar ();
                                        if (Peek () == '/')
                                        {
                                                GetChar ();
                                                return Token.SLASH2;
                                        }
                                        return Token.SLASH;

                                case '.':
                                        GetChar ();
                                        if (Peek () == '.')
                                        {
                                                GetChar ();
                                                return Token.DOT2;
                                        }
                                        else if (IsDigit (Peek ()))
                                        {
                                                // ".5" style number: hand the '.' back to ParseNumber
                                                PutBack ();
                                                return ParseNumber ();
                                        }
                                        return Token.DOT;

                                case ':':
                                        GetChar ();
                                        if (Peek () == ':')
                                        {
                                                GetChar ();
                                                return Token.COLON2;
                                        }
                                        return Token.COLON;

                                case ',':
                                        GetChar ();
                                        return Token.COMMA;

                                case '@':
                                        GetChar ();
                                        return Token.AT;

                                case '[':
                                        GetChar ();
                                        return Token.BRACKET_OPEN;

                                case ']':
                                        GetChar ();
                                        return Token.BRACKET_CLOSE;

                                case '(':
                                        GetChar ();
                                        return Token.PAREN_OPEN;

                                case ')':
                                        GetChar ();
                                        return Token.PAREN_CLOSE;

                                case '+':
                                        GetChar ();
                                        return Token.PLUS;

                                case '-':
                                        GetChar ();
                                        return Token.MINUS;

                                case '*':
                                        GetChar ();
                                        return Token.ASTERISK;

                                case '$':
                                        GetChar ();
                                        return Token.DOLLAR;

                                case '|':
                                        GetChar ();
                                        return Token.BAR;

                                case '=':
                                        GetChar ();
                                        return Token.EQ;

                                case '!':
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                GetChar ();
                                                return Token.NE;
                                        }
                                        break;  // a lone '!' is not a token

                                case '>':
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                GetChar ();
                                                return Token.GE;
                                        }
                                        return Token.GT;

                                case '<':
                                        GetChar ();
                                        if (Peek () == '=')
                                        {
                                                GetChar ();
                                                return Token.LE;
                                        }
                                        return Token.LT;

                                case '\'':
                                        return ParseLiteral ();

                                case '\"':
                                        return ParseLiteral ();

                                default:
                                        if (IsDigit (ch))
                                                return ParseNumber ();
                                        if (IsIdentifierChar (ch))
                                                return ParseIdentifier ();

                                        // Anything else cannot start a token.  Consume it so
                                        // the tokenizer always makes progress instead of
                                        // reporting an empty name at the same position forever.
                                        GetChar ();
                                        break;
                        }
                        return Token.ERROR;
                }

                ///////////////////////////
                // yyParser.yyInput methods
                ///////////////////////////

                /** move on to next token.
                  @return false if positioned beyond tokens.
                  */
                public bool advance ()
                {
                        m_objToken = null;
                        m_iToken = ParseToken ();
                        bool fWhitespace = SkipWhitespace ();

                        // The flag is set only when the token is in the special set AND
                        // no whitespace was skipped after it.
                        // NOTE(review): the lexical rules in XPath 1.0 section 3.7 do not
                        // seem to depend on whitespace -- confirm this condition is intended.
                        m_fPrevWasSpecial = (!fWhitespace && s_mapfPrevWasSpecial.Contains (m_iToken));
                        return (m_iToken != Token.EOF);
                }

                /** classifies current token.
                  Should not be called if advance() returned false.
                  @return current %token or single character.
                  */
                public int token ()
                {
                        return m_iToken;
                }

                /** associated with current token.
                  Should not be called if advance() returned false.
                  @return value for token().
                  */
                public Object value ()
                {
                        return m_objToken;
                }
        }
}