2002-09-12 Piers Haken <piersh@friskit.com>
[mono.git] / mcs / class / System.XML / System.Xml.XPath / Tokenizer.cs
1 //
2 // System.Xml.XPath.Tokenizer
3 //
4 // Author:
5 //   Piers Haken (piersh@friskit.com)
6 //
7 // (C) 2002 Piers Haken
8 //
9 using System;
10 using System.IO;
11 using System.Text;
12 using System.Collections;
13 using Mono.Xml.XPath;
14 using Mono.Xml.XPath.yyParser;
15
16 namespace System.Xml.XPath
17 {
18         internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
19         {
20                 private char [] m_rgchInput;
21                 private int m_ich;
22                 private int m_cch;
23 //              private System.IO.StreamReader m_input;
24                 private int m_iToken;
25                 private Object m_objToken;
26                 private static Hashtable m_mapTokens = new Hashtable ();
27                 private static readonly Object [] rgTokenMap =
28                 {
29                    Token.AND, "and",
30                    Token.OR, "or",
31                    Token.DIV, "div",
32                    Token.MOD, "mod",
33                    Token.ANCESTOR, "ancestor",
34                    Token.ANCESTOR_OR_SELF, "ancestor-or-self",
35                    Token.ATTRIBUTE, "attribute",
36                    Token.CHILD, "child",
37                    Token.DESCENDANT, "descendant",
38                    Token.DESCENDANT_OR_SELF, "descendant-or-self",
39                    Token.FOLLOWING, "following",
40                    Token.FOLLOWING_SIBLING, "following-sibling",
41                    Token.NAMESPACE, "namespace",
42                    Token.PARENT, "parent",
43                    Token.PRECEDING, "preceding",
44                    Token.PRECEDING_SIBLING, "preceding-sibling",
45                    Token.SELF, "self",
46                    Token.COMMENT, "comment",
47                    Token.TEXT, "text",
48                    Token.PROCESSING_INSTRUCTION, "processing-instruction",
49                    Token.NODE, "node",
50                 };
51
52                 static Tokenizer ()
53                 {
54                         for (int i = 0; i < rgTokenMap.Length; i += 2)
55                                 m_mapTokens.Add (rgTokenMap [i + 1], rgTokenMap [i]);
56                 }
57
58                 public Tokenizer (string strInput)
59                 {
60                         m_rgchInput = strInput.ToCharArray ();
61                         m_ich = 0;
62                         m_cch = strInput.Length;
63                         SkipWhitespace ();
64                 }
65
66                 private int Peek ()
67                 {
68                         if (m_ich >= m_cch)
69                                 return -1;
70                         return m_rgchInput [m_ich];
71                 }
72
73                 private int GetChar ()
74                 {
75                         if (m_ich >= m_cch)
76                                 return -1;
77                         return m_rgchInput [m_ich++];
78                 }
79
80                 private void SkipWhitespace ()
81                 {
82                         while (IsWhitespace (Peek ()))
83                                 GetChar ();
84                 }
85
86                 [MonoTODO]
87                 private int ParseNumber ()
88                 {
89                         StringBuilder sb = new StringBuilder ();
90
91                         while (IsDigit (Peek ()))
92                                 sb.Append ((char) GetChar ());
93
94                         // TODO: doesn't handle '3.' error case
95                         if (Peek () == '.')
96                         {
97                                 sb.Append ((char) GetChar ());
98                                 while (IsDigit (Peek ()))
99                                         sb.Append ((char) GetChar ());
100                         }
101                         m_objToken = Double.Parse (sb.ToString ());
102                         return Token.NUMBER;
103                 }
104
105                 private int ParseLiteral ()
106                 {
107                         StringBuilder sb = new StringBuilder ();
108
109                         int chInit = GetChar ();
110                         int ch;
111                         while ((ch = Peek ()) != chInit)
112                         {
113                                 if (ch == -1)
114                                         return Token.ERROR;
115                                 sb.Append ((char) GetChar ());
116                         }
117                         GetChar ();
118                         m_objToken = sb.ToString ();
119                         return Token.LITERAL;
120                 }
121
122                 private int ParseIdentifier ()
123                 {
124                         StringBuilder sb = new StringBuilder ();
125
126                         while (true)
127                         {
128                                 int ch = Peek ();
129                                 if (ch == '_' || ch == '-' ||
130                                                 (ch >= 'a' && ch <= 'z') ||
131                                                 (ch >= 'A' && ch <= 'Z'))
132                                 {
133                                         sb.Append ((char) GetChar ());
134                                 }
135                                 else
136                                         break;
137                         }
138                         String strToken = sb.ToString ();
139                         Object objToken = m_mapTokens [strToken];
140                         if (objToken != null)
141                         {
142                                 return (int) objToken;
143                         }
144                         else
145                         {
146                                 m_objToken = strToken;
147
148                                 SkipWhitespace ();
149                                 if (Peek () == '(')                                     
150                                         return Token.FUNCTION_NAME;
151                                 return Token.NCName;
152                         }
153                 }
154
155                 private static bool IsWhitespace (int ch)
156                 {
157                         return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
158                 }
159
160                 private static bool IsDigit (int ch)
161                 {
162                         return ch >= '0' && ch <= '9';
163                 }
164
165
166                 int ParseToken ()
167                 {
168                         switch (Peek ())
169                         {
170                                 case -1:
171                                         return Token.EOF;
172
173                                 case '/':
174                                         GetChar ();
175                                         if (Peek () == '/')
176                                         {
177                                                 GetChar ();
178                                                 return Token.SLASH2;
179                                         }
180                                         return Token.SLASH;
181
182                                 case '.':
183                                         GetChar ();
184                                         if (Peek () == '.')
185                                         {
186                                                 GetChar ();
187                                                 return Token.DOT2;
188                                         }
189                                         else if (Peek () >= '0' && Peek () <= '9')
190                                         {
191                                                 return ParseNumber ();
192                                         }
193                                         return Token.DOT;
194
195                                 case ':':
196                                         GetChar ();
197                                         if (Peek () == ':')
198                                         {
199                                                 GetChar ();
200                                                 return Token.COLON2;
201                                         }
202                                         return Token.COLON;
203
204                                 case ',':
205                                         GetChar ();
206                                         return Token.COMMA;
207
208                                 case '@':
209                                         GetChar ();
210                                         return Token.AT;
211
212                                 case '[':
213                                         GetChar ();
214                                         return Token.BRACKET_OPEN;
215
216                                 case ']':
217                                         GetChar ();
218                                         return Token.BRACKET_CLOSE;
219
220                                 case '(':
221                                         GetChar ();
222                                         return Token.PAREN_OPEN;
223
224                                 case ')':
225                                         GetChar ();
226                                         return Token.PAREN_CLOSE;
227
228                                 case '+':
229                                         GetChar ();
230                                         return Token.PLUS;
231
232                                 case '-':
233                                         GetChar ();
234                                         return Token.MINUS;
235
236                                 case '*':
237                                         GetChar ();
238                                         return Token.ASTERISK;
239
240                                 case '$':
241                                         GetChar ();
242                                         return Token.DOLLAR;
243
244                                 case '|':
245                                         GetChar ();
246                                         return Token.BAR;
247
248                                 case '=':
249                                         GetChar ();
250                                         return Token.EQ;
251
252                                 case '!':
253                                         GetChar ();
254                                         if (Peek () == '=')
255                                         {
256                                                 GetChar ();
257                                                 return Token.NE;
258                                         }
259                                         break;
260
261                                 case '>':
262                                         GetChar ();
263                                         if (Peek () == '=')
264                                         {
265                                                 GetChar ();
266                                                 return Token.GE;
267                                         }
268                                         return Token.GT;
269
270                                 case '<':
271                                         GetChar ();
272                                         if (Peek () == '=')
273                                         {
274                                                 GetChar ();
275                                                 return Token.LE;
276                                         }
277                                         return Token.LT;
278
279                                 case '\'':
280                                         return ParseLiteral ();
281
282                                 case '\"':
283                                         return ParseLiteral ();
284
285                                 default:
286                                         {
287                                                 if (IsDigit (Peek ()))
288                                                 {
289                                                         return ParseNumber ();
290                                                 }
291                                                 else
292                                                 {
293                                                         return ParseIdentifier ();
294                                                 }
295                                         }
296                         }
297                         return Token.ERROR;
298                 }
299
300                 ///////////////////////////
301                 // yyParser.yyInput methods
302                 ///////////////////////////
303
304                 /** move on to next token.
305                   @return false if positioned beyond tokens.
306                   @throws IOException on input error.
307                   */
308                 public bool advance ()
309                 {
310                         m_objToken = null;
311                         m_iToken = ParseToken ();
312                         SkipWhitespace ();
313                         return (m_iToken != Token.EOF);
314                 }
315
316                 /** classifies current token.
317                   Should not be called if advance() returned false.
318                   @return current %token or single character.
319                   */
320                 public int token ()
321                 {
322                         return m_iToken;
323                 }
324
325                 /** associated with current token.
326                   Should not be called if advance() returned false.
327                   @return value for token().
328                   */
329                 public Object value ()
330                 {
331                         return m_objToken;
332                 }
333         }
334 }