2002-09-21 Piers Haken <piersh@friskit.com>
[mono.git] / mcs / class / System.XML / System.Xml.XPath / Tokenizer.cs
1 //
2 // System.Xml.XPath.Tokenizer
3 //
4 // Author:
5 //   Piers Haken (piersh@friskit.com)
6 //
7 // (C) 2002 Piers Haken
8 //
9 using System;
10 using System.IO;
11 using System.Text;
12 using System.Collections;
13 using Mono.Xml.XPath;
14 using Mono.Xml.XPath.yyParser;
15
namespace System.Xml.XPath
{
	// Hand-written lexer for XPath expressions.  It implements the
	// yyInput interface (advance / token / value) consumed by the
	// generated parser: each call to advance () scans one token from the
	// input string, token () reports its kind and value () its payload
	// (a Double for numbers, a String for literals and names, otherwise
	// null).
	internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
	{
		private char [] m_rgchInput;	// the whole expression
		private int m_ich;		// index of the next unread character
		private int m_cch;		// number of characters in m_rgchInput
		private int m_iToken;		// kind of the current token
		private Object m_objToken;	// payload of the current token, or null

		// Maps reserved words (operator names, axis names, node types) to
		// their token kinds.  Filled once by the static constructor.
		private static readonly Hashtable m_mapTokens = new Hashtable ();
		private static readonly Object [] rgTokenMap =
		{
		   Token.AND, "and",
		   Token.OR, "or",
		   Token.DIV, "div",
		   Token.MOD, "mod",
		   Token.ANCESTOR, "ancestor",
		   Token.ANCESTOR_OR_SELF, "ancestor-or-self",
		   Token.ATTRIBUTE, "attribute",
		   Token.CHILD, "child",
		   Token.DESCENDANT, "descendant",
		   Token.DESCENDANT_OR_SELF, "descendant-or-self",
		   Token.FOLLOWING, "following",
		   Token.FOLLOWING_SIBLING, "following-sibling",
		   Token.NAMESPACE, "namespace",
		   Token.PARENT, "parent",
		   Token.PRECEDING, "preceding",
		   Token.PRECEDING_SIBLING, "preceding-sibling",
		   Token.SELF, "self",
		   Token.COMMENT, "comment",
		   Token.TEXT, "text",
		   Token.PROCESSING_INSTRUCTION, "processing-instruction",
		   Token.NODE, "node",
		};

		static Tokenizer ()
		{
			// rgTokenMap is laid out as (token, keyword) pairs.
			for (int i = 0; i < rgTokenMap.Length; i += 2)
				m_mapTokens.Add (rgTokenMap [i + 1], rgTokenMap [i]);
		}

		// Creates a tokenizer over strInput and positions it on the first
		// non-whitespace character.
		// Throws ArgumentNullException if strInput is null.
		public Tokenizer (string strInput)
		{
			if (strInput == null)
				throw new ArgumentNullException ("strInput");

			m_rgchInput = strInput.ToCharArray ();
			m_ich = 0;
			m_cch = strInput.Length;
			SkipWhitespace ();
		}

		// Returns the next character without consuming it, or -1 at end
		// of input.
		private int Peek ()
		{
			if (m_ich >= m_cch)
				return -1;
			return m_rgchInput [m_ich];
		}

		// Consumes and returns the next character, or returns -1 at end
		// of input (without moving).
		private int GetChar ()
		{
			if (m_ich >= m_cch)
				return -1;
			return m_rgchInput [m_ich++];
		}

		// Steps back one character and returns the character that is now
		// next.  Throws XPathException when already at the start.
		private int PutBack ()
		{
			if (m_ich == 0)
				throw new XPathException ("invalid tokenizer state");   // TODO: better description
			return m_rgchInput [--m_ich];
		}

		private void SkipWhitespace ()
		{
			while (IsWhitespace (Peek ()))
				GetChar ();
		}

		// Scans a number: Digits ('.' Digits?)? | '.' Digits.
		// Note that a trailing dot ("3.") is a valid XPath 1.0 number.
		// Stores the value as a Double and returns Token.NUMBER.
		[MonoTODO]
		private int ParseNumber ()
		{
			StringBuilder sb = new StringBuilder ();

			while (IsDigit (Peek ()))
				sb.Append ((char) GetChar ());

			if (Peek () == '.')
			{
				sb.Append ((char) GetChar ());
				while (IsDigit (Peek ()))
					sb.Append ((char) GetChar ());
			}

			// XPath numbers always use '.' as the decimal separator, so
			// the parse must not depend on the current thread culture.
			m_objToken = Double.Parse (sb.ToString (),
				System.Globalization.CultureInfo.InvariantCulture);
			return Token.NUMBER;
		}

		// Scans a quoted string.  The opening quote character (' or ")
		// also terminates the literal; there is no escape mechanism.
		// Returns Token.LITERAL with the unquoted text as payload, or
		// Token.ERROR if the input ends before the closing quote.
		private int ParseLiteral ()
		{
			StringBuilder sb = new StringBuilder ();

			int chInit = GetChar ();
			int ch;
			while ((ch = Peek ()) != chInit)
			{
				if (ch == -1)
					return Token.ERROR;
				sb.Append ((char) GetChar ());
			}
			GetChar ();	// closing quote
			m_objToken = sb.ToString ();
			return Token.LITERAL;
		}

		// Scans a name.  Reserved words yield their own token kind (with
		// no payload); any other name is stored as the payload and
		// classified as FUNCTION_NAME when followed by '(' and as NCName
		// otherwise.  Returns Token.ERROR, consuming nothing, if the next
		// character cannot start a name.
		private int ParseIdentifier ()
		{
			if (!IsNameStart (Peek ()))
				return Token.ERROR;

			StringBuilder sb = new StringBuilder ();
			sb.Append ((char) GetChar ());
			while (IsNameChar (Peek ()))
				sb.Append ((char) GetChar ());

			String strToken = sb.ToString ();
			Object objToken = m_mapTokens [strToken];
			if (objToken != null)
				return (int) objToken;

			m_objToken = strToken;

			// Whitespace may separate a function name from its '('.
			SkipWhitespace ();
			if (Peek () == '(')
				return Token.FUNCTION_NAME;
			return Token.NCName;
		}

		private static bool IsWhitespace (int ch)
		{
			return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
		}

		private static bool IsDigit (int ch)
		{
			return ch >= '0' && ch <= '9';
		}

		// True if ch may begin an NCName (a letter or '_').
		// The ch >= 0 test keeps the end-of-input marker (-1) out of the
		// char cast.
		private static bool IsNameStart (int ch)
		{
			return ch == '_' || (ch >= 0 && Char.IsLetter ((char) ch));
		}

		// True if ch may continue an NCName (letter, digit, '.', '-' or
		// '_').
		private static bool IsNameChar (int ch)
		{
			return ch == '_' || ch == '-' || ch == '.' ||
				(ch >= 0 && Char.IsLetterOrDigit ((char) ch));
		}

		// Scans one token starting at the current position and returns
		// its kind.  Every path except end of input consumes at least one
		// character, so repeated calls always make progress.
		int ParseToken ()
		{
			int ch = Peek ();
			switch (ch)
			{
				case -1:
					return Token.EOF;

				case '/':
					GetChar ();
					if (Peek () == '/')
					{
						GetChar ();
						return Token.SLASH2;
					}
					return Token.SLASH;

				case '.':
					GetChar ();
					if (Peek () == '.')
					{
						GetChar ();
						return Token.DOT2;
					}
					else if (IsDigit (Peek ()))
					{
						// ".5" style number: hand the dot back so
						// ParseNumber sees the whole thing.
						PutBack ();
						return ParseNumber ();
					}
					return Token.DOT;

				case ':':
					GetChar ();
					if (Peek () == ':')
					{
						GetChar ();
						return Token.COLON2;
					}
					return Token.COLON;

				case ',':
					GetChar ();
					return Token.COMMA;

				case '@':
					GetChar ();
					return Token.AT;

				case '[':
					GetChar ();
					return Token.BRACKET_OPEN;

				case ']':
					GetChar ();
					return Token.BRACKET_CLOSE;

				case '(':
					GetChar ();
					return Token.PAREN_OPEN;

				case ')':
					GetChar ();
					return Token.PAREN_CLOSE;

				case '+':
					GetChar ();
					return Token.PLUS;

				case '-':
					GetChar ();
					return Token.MINUS;

				case '*':
					GetChar ();
					return Token.ASTERISK;

				case '$':
					GetChar ();
					return Token.DOLLAR;

				case '|':
					GetChar ();
					return Token.BAR;

				case '=':
					GetChar ();
					return Token.EQ;

				case '!':
					GetChar ();
					if (Peek () == '=')
					{
						GetChar ();
						return Token.NE;
					}
					break;	// a lone '!' is not an XPath token

				case '>':
					GetChar ();
					if (Peek () == '=')
					{
						GetChar ();
						return Token.GE;
					}
					return Token.GT;

				case '<':
					GetChar ();
					if (Peek () == '=')
					{
						GetChar ();
						return Token.LE;
					}
					return Token.LT;

				case '\'':
				case '\"':
					return ParseLiteral ();

				default:
					if (IsDigit (ch))
						return ParseNumber ();
					if (IsNameStart (ch))
						return ParseIdentifier ();

					// Unrecognised character: consume it so the
					// tokenizer cannot get stuck reporting the same
					// position forever, then report an error.
					GetChar ();
					break;
			}
			return Token.ERROR;
		}

		///////////////////////////
		// yyParser.yyInput methods
		///////////////////////////

		/** move on to next token.
		  Scans one token and skips any whitespace that follows it.
		  @return false if positioned beyond tokens.
		  */
		public bool advance ()
		{
			m_objToken = null;
			m_iToken = ParseToken ();
			SkipWhitespace ();
			return (m_iToken != Token.EOF);
		}

		/** classifies current token.
		  Should not be called if advance() returned false.
		  @return current %token or single character.
		  */
		public int token ()
		{
			return m_iToken;
		}

		/** associated with current token.
		  Should not be called if advance() returned false.
		  @return value for token().
		  */
		public Object value ()
		{
			return m_objToken;
		}
	}
}