merge -r 58060:58217
[mono.git] / mcs / class / Commons.Xml.Relaxng / Commons.Xml.Relaxng.Rnc / RncTokenizer.cs
1 //\r
2 // RELAX NG Compact Syntax parser\r
3 //\r
4 // Author:\r
5 //      Atsushi Enomoto <ginga@kit.hi-ho.ne.jp>\r
6 //\r
7 // (C)2003 Atsushi Enomoto\r
8 // (C)2004 Novell Inc.\r
9 //\r
10
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining
13 // a copy of this software and associated documentation files (the
14 // "Software"), to deal in the Software without restriction, including
15 // without limitation the rights to use, copy, modify, merge, publish,
16 // distribute, sublicense, and/or sell copies of the Software, and to
17 // permit persons to whom the Software is furnished to do so, subject to
18 // the following conditions:
19 // 
20 // The above copyright notice and this permission notice shall be
21 // included in all copies or substantial portions of the Software.
22 // 
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
27 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
28 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
29 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 //
31 \r
32 using System;\r
33 using System.Collections;\r
34 using System.IO;\r
35 using System.Xml;\r
36 using Commons.Xml.Relaxng;\r
37 \r
38 namespace Commons.Xml.Relaxng.Rnc\r
39 {\r
40         internal class RncTokenizer : Commons.Xml.Relaxng.Rnc.yyParser.yyInput\r
41         {\r
42                 TextReader source;\r
43 \r
44                 int currentToken;\r
45                 object tokenValue;\r
46                 int peekChar;\r
47                 string peekString;\r
48                 bool isElement;\r
49                 bool isLiteralNsUri;\r
50 \r
51                 int line = 1;\r
52                 int column;\r
53                 int savedLineNumber = 1;\r
54                 int savedLinePosition;\r
55                 bool nextIncrementLine;\r
56 \r
57                 public RncTokenizer (TextReader source)\r
58                 {\r
59                         this.source = source;\r
60                 }\r
61 \r
62                 public bool IsElement {\r
63                         get { return isElement; }\r
64                 }\r
65 \r
66                 public int Line {\r
67                         get { return savedLineNumber; }\r
68                 }\r
69 \r
70                 public int Column {\r
71                         get { return savedLinePosition; }\r
72                 }\r
73 \r
74                 // jay interface implementation\r
75 \r
76                 public int token ()\r
77                 {\r
78                         return currentToken;\r
79                 }\r
80 \r
81                 public bool advance ()\r
82                 {\r
83                         tokenValue = null;\r
84                         currentToken = ParseToken (false);\r
85                         savedLineNumber = line;\r
86                         savedLinePosition = column;\r
87                         return currentToken != Token.EOF;\r
88                 }\r
89 \r
90                 public object value ()\r
91                 {\r
92                         return tokenValue;\r
93                 }\r
94 \r
95                 // private methods\r
96 \r
97                 private int ReadEscapedHexNumber (int current)\r
98                 {\r
99                         int i = source.Read ();\r
100                         switch (i) {\r
101                         case '0':\r
102                         case '1':\r
103                         case '2':\r
104                         case '3':\r
105                         case '4':\r
106                         case '5':\r
107                         case '6':\r
108                         case '7':\r
109                         case '8':\r
110                         case '9':\r
111                                 current = current * 16 + (i - '0');\r
112                                 return ReadEscapedHexNumber (current);\r
113                         case 'A':\r
114                         case 'B':\r
115                         case 'C':\r
116                         case 'D':\r
117                         case 'E':\r
118                         case 'F':\r
119                                 current = current * 16 + (i - 'A') + 10;\r
120                                 return ReadEscapedHexNumber (current);\r
121                         case 'a':\r
122                         case 'b':\r
123                         case 'c':\r
124                         case 'd':\r
125                         case 'e':\r
126                         case 'f':\r
127                                 current = current * 16 + (i - 'a' + 10);\r
128                                 return ReadEscapedHexNumber (current);\r
129                         }\r
130                         peekChar = i;\r
131                         return current;\r
132                 }\r
133 \r
134                 private int ReadFromStream ()\r
135                 {\r
136                         int ret = source.Read ();\r
137                         if (ret != '\\')\r
138                                 return ret;\r
139                         ret = source.Read ();\r
140                         switch (ret) {\r
141                         case 'x':\r
142                                 int tmp;\r
143                                 int xcount = 0;\r
144                                 do {\r
145                                         xcount++;\r
146                                         tmp = source.Read ();\r
147                                 } while (tmp == 'x');\r
148                                 if (tmp != '{') {\r
149                                         peekString = new string ('x', xcount);\r
150                                         if (tmp >= 0)\r
151                                                 peekString += (char) tmp;\r
152                                         return '\\';\r
153                                 }\r
154                                 ret = ReadEscapedHexNumber (0);\r
155                                 if (peekChar != '}')\r
156                                         break;\r
157                                 peekChar = 0;\r
158                                 return ret;\r
159                         }\r
160                         peekString = new string ((char) ret, 1);\r
161                         return '\\';\r
162                 }\r
163 \r
164                 private int PeekChar ()\r
165                 {\r
166                         if (peekChar == 0) {\r
167                                 if (peekString != null) {\r
168                                         peekChar = peekString [0];\r
169                                         peekString = peekString.Length == 1 ?\r
170                                                 null : peekString.Substring (1);\r
171                                 }\r
172                                 else\r
173                                         peekChar = ReadFromStream ();\r
174                         }\r
175 \r
176                         return peekChar;\r
177                 }\r
178 \r
179                 private int ReadChar ()\r
180                 {\r
181                         int ret;\r
182                         if (peekChar != 0) {\r
183                                 ret = peekChar;\r
184                                 peekChar = 0;\r
185                         }\r
186                         else if (peekString != null) {\r
187                                 ret = peekString [0];\r
188                                 peekString = peekString.Length == 1 ?\r
189                                         null : peekString.Substring (1);\r
190                         }\r
191                         else\r
192                                 ret = ReadFromStream ();\r
193 \r
194                         if (nextIncrementLine) {\r
195                                 line++;\r
196                                 column = 1;\r
197                                 nextIncrementLine = false;\r
198                         }\r
199                         switch (ret) {\r
200                         case '\r':\r
201                                 break;\r
202                         case '\n':\r
203                                 nextIncrementLine = true;\r
204                                 goto default;\r
205                         default:\r
206                                 column++;\r
207                                 break;\r
208                         }\r
209 \r
210                         return ret;\r
211                 }\r
212 \r
213                 private void SkipWhitespaces ()\r
214                 {\r
215                         while (true) {\r
216                                 switch (PeekChar ()) {\r
217                                 case ' ':\r
218                                 case '\t':\r
219                                 case '\r':\r
220                                 case '\n':\r
221                                         ReadChar ();\r
222                                         continue;\r
223                                 default:\r
224                                         return;\r
225                                 }\r
226                         }\r
227                 }\r
228 \r
229                 char [] nameBuffer = new char [30];\r
230 \r
231                 private string ReadQuoted (char quoteChar)\r
232                 {\r
233                         int index = 0;\r
234                         bool loop = true;\r
235                         while (loop) {\r
236                                 int c = ReadChar ();\r
237                                 switch (c) {\r
238                                 case -1:\r
239                                 case '\'':\r
240                                 case '\"':\r
241                                         if (quoteChar != c)\r
242                                                 goto default;\r
243                                         loop = false;\r
244                                         break;\r
245                                 default:\r
246                                         if (c < 0)\r
247                                                 throw new RelaxngException ("Unterminated quoted literal.");\r
248                                         if (XmlChar.IsInvalid (c))\r
249                                                 throw new RelaxngException ("Invalid character in literal.");\r
250                                         AppendNameChar (c, ref index);\r
251                                         break;\r
252                                 }\r
253                         }\r
254 \r
255                         return new string (nameBuffer, 0, index);\r
256                 }\r
257 \r
258                 private void AppendNameChar (int c, ref int index)\r
259                 {\r
260                         if (nameBuffer.Length == index) {\r
261                                 char [] arr = new char [index * 2];\r
262                                 Array.Copy (nameBuffer, arr, index);\r
263                                 nameBuffer = arr;\r
264                         }\r
265                         nameBuffer [index++] = (char) c;\r
266                 }\r
267 \r
268                 private string ReadTripleQuoted (char quoteChar)\r
269                 {\r
270                         int index = 0;\r
271                         bool loop = true;\r
272                         do {\r
273                                 int c = ReadChar ();\r
274                                 switch (c) {\r
275                                 case -1:\r
276                                 case '\'':\r
277                                 case '\"':\r
278                                         // 1\r
279                                         if (quoteChar != c)\r
280                                                 goto default;\r
281                                         // 2\r
282                                         if ((c = PeekChar ()) != quoteChar) {\r
283                                                 AppendNameChar (quoteChar, ref index);\r
284                                                 goto default;\r
285                                         }\r
286                                         ReadChar ();\r
287                                         // 3\r
288                                         if ((c = PeekChar ()) == quoteChar) {\r
289                                                 ReadChar ();\r
290                                                 loop = false;\r
291                                                 break;\r
292                                         }\r
293                                         AppendNameChar (quoteChar, ref index);\r
294                                         AppendNameChar (quoteChar, ref index);\r
295                                         break;\r
296                                 default:\r
297                                         if (c < 0)\r
298                                                 throw new RelaxngException ("Unterminated triple-quoted literal.");\r
299                                         if (XmlChar.IsInvalid (c))\r
300                                                 throw new RelaxngException ("Invalid character in literal.");\r
301                                         AppendNameChar (c, ref index);\r
302                                         break;\r
303                                 }\r
304                         } while (loop);\r
305 \r
306                         return new string (nameBuffer, 0, index);\r
307                 }\r
308 \r
309                 private string ReadOneName ()\r
310                 {\r
311                         int index = 0;\r
312                         bool loop = true;\r
313                         int c = PeekChar ();\r
314                         if (!XmlChar.IsFirstNameChar (c) || !XmlChar.IsNCNameChar (c))\r
315                                 throw new RelaxngException (String.Format ("Invalid NCName start character: {0}", c));\r
316                         do {\r
317                                 c = PeekChar ();\r
318                                 switch (c) {\r
319                                 case -1:\r
320                                 case ' ':\r
321                                 case '\t':\r
322                                 case '\r':\r
323                                 case '\n':\r
324                                         ReadChar ();\r
325                                         loop = false;\r
326                                         break;\r
327                                 default:\r
328                                         if (!XmlChar.IsNCNameChar (c)) {\r
329                                                 loop = false;\r
330                                                 break;\r
331                                         }\r
332 \r
333                                         ReadChar ();\r
334                                         if (nameBuffer.Length == index) {\r
335                                                 char [] arr = new char [index * 2];\r
336                                                 Array.Copy (nameBuffer, arr, index);\r
337                                                 nameBuffer = arr;\r
338                                         }\r
339                                         nameBuffer [index++] = (char) c;\r
340                                         break;\r
341                                 }\r
342                         } while (loop);\r
343 \r
344                         return new string (nameBuffer, 0, index);\r
345                 }\r
346 \r
347                 private string ReadLine ()\r
348                 {\r
349                         string s = source.ReadLine ();\r
350                         line++;\r
351                         column = 1;\r
352                         return s;\r
353                 }\r
354 \r
355                 private int ParseToken (bool backslashed)\r
356                 {\r
357                         SkipWhitespaces ();\r
358                         int c = ReadChar ();\r
359                         string name;\r
360                         switch (c) {\r
361                         case -1:\r
362                                 return Token.EOF;\r
363                         case '=':\r
364                                 return Token.Equal;\r
365                         case '~':\r
366                                 return Token.Tilde;\r
367                         case ',':\r
368                                 return Token.Comma;\r
369                         case '{':\r
370                                 return Token.OpenCurly;\r
371                         case '}':\r
372                                 return Token.CloseCurly;\r
373                         case '(':\r
374                                 return Token.OpenParen;\r
375                         case ')':\r
376                                 return Token.CloseParen;\r
377                         case '[':\r
378                                 return Token.OpenBracket;\r
379                         case ']':\r
380                                 return Token.CloseBracket;\r
381                         case '&':\r
382                                 if (PeekChar () != '=')\r
383                                         return Token.Amp;\r
384                                 ReadChar ();\r
385                                 return Token.AndEquals;\r
386                         case '|':\r
387                                 if (PeekChar () != '=')\r
388                                         return Token.Bar;\r
389                                 ReadChar ();\r
390                                 return Token.OrEquals;\r
391                         case '?':\r
392                                 return Token.Question;\r
393                         case '*':\r
394                                 // See also ':' for NsName\r
395                                 return Token.Asterisk;\r
396                         case '\\':\r
397                                 if (backslashed)\r
398                                         return Token.BackSlash;\r
399                                 return ParseToken (true);\r
400                         case '+':\r
401                                 return Token.Plus;\r
402                         case '-':\r
403                                 return Token.Minus;\r
404                         case '>':\r
405                                 if (PeekChar () == '>') {\r
406                                         ReadChar ();\r
407                                         return Token.TwoGreaters;\r
408                                 }\r
409                                 peekChar = '>';\r
410                                 goto default;\r
411                         case '#':\r
412                                 // NOTE: This interpretation is expanded against the spec\r
413 //                              if (ReadChar () != '#')\r
414 //                                      throw new RelaxngException ("Invalid character after '#'.");\r
415                                 tokenValue = ReadLine ();\r
416 //                              return Token.Documentation;\r
417                                 return ParseToken (false);\r
418                         case '\'':\r
419                         case '\"':\r
420                                 if (PeekChar () != c)\r
421                                         name = ReadQuoted ((char) c);\r
422                                 else {\r
423                                         ReadChar ();\r
424                                         if (PeekChar () == c) {\r
425                                                 ReadChar ();\r
426                                                 name = ReadTripleQuoted ((char) c);\r
427                                         } // else '' or ""\r
428                                         name = String.Empty;\r
429                                 }\r
430                                 tokenValue = name;\r
431                                 return Token.LiteralSegment;\r
432                         default:\r
433                                 if (!XmlChar.IsNCNameChar (c))\r
434                                         throw new RelaxngException ("Invalid NCName character.");\r
435                                 peekChar = c;\r
436                                 name = ReadOneName ();\r
437                                 if (PeekChar () == ':') {\r
438                                         ReadChar ();\r
439                                         if (PeekChar () == '*') {\r
440                                                 ReadChar ();\r
441                                                 tokenValue = name;\r
442                                                 return Token.NsName;\r
443                                         }\r
444                                         tokenValue = name + ":" + ReadOneName ();\r
445                                         return Token.CName;\r
446 \r
447                                 }\r
448                                 tokenValue = name;\r
449                                 if (backslashed)\r
450                                         return Token.NCName;\r
451                                 switch (name) {\r
452                                 case "attribute":\r
453                                         isElement = false;\r
454                                         return Token.KeywordAttribute;\r
455                                 case "element":\r
456                                         isElement = true;\r
457                                         return Token.KeywordElement;\r
458                                 case "datatypes":\r
459                                         return Token.KeywordDatatypes;\r
460                                 case "default":\r
461                                         return Token.KeywordDefault;\r
462                                 case "div":\r
463                                         return Token.KeywordDiv;\r
464                                 case "empty":\r
465                                         return Token.KeywordEmpty;\r
466                                 case "external":\r
467                                         return Token.KeywordExternal;\r
468                                 case "grammar":\r
469                                         return Token.KeywordGrammar;\r
470                                 case "include":\r
471                                         return Token.KeywordInclude;\r
472                                 case "inherit":\r
473                                         return Token.KeywordInherit;\r
474                                 case "list":\r
475                                         return Token.KeywordList;\r
476                                 case "mixed":\r
477                                         return Token.KeywordMixed;\r
478                                 case "namespace":\r
479                                         return Token.KeywordNamespace;\r
480                                 case "notAllowed":\r
481                                         return Token.KeywordNotAllowed;\r
482                                 case "parent":\r
483                                         return Token.KeywordParent;\r
484                                 case "start":\r
485                                         return Token.KeywordStart;\r
486                                 case "string":\r
487                                         return Token.KeywordString;\r
488                                 case "text":\r
489                                         return Token.KeywordText;\r
490                                 case "token":\r
491                                         return Token.KeywordToken;\r
492                                 default:\r
493                                         return Token.NCName;\r
494                                 }\r
495                         }\r
496                 }\r
497 \r
498         }\r
499 }