New test.
[mono.git] / mcs / class / Commons.Xml.Relaxng / Commons.Xml.Relaxng.Rnc / RncTokenizer.cs
1 //\r
2 // RELAX NG Compact Syntax parser\r
3 //\r
4 // Author:\r
5 //      Atsushi Enomoto <ginga@kit.hi-ho.ne.jp>\r
6 //\r
7 // (C)2003 Atsushi Enomoto\r
8 // (C)2004 Novell Inc.\r
9 //\r
10
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining
13 // a copy of this software and associated documentation files (the
14 // "Software"), to deal in the Software without restriction, including
15 // without limitation the rights to use, copy, modify, merge, publish,
16 // distribute, sublicense, and/or sell copies of the Software, and to
17 // permit persons to whom the Software is furnished to do so, subject to
18 // the following conditions:
19 // 
20 // The above copyright notice and this permission notice shall be
21 // included in all copies or substantial portions of the Software.
22 // 
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
27 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
28 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
29 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 //
31 \r
32 using System;\r
33 using System.Collections;\r
34 using System.IO;\r
35 using System.Xml;\r
36 using Commons.Xml.Relaxng;\r
37 \r
38 namespace Commons.Xml.Relaxng.Rnc\r
39 {\r
40         internal class RncTokenizer : Commons.Xml.Relaxng.Rnc.yyParser.yyInput\r
41         {\r
42                 TextReader source;\r
43 \r
44                 int currentToken;\r
45                 object tokenValue;\r
46                 int peekChar;\r
47                 string peekString;\r
48                 bool isElement;\r
49                 bool isLiteralNsUri;\r
50 \r
51                 int line = 1;\r
52                 int column;\r
53                 int savedLineNumber = 1;\r
54                 int savedLinePosition;\r
55                 bool nextIncrementLine;\r
56 \r
57                 public RncTokenizer (TextReader source)\r
58                 {\r
59                         this.source = source;\r
60                 }\r
61 \r
62                 public bool IsElement {\r
63                         get { return isElement; }\r
64                 }\r
65 \r
66                 public int Line {\r
67                         get { return savedLineNumber; }\r
68                 }\r
69 \r
70                 public int Column {\r
71                         get { return savedLinePosition; }\r
72                 }\r
73 \r
74                 // jay interface implementation\r
75 \r
76                 public int token ()\r
77                 {\r
78                         return currentToken;\r
79                 }\r
80 \r
81                 public bool advance ()\r
82                 {\r
83                         tokenValue = null;\r
84                         currentToken = ParseToken (false);\r
85                         savedLineNumber = line;\r
86                         savedLinePosition = column;\r
87                         return currentToken != Token.EOF;\r
88                 }\r
89 \r
90                 public object value ()\r
91                 {\r
92                         return tokenValue;\r
93                 }\r
94 \r
95                 // private methods\r
96 \r
97                 private int ReadEscapedHexNumber (int current)\r
98                 {\r
99                         int i = source.Read ();\r
100                         switch (i) {\r
101                         case '0':\r
102                         case '1':\r
103                         case '2':\r
104                         case '3':\r
105                         case '4':\r
106                         case '5':\r
107                         case '6':\r
108                         case '7':\r
109                         case '8':\r
110                         case '9':\r
111                                 current = current * 16 + (i - '0');\r
112                                 return ReadEscapedHexNumber (current);\r
113                         case 'A':\r
114                         case 'B':\r
115                         case 'C':\r
116                         case 'D':\r
117                         case 'E':\r
118                         case 'F':\r
119                                 current = current * 16 + (i - 'A') + 10;\r
120                                 return ReadEscapedHexNumber (current);\r
121                         case 'a':\r
122                         case 'b':\r
123                         case 'c':\r
124                         case 'd':\r
125                         case 'e':\r
126                         case 'f':\r
127                                 current = current * 16 + (i - 'a' + 10);\r
128                                 return ReadEscapedHexNumber (current);\r
129                         }\r
130                         peekChar = i;\r
131                         return current;\r
132                 }\r
133 \r
134                 private int ReadFromStream ()\r
135                 {\r
136                         int ret = source.Read ();\r
137                         if (ret != '\\')\r
138                                 return ret;\r
139                         ret = source.Read ();\r
140                         switch (ret) {\r
141                         case 'x':\r
142                                 int tmp;\r
143                                 int xcount = 0;\r
144                                 do {\r
145                                         xcount++;\r
146                                         tmp = source.Read ();\r
147                                 } while (tmp == 'x');\r
148                                 if (tmp != '{') {\r
149                                         peekString = new string ('x', xcount);\r
150                                         if (tmp >= 0)\r
151                                                 peekString += (char) tmp;\r
152                                         return '\\';\r
153                                 }\r
154                                 ret = ReadEscapedHexNumber (0);\r
155                                 if (peekChar != '}')\r
156                                         break;\r
157                                 peekChar = 0;\r
158                                 return ret;\r
159                         }\r
160                         peekString = new string ((char) ret, 1);\r
161                         return '\\';\r
162                 }\r
163 \r
164                 private int PeekChar ()\r
165                 {\r
166                         if (peekChar == 0) {\r
167                                 if (peekString != null) {\r
168                                         peekChar = peekString [0];\r
169                                         peekString = peekString.Length == 1 ?\r
170                                                 null : peekString.Substring (1);\r
171                                 }\r
172                                 else\r
173                                         peekChar = ReadFromStream ();\r
174                         }\r
175 \r
176                         return peekChar;\r
177                 }\r
178 \r
179                 private int ReadChar ()\r
180                 {\r
181                         int ret;\r
182                         if (peekChar != 0) {\r
183                                 ret = peekChar;\r
184                                 peekChar = 0;\r
185                         }\r
186                         else if (peekString != null) {\r
187                                 ret = peekString [0];\r
188                                 peekString = peekString.Length == 1 ?\r
189                                         null : peekString.Substring (1);\r
190                         }\r
191                         else\r
192                                 ret = ReadFromStream ();\r
193 \r
194                         if (nextIncrementLine) {\r
195                                 line++;\r
196                                 column = 1;\r
197                                 nextIncrementLine = false;\r
198                         }\r
199                         switch (ret) {\r
200                         case '\r':\r
201                                 break;\r
202                         case '\n':\r
203                                 nextIncrementLine = true;\r
204                                 goto default;\r
205                         default:\r
206                                 column++;\r
207                                 break;\r
208                         }\r
209 \r
210                         return ret;\r
211                 }\r
212 \r
213                 private void SkipWhitespaces ()\r
214                 {\r
215                         while (true) {\r
216                                 switch (PeekChar ()) {\r
217                                 case ' ':\r
218                                 case '\t':\r
219                                 case '\r':\r
220                                 case '\n':\r
221                                         ReadChar ();\r
222                                         continue;\r
223                                 default:\r
224                                         return;\r
225                                 }\r
226                         }\r
227                 }\r
228 \r
229                 char [] nameBuffer = new char [30];\r
230 \r
231                 private string ReadQuoted (char quoteChar)\r
232                 {\r
233                         int index = 0;\r
234                         bool loop = true;\r
235                         while (loop) {\r
236                                 int c = ReadChar ();\r
237                                 switch (c) {\r
238                                 case -1:\r
239                                 case '\'':\r
240                                 case '\"':\r
241                                         if (quoteChar != c)\r
242                                                 goto default;\r
243                                         loop = false;\r
244                                         break;\r
245                                 default:\r
246                                         if (c < 0)\r
247                                                 throw new RelaxngException ("Unterminated quoted literal.");\r
248                                         if (XmlChar.IsInvalid (c))\r
249                                                 throw new RelaxngException ("Invalid character in literal.");\r
250                                         AppendNameChar (c, ref index);\r
251                                         break;\r
252                                 }\r
253                         }\r
254 \r
255                         return new string (nameBuffer, 0, index);\r
256                 }\r
257 \r
258                 private void AppendNameChar (int c, ref int index)\r
259                 {\r
260                         if (nameBuffer.Length == index) {\r
261                                 char [] arr = new char [index * 2];\r
262                                 Array.Copy (nameBuffer, arr, index);\r
263                                 nameBuffer = arr;\r
264                         }\r
265                         if (c > 0x10000) {\r
266                                 AppendNameChar ((c - 0x10000) / 0x400 + 0xD800, ref index);\r
267                                 AppendNameChar ((c - 0x10000) % 0x400 + 0xDC00, ref index);\r
268                         }\r
269                         else\r
270                                 nameBuffer [index++] = (char) c;\r
271                 }\r
272 \r
273                 private string ReadTripleQuoted (char quoteChar)\r
274                 {\r
275                         int index = 0;\r
276                         bool loop = true;\r
277                         do {\r
278                                 int c = ReadChar ();\r
279                                 switch (c) {\r
280                                 case -1:\r
281                                 case '\'':\r
282                                 case '\"':\r
283                                         // 1\r
284                                         if (quoteChar != c)\r
285                                                 goto default;\r
286                                         // 2\r
287                                         if ((c = PeekChar ()) != quoteChar) {\r
288                                                 AppendNameChar (quoteChar, ref index);\r
289                                                 goto default;\r
290                                         }\r
291                                         ReadChar ();\r
292                                         // 3\r
293                                         if ((c = PeekChar ()) == quoteChar) {\r
294                                                 ReadChar ();\r
295                                                 loop = false;\r
296                                                 break;\r
297                                         }\r
298                                         AppendNameChar (quoteChar, ref index);\r
299                                         AppendNameChar (quoteChar, ref index);\r
300                                         break;\r
301                                 default:\r
302                                         if (c < 0)\r
303                                                 throw new RelaxngException ("Unterminated triple-quoted literal.");\r
304                                         if (XmlChar.IsInvalid (c))\r
305                                                 throw new RelaxngException ("Invalid character in literal.");\r
306                                         AppendNameChar (c, ref index);\r
307                                         break;\r
308                                 }\r
309                         } while (loop);\r
310 \r
311                         return new string (nameBuffer, 0, index);\r
312                 }\r
313 \r
314                 private string ReadOneName ()\r
315                 {\r
316                         int index = 0;\r
317                         bool loop = true;\r
318                         int c = PeekChar ();\r
319                         if (!XmlChar.IsFirstNameChar (c) || !XmlChar.IsNCNameChar (c))\r
320                                 throw new RelaxngException (String.Format ("Invalid NCName start character: {0}", c));\r
321                         do {\r
322                                 c = PeekChar ();\r
323                                 switch (c) {\r
324                                 case -1:\r
325                                 case ' ':\r
326                                 case '\t':\r
327                                 case '\r':\r
328                                 case '\n':\r
329                                         ReadChar ();\r
330                                         loop = false;\r
331                                         break;\r
332                                 default:\r
333                                         if (!XmlChar.IsNCNameChar (c)) {\r
334                                                 loop = false;\r
335                                                 break;\r
336                                         }\r
337 \r
338                                         ReadChar ();\r
339                                         if (nameBuffer.Length == index) {\r
340                                                 char [] arr = new char [index * 2];\r
341                                                 Array.Copy (nameBuffer, arr, index);\r
342                                                 nameBuffer = arr;\r
343                                         }\r
344                                         nameBuffer [index++] = (char) c;\r
345                                         break;\r
346                                 }\r
347                         } while (loop);\r
348 \r
349                         return new string (nameBuffer, 0, index);\r
350                 }\r
351 \r
352                 private string ReadLine ()\r
353                 {\r
354                         string s = source.ReadLine ();\r
355                         line++;\r
356                         column = 1;\r
357                         return s;\r
358                 }\r
359 \r
360                 private int ParseToken (bool backslashed)\r
361                 {\r
362                         SkipWhitespaces ();\r
363                         int c = ReadChar ();\r
364                         string name;\r
365                         switch (c) {\r
366                         case -1:\r
367                                 return Token.EOF;\r
368                         case '=':\r
369                                 return Token.Equal;\r
370                         case '~':\r
371                                 return Token.Tilde;\r
372                         case ',':\r
373                                 return Token.Comma;\r
374                         case '{':\r
375                                 return Token.OpenCurly;\r
376                         case '}':\r
377                                 return Token.CloseCurly;\r
378                         case '(':\r
379                                 return Token.OpenParen;\r
380                         case ')':\r
381                                 return Token.CloseParen;\r
382                         case '[':\r
383                                 return Token.OpenBracket;\r
384                         case ']':\r
385                                 return Token.CloseBracket;\r
386                         case '&':\r
387                                 if (PeekChar () != '=')\r
388                                         return Token.Amp;\r
389                                 ReadChar ();\r
390                                 return Token.AndEquals;\r
391                         case '|':\r
392                                 if (PeekChar () != '=')\r
393                                         return Token.Bar;\r
394                                 ReadChar ();\r
395                                 return Token.OrEquals;\r
396                         case '?':\r
397                                 return Token.Question;\r
398                         case '*':\r
399                                 // See also ':' for NsName\r
400                                 return Token.Asterisk;\r
401                         case '\\':\r
402                                 if (backslashed)\r
403                                         return Token.ERROR;\r
404                                 return ParseToken (true);\r
405                         case '+':\r
406                                 return Token.Plus;\r
407                         case '-':\r
408                                 return Token.Minus;\r
409                         case '>':\r
410                                 if (PeekChar () == '>') {\r
411                                         ReadChar ();\r
412                                         return Token.TwoGreaters;\r
413                                 }\r
414                                 peekChar = '>';\r
415                                 goto default;\r
416                         case '#':\r
417 //                              tokenValue = ReadLine ();\r
418 //                              return Token.Documentation;\r
419                                 ReadLine ();\r
420                                 return ParseToken (false);\r
421                         case '\'':\r
422                         case '\"':\r
423                                 if (PeekChar () != c)\r
424                                         name = ReadQuoted ((char) c);\r
425                                 else {\r
426                                         ReadChar ();\r
427                                         if (PeekChar () == c) {\r
428                                                 ReadChar ();\r
429                                                 name = ReadTripleQuoted ((char) c);\r
430                                         } // else '' or ""\r
431                                         name = String.Empty;\r
432                                 }\r
433                                 int invidx = XmlChar.IndexOfInvalid (name, true) ;\r
434                                 if (invidx >= 0)\r
435                                         throw new RelaxngException (String.Format ("Invalid XML character in compact syntax literal segment at {0:X}", (int) name [invidx]));\r
436                                 tokenValue = name;\r
437                                 return Token.LiteralSegment;\r
438                         default:\r
439                                 if (!XmlChar.IsNCNameChar (c))\r
440                                         throw new RelaxngException ("Invalid NCName character.");\r
441                                 peekChar = c;\r
442                                 name = ReadOneName ();\r
443                                 if (PeekChar () == ':') {\r
444                                         ReadChar ();\r
445                                         if (PeekChar () == '*') {\r
446                                                 ReadChar ();\r
447                                                 tokenValue = name;\r
448                                                 return Token.NsName;\r
449                                         }\r
450                                         tokenValue = name + ":" + ReadOneName ();\r
451                                         return Token.CName;\r
452 \r
453                                 }\r
454                                 tokenValue = name;\r
455                                 if (backslashed)\r
456                                         return Token.QuotedIdentifier;\r
457                                 switch (name) {\r
458                                 case "attribute":\r
459                                         isElement = false;\r
460                                         return Token.KeywordAttribute;\r
461                                 case "element":\r
462                                         isElement = true;\r
463                                         return Token.KeywordElement;\r
464                                 case "datatypes":\r
465                                         return Token.KeywordDatatypes;\r
466                                 case "default":\r
467                                         return Token.KeywordDefault;\r
468                                 case "div":\r
469                                         return Token.KeywordDiv;\r
470                                 case "empty":\r
471                                         return Token.KeywordEmpty;\r
472                                 case "external":\r
473                                         return Token.KeywordExternal;\r
474                                 case "grammar":\r
475                                         return Token.KeywordGrammar;\r
476                                 case "include":\r
477                                         return Token.KeywordInclude;\r
478                                 case "inherit":\r
479                                         return Token.KeywordInherit;\r
480                                 case "list":\r
481                                         return Token.KeywordList;\r
482                                 case "mixed":\r
483                                         return Token.KeywordMixed;\r
484                                 case "namespace":\r
485                                         return Token.KeywordNamespace;\r
486                                 case "notAllowed":\r
487                                         return Token.KeywordNotAllowed;\r
488                                 case "parent":\r
489                                         return Token.KeywordParent;\r
490                                 case "start":\r
491                                         return Token.KeywordStart;\r
492                                 case "string":\r
493                                         return Token.KeywordString;\r
494                                 case "text":\r
495                                         return Token.KeywordText;\r
496                                 case "token":\r
497                                         return Token.KeywordToken;\r
498                                 default:\r
499                                         return Token.NCName;\r
500                                 }\r
501                         }\r
502                 }\r
503 \r
504         }\r
505 }