Merge branch 'master' of github.com:tgiphil/mono
[mono.git] / mcs / class / Commons.Xml.Relaxng / Commons.Xml.Relaxng.Rnc / RncTokenizer.cs
1 //\r
2 // RELAX NG Compact Syntax parser\r
3 //\r
4 // Author:\r
5 //      Atsushi Enomoto <ginga@kit.hi-ho.ne.jp>\r
6 //\r
7 // (C)2003 Atsushi Enomoto\r
8 // (C)2004 Novell Inc.\r
9 //\r
10
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining
13 // a copy of this software and associated documentation files (the
14 // "Software"), to deal in the Software without restriction, including
15 // without limitation the rights to use, copy, modify, merge, publish,
16 // distribute, sublicense, and/or sell copies of the Software, and to
17 // permit persons to whom the Software is furnished to do so, subject to
18 // the following conditions:
19 // 
20 // The above copyright notice and this permission notice shall be
21 // included in all copies or substantial portions of the Software.
22 // 
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
27 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
28 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
29 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 //
31 \r
32 using System;\r
33 using System.Collections;\r
34 using System.IO;\r
35 using System.Xml;\r
36 using Commons.Xml.Relaxng;\r
37 \r
38 namespace Commons.Xml.Relaxng.Rnc\r
39 {\r
40         internal class RncTokenizer : Commons.Xml.Relaxng.Rnc.yyParser.yyInput\r
41         {\r
42                 TextReader source;\r
43 \r
44                 int currentToken;\r
45                 object tokenValue;\r
46                 int peekChar;\r
47                 string peekString;\r
48                 bool isElement;\r
49                 bool isLiteralNsUri;\r
50 \r
51                 int line = 1;\r
52                 int column;\r
53                 int savedLineNumber = 1;\r
54                 int savedLinePosition;\r
55                 bool nextIncrementLine;\r
56                 string baseUri;\r
57 \r
58                 public RncTokenizer (TextReader source, string baseUri)\r
59                 {\r
60                         this.source = source;\r
61                         this.baseUri = baseUri;\r
62                 }\r
63 \r
64                 public bool IsElement {\r
65                         get { return isElement; }\r
66                 }\r
67 \r
68                 public int Line {\r
69                         get { return savedLineNumber; }\r
70                 }\r
71 \r
72                 public int Column {\r
73                         get { return savedLinePosition; }\r
74                 }\r
75 \r
76                 public string BaseUri {\r
77                         get { return baseUri; }\r
78                 }\r
79 \r
80                 // jay interface implementation\r
81 \r
82                 public int token ()\r
83                 {\r
84                         return currentToken;\r
85                 }\r
86 \r
87                 public bool advance ()\r
88                 {\r
89                         tokenValue = null;\r
90                         currentToken = ParseToken (false);\r
91                         savedLineNumber = line;\r
92                         savedLinePosition = column;\r
93                         return currentToken != Token.EOF;\r
94                 }\r
95 \r
96                 public object value ()\r
97                 {\r
98                         return tokenValue;\r
99                 }\r
100 \r
101                 // private methods\r
102 \r
103                 private int ReadEscapedHexNumber (int current)\r
104                 {\r
105                         int i = source.Read ();\r
106                         switch (i) {\r
107                         case '0':\r
108                         case '1':\r
109                         case '2':\r
110                         case '3':\r
111                         case '4':\r
112                         case '5':\r
113                         case '6':\r
114                         case '7':\r
115                         case '8':\r
116                         case '9':\r
117                                 current = current * 16 + (i - '0');\r
118                                 return ReadEscapedHexNumber (current);\r
119                         case 'A':\r
120                         case 'B':\r
121                         case 'C':\r
122                         case 'D':\r
123                         case 'E':\r
124                         case 'F':\r
125                                 current = current * 16 + (i - 'A') + 10;\r
126                                 return ReadEscapedHexNumber (current);\r
127                         case 'a':\r
128                         case 'b':\r
129                         case 'c':\r
130                         case 'd':\r
131                         case 'e':\r
132                         case 'f':\r
133                                 current = current * 16 + (i - 'a' + 10);\r
134                                 return ReadEscapedHexNumber (current);\r
135                         }\r
136                         peekChar = i;\r
137                         return current;\r
138                 }\r
139 \r
140                 private int ReadFromStream ()\r
141                 {\r
142                         int ret = source.Read ();\r
143                         if (ret != '\\')\r
144                                 return ret;\r
145                         ret = source.Read ();\r
146                         switch (ret) {\r
147                         case 'x':\r
148                                 int tmp;\r
149                                 int xcount = 0;\r
150                                 do {\r
151                                         xcount++;\r
152                                         tmp = source.Read ();\r
153                                 } while (tmp == 'x');\r
154                                 if (tmp != '{') {\r
155                                         peekString = new string ('x', xcount);\r
156                                         if (tmp >= 0)\r
157                                                 peekString += (char) tmp;\r
158                                         return '\\';\r
159                                 }\r
160                                 ret = ReadEscapedHexNumber (0);\r
161                                 if (peekChar != '}')\r
162                                         break;\r
163                                 peekChar = 0;\r
164                                 return ret;\r
165                         }\r
166                         peekString = new string ((char) ret, 1);\r
167                         return '\\';\r
168                 }\r
169 \r
170                 private int PeekChar ()\r
171                 {\r
172                         if (peekChar == 0) {\r
173                                 if (peekString != null) {\r
174                                         peekChar = peekString [0];\r
175                                         peekString = peekString.Length == 1 ?\r
176                                                 null : peekString.Substring (1);\r
177                                 }\r
178                                 else\r
179                                         peekChar = ReadFromStream ();\r
180                         }\r
181 \r
182                         return peekChar;\r
183                 }\r
184 \r
185                 private int ReadChar ()\r
186                 {\r
187                         int ret;\r
188                         if (peekChar != 0) {\r
189                                 ret = peekChar;\r
190                                 peekChar = 0;\r
191                         }\r
192                         else if (peekString != null) {\r
193                                 ret = peekString [0];\r
194                                 peekString = peekString.Length == 1 ?\r
195                                         null : peekString.Substring (1);\r
196                         }\r
197                         else\r
198                                 ret = ReadFromStream ();\r
199 \r
200                         if (nextIncrementLine) {\r
201                                 line++;\r
202                                 column = 1;\r
203                                 nextIncrementLine = false;\r
204                         }\r
205                         switch (ret) {\r
206                         case '\r':\r
207                                 break;\r
208                         case '\n':\r
209                                 nextIncrementLine = true;\r
210                                 goto default;\r
211                         default:\r
212                                 column++;\r
213                                 break;\r
214                         }\r
215 \r
216                         return ret;\r
217                 }\r
218 \r
219                 private void SkipWhitespaces ()\r
220                 {\r
221                         while (true) {\r
222                                 switch (PeekChar ()) {\r
223                                 case ' ':\r
224                                 case '\t':\r
225                                 case '\r':\r
226                                 case '\n':\r
227                                         ReadChar ();\r
228                                         continue;\r
229                                 default:\r
230                                         return;\r
231                                 }\r
232                         }\r
233                 }\r
234 \r
235                 char [] nameBuffer = new char [30];\r
236 \r
237                 private string ReadQuoted (char quoteChar)\r
238                 {\r
239                         int index = 0;\r
240                         bool loop = true;\r
241                         while (loop) {\r
242                                 int c = ReadChar ();\r
243                                 switch (c) {\r
244                                 case -1:\r
245                                 case '\'':\r
246                                 case '\"':\r
247                                         if (quoteChar != c)\r
248                                                 goto default;\r
249                                         loop = false;\r
250                                         break;\r
251                                 default:\r
252                                         if (c < 0)\r
253                                                 throw new RelaxngException ("Unterminated quoted literal.");\r
254                                         if (XmlChar.IsInvalid (c))\r
255                                                 throw new RelaxngException ("Invalid character in literal.");\r
256                                         AppendNameChar (c, ref index);\r
257                                         break;\r
258                                 }\r
259                         }\r
260 \r
261                         return new string (nameBuffer, 0, index);\r
262                 }\r
263 \r
264                 private void AppendNameChar (int c, ref int index)\r
265                 {\r
266                         if (nameBuffer.Length == index) {\r
267                                 char [] arr = new char [index * 2];\r
268                                 Array.Copy (nameBuffer, arr, index);\r
269                                 nameBuffer = arr;\r
270                         }\r
271                         if (c > 0x10000) {\r
272                                 AppendNameChar ((c - 0x10000) / 0x400 + 0xD800, ref index);\r
273                                 AppendNameChar ((c - 0x10000) % 0x400 + 0xDC00, ref index);\r
274                         }\r
275                         else\r
276                                 nameBuffer [index++] = (char) c;\r
277                 }\r
278 \r
279                 private string ReadTripleQuoted (char quoteChar)\r
280                 {\r
281                         int index = 0;\r
282                         bool loop = true;\r
283                         do {\r
284                                 int c = ReadChar ();\r
285                                 switch (c) {\r
286                                 case -1:\r
287                                 case '\'':\r
288                                 case '\"':\r
289                                         // 1\r
290                                         if (quoteChar != c)\r
291                                                 goto default;\r
292                                         // 2\r
293                                         if ((c = PeekChar ()) != quoteChar) {\r
294                                                 AppendNameChar (quoteChar, ref index);\r
295                                                 goto default;\r
296                                         }\r
297                                         ReadChar ();\r
298                                         // 3\r
299                                         if ((c = PeekChar ()) == quoteChar) {\r
300                                                 ReadChar ();\r
301                                                 loop = false;\r
302                                                 break;\r
303                                         }\r
304                                         AppendNameChar (quoteChar, ref index);\r
305                                         AppendNameChar (quoteChar, ref index);\r
306                                         break;\r
307                                 default:\r
308                                         if (c < 0)\r
309                                                 throw new RelaxngException ("Unterminated triple-quoted literal.");\r
310                                         if (XmlChar.IsInvalid (c))\r
311                                                 throw new RelaxngException ("Invalid character in literal.");\r
312                                         AppendNameChar (c, ref index);\r
313                                         break;\r
314                                 }\r
315                         } while (loop);\r
316 \r
317                         return new string (nameBuffer, 0, index);\r
318                 }\r
319 \r
320                 private string ReadOneName ()\r
321                 {\r
322                         int index = 0;\r
323                         bool loop = true;\r
324                         int c = PeekChar ();\r
325                         if (!XmlChar.IsFirstNameChar (c) || !XmlChar.IsNCNameChar (c))\r
326                                 throw new RelaxngException (String.Format ("Invalid NCName start character: {0}", c));\r
327                         do {\r
328                                 c = PeekChar ();\r
329                                 switch (c) {\r
330                                 case -1:\r
331                                 case ' ':\r
332                                 case '\t':\r
333                                 case '\r':\r
334                                 case '\n':\r
335                                         ReadChar ();\r
336                                         loop = false;\r
337                                         break;\r
338                                 default:\r
339                                         if (!XmlChar.IsNCNameChar (c)) {\r
340                                                 loop = false;\r
341                                                 break;\r
342                                         }\r
343 \r
344                                         ReadChar ();\r
345                                         if (nameBuffer.Length == index) {\r
346                                                 char [] arr = new char [index * 2];\r
347                                                 Array.Copy (nameBuffer, arr, index);\r
348                                                 nameBuffer = arr;\r
349                                         }\r
350                                         nameBuffer [index++] = (char) c;\r
351                                         break;\r
352                                 }\r
353                         } while (loop);\r
354 \r
355                         return new string (nameBuffer, 0, index);\r
356                 }\r
357 \r
358                 private string ReadLine ()\r
359                 {\r
360                         string s = source.ReadLine ();\r
361                         line++;\r
362                         column = 1;\r
363                         return s;\r
364                 }\r
365 \r
366                 private int ParseToken (bool backslashed)\r
367                 {\r
368                         SkipWhitespaces ();\r
369                         int c = ReadChar ();\r
370                         string name;\r
371                         switch (c) {\r
372                         case -1:\r
373                                 return Token.EOF;\r
374                         case '=':\r
375                                 return Token.Equal;\r
376                         case '~':\r
377                                 return Token.Tilde;\r
378                         case ',':\r
379                                 return Token.Comma;\r
380                         case '{':\r
381                                 return Token.OpenCurly;\r
382                         case '}':\r
383                                 return Token.CloseCurly;\r
384                         case '(':\r
385                                 return Token.OpenParen;\r
386                         case ')':\r
387                                 return Token.CloseParen;\r
388                         case '[':\r
389                                 return Token.OpenBracket;\r
390                         case ']':\r
391                                 return Token.CloseBracket;\r
392                         case '&':\r
393                                 if (PeekChar () != '=')\r
394                                         return Token.Amp;\r
395                                 ReadChar ();\r
396                                 return Token.AndEquals;\r
397                         case '|':\r
398                                 if (PeekChar () != '=')\r
399                                         return Token.Bar;\r
400                                 ReadChar ();\r
401                                 return Token.OrEquals;\r
402                         case '?':\r
403                                 return Token.Question;\r
404                         case '*':\r
405                                 // See also ':' for NsName\r
406                                 return Token.Asterisk;\r
407                         case '\\':\r
408                                 if (backslashed)\r
409                                         return Token.ERROR;\r
410                                 return ParseToken (true);\r
411                         case '+':\r
412                                 return Token.Plus;\r
413                         case '-':\r
414                                 return Token.Minus;\r
415                         case '>':\r
416                                 if (PeekChar () == '>') {\r
417                                         ReadChar ();\r
418                                         return Token.TwoGreaters;\r
419                                 }\r
420                                 peekChar = '>';\r
421                                 goto default;\r
422                         case '#':\r
423 //                              tokenValue = ReadLine ();\r
424 //                              return Token.Documentation;\r
425                                 ReadLine ();\r
426                                 return ParseToken (false);\r
427                         case '\'':\r
428                         case '\"':\r
429                                 if (PeekChar () != c)\r
430                                         name = ReadQuoted ((char) c);\r
431                                 else {\r
432                                         ReadChar ();\r
433                                         if (PeekChar () == c) {\r
434                                                 ReadChar ();\r
435                                                 name = ReadTripleQuoted ((char) c);\r
436                                         } // else '' or ""\r
437                                         name = String.Empty;\r
438                                 }\r
439                                 int invidx = XmlChar.IndexOfInvalid (name, true) ;\r
440                                 if (invidx >= 0)\r
441                                         throw new RelaxngException (String.Format ("Invalid XML character in compact syntax literal segment at {0:X}", (int) name [invidx]));\r
442                                 tokenValue = name;\r
443                                 return Token.LiteralSegment;\r
444                         default:\r
445                                 if (!XmlChar.IsNCNameChar (c))\r
446                                         throw new RelaxngException ("Invalid NCName character.");\r
447                                 peekChar = c;\r
448                                 name = ReadOneName ();\r
449                                 if (PeekChar () == ':') {\r
450                                         ReadChar ();\r
451                                         if (PeekChar () == '*') {\r
452                                                 ReadChar ();\r
453                                                 tokenValue = name;\r
454                                                 return Token.NsName;\r
455                                         }\r
456                                         tokenValue = name + ":" + ReadOneName ();\r
457                                         return Token.CName;\r
458 \r
459                                 }\r
460                                 tokenValue = name;\r
461                                 if (backslashed)\r
462                                         return Token.QuotedIdentifier;\r
463                                 switch (name) {\r
464                                 case "attribute":\r
465                                         isElement = false;\r
466                                         return Token.KeywordAttribute;\r
467                                 case "element":\r
468                                         isElement = true;\r
469                                         return Token.KeywordElement;\r
470                                 case "datatypes":\r
471                                         return Token.KeywordDatatypes;\r
472                                 case "default":\r
473                                         return Token.KeywordDefault;\r
474                                 case "div":\r
475                                         return Token.KeywordDiv;\r
476                                 case "empty":\r
477                                         return Token.KeywordEmpty;\r
478                                 case "external":\r
479                                         return Token.KeywordExternal;\r
480                                 case "grammar":\r
481                                         return Token.KeywordGrammar;\r
482                                 case "include":\r
483                                         return Token.KeywordInclude;\r
484                                 case "inherit":\r
485                                         return Token.KeywordInherit;\r
486                                 case "list":\r
487                                         return Token.KeywordList;\r
488                                 case "mixed":\r
489                                         return Token.KeywordMixed;\r
490                                 case "namespace":\r
491                                         return Token.KeywordNamespace;\r
492                                 case "notAllowed":\r
493                                         return Token.KeywordNotAllowed;\r
494                                 case "parent":\r
495                                         return Token.KeywordParent;\r
496                                 case "start":\r
497                                         return Token.KeywordStart;\r
498                                 case "string":\r
499                                         return Token.KeywordString;\r
500                                 case "text":\r
501                                         return Token.KeywordText;\r
502                                 case "token":\r
503                                         return Token.KeywordToken;\r
504                                 default:\r
505                                         return Token.NCName;\r
506                                 }\r
507                         }\r
508                 }\r
509 \r
510         }\r
511 }