Merge pull request #2408 from tastywheattasteslikechicken/MoreInterfaceSupport
[mono.git] / mcs / class / corlib / Mono.Xml / SmallXmlParser.cs
1 //
2 // SmallXmlParser.cs
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
16 // 
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
19 // 
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 //
28
29 //
30 // small xml parser that is mostly compatible with
31 //
32
33 using System;
34 using System.Collections;
35 using System.Collections.Generic;
36 using System.Globalization;
37 using System.IO;
38 using System.Text;
39
40 namespace Mono.Xml
41 {
// IContentHandler implementation in which every callback is a no-op.
#if INSIDE_CORLIB
internal
#else
public
#endif
class DefaultHandler : SmallXmlParser.IContentHandler
{
        // Parse lifecycle notifications.
        public void OnStartParsing (SmallXmlParser parser) { }
        public void OnEndParsing (SmallXmlParser parser) { }

        // Element markup notifications.
        public void OnStartElement (string name, SmallXmlParser.IAttrList attrs) { }
        public void OnEndElement (string name) { }

        // Processing instruction notification.
        public void OnProcessingInstruction (string name, string text) { }

        // Text notifications.
        public void OnChars (string s) { }
        public void OnIgnorableWhitespace (string s) { }
}
77
78 #if INSIDE_CORLIB
79         internal
80 #else
81         public
82 #endif
83         class SmallXmlParser
84         {
// Callbacks through which SmallXmlParser reports what it reads.
public interface IContentHandler
{
        // Parse lifecycle: OnStartParsing is invoked before any input is
        // read, OnEndParsing once the whole input has been consumed.
        void OnStartParsing (SmallXmlParser parser);
        void OnEndParsing (SmallXmlParser parser);

        // Element markup. OnStartElement is raised for start tags and for
        // empty-element tags; for the latter OnEndElement follows directly.
        // The parser clears the attribute list right after OnStartElement
        // returns, so copy whatever must outlive the callback.
        void OnStartElement (string name, IAttrList attrs);
        void OnEndElement (string name);

        // Buffered text: OnIgnorableWhitespace when the parser flagged the
        // run as whitespace-only, OnChars otherwise.
        void OnChars (string text);
        void OnIgnorableWhitespace (string text);

        // Processing instruction target (name) and its content (text).
        void OnProcessingInstruction (string name, string text);
}
95
// Read-only view of the attributes of one start tag.
public interface IAttrList
{
        // Number of attributes, and whether there are none at all.
        int Length { get; }
        bool IsEmpty { get; }

        // All attribute names / values.
        string [] Names { get; }
        string [] Values { get; }

        // Positional access.
        string GetName (int i);
        string GetValue (int i);

        // Lookup by attribute name.
        string GetValue (string name);
}
106
// IAttrList backed by two parallel lists: names [i] belongs to values [i].
class AttrListImpl : IAttrList
{
        readonly List<string> names = new List<string> ();
        readonly List<string> values = new List<string> ();

        public int Length {
                get { return names.Count; }
        }

        public bool IsEmpty {
                get { return names.Count == 0; }
        }

        // Each access returns a fresh array copy.
        public string [] Names {
                get { return names.ToArray (); }
        }

        public string [] Values {
                get { return values.ToArray (); }
        }

        public string GetName (int i)
        {
                return names [i];
        }

        public string GetValue (int i)
        {
                return values [i];
        }

        // Value of the first attribute called 'name', or null when there is none.
        public string GetValue (string name)
        {
                int index = names.IndexOf (name);
                return index < 0 ? null : values [index];
        }

        internal void Add (string name, string value)
        {
                names.Add (name);
                values.Add (value);
        }

        internal void Clear ()
        {
                names.Clear ();
                values.Clear ();
        }
}
152
// Parse session: both are assigned by Parse and released by Cleanup.
IContentHandler handler;
TextReader reader;

// Names of the currently open elements, and for each of them the
// xml:space value that applies inside it.
readonly Stack elementNames = new Stack ();
readonly Stack xmlSpaces = new Stack ();
string xmlSpace;

// Pending character data, and whether it is whitespace-only so far.
readonly StringBuilder buffer = new StringBuilder (200);
bool isWhitespace;

// Scratch space for ReadName; replaced by a larger array on demand.
char [] nameBuffer = new char [30];

// Attribute list reused for every start tag.
readonly AttrListImpl attributes = new AttrListImpl ();

// Position used in error reports.
int line = 1;
int column;
bool resetColumn;

public SmallXmlParser ()
{
}
169
// Builds (does not throw) a parser exception that carries msg together
// with the current line and column.
private Exception Error (string msg)
{
        SmallXmlParserException failure = new SmallXmlParserException (msg, line, column);
        return failure;
}
174
// Builds the error reported when the input ends too early; the message
// lists the names of the elements that are still open.
private Exception UnexpectedEndError ()
{
        string [] openElements = new string [elementNames.Count];
        elementNames.CopyTo (openElements, 0);
        string stackDump = String.Join (",", openElements);
        string message = String.Format (
                "Unexpected end of stream. Element stack content is {0}", stackDump);
        return Error (message);
}
182
183
// Tells whether c may appear in an XML name. With start == true the
// stricter rule for the first character of a name is applied.
private bool IsNameChar (char c, bool start)
{
        // Punctuation: ':' and '_' are fine anywhere, '-' and '.' only
        // after the first character.
        if (c == ':' || c == '_')
                return true;
        if (c == '-' || c == '.')
                return !start;

        // Individual characters accepted regardless of their category.
        if (c == '\u0559' || c == '\u06E5' || c == '\u06E6')
                return true;
        if (c >= '\u02BB' && c <= '\u02C1')
                return true;

        UnicodeCategory category = Char.GetUnicodeCategory (c);

        bool letterLike =
                category == UnicodeCategory.LowercaseLetter ||
                category == UnicodeCategory.UppercaseLetter ||
                category == UnicodeCategory.OtherLetter ||
                category == UnicodeCategory.TitlecaseLetter ||
                category == UnicodeCategory.LetterNumber;
        if (letterLike)
                return true;

        bool continuationOnly =
                category == UnicodeCategory.SpacingCombiningMark ||
                category == UnicodeCategory.EnclosingMark ||
                category == UnicodeCategory.NonSpacingMark ||
                category == UnicodeCategory.ModifierLetter ||
                category == UnicodeCategory.DecimalDigitNumber;
        return continuationOnly && !start;
}
221
// True for space, tab, carriage return and line feed; false for anything
// else, including the -1 that Peek returns at the end of the input.
private bool IsWhitespace (int c)
{
        return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
234
235
// Skips a run of whitespace; it is not an error if there is none.
public void SkipWhitespaces ()
{
        const bool whitespaceRequired = false;
        SkipWhitespaces (whitespaceRequired);
}
240
// Moves a run of whitespace into the text buffer. If the run is followed
// by more character data (anything but '<' or the end of the input), the
// buffered text is no longer treated as whitespace-only.
private void HandleWhitespaces ()
{
        for (int next = Peek (); IsWhitespace (next); next = Peek ())
                buffer.Append ((char) Read ());

        int following = Peek ();
        bool moreTextFollows = following >= 0 && following != '<';
        if (moreTextFollows)
                isWhitespace = false;
}
248
// Skips a run of whitespace. When 'expected' is true at least one
// whitespace character has to be present, otherwise an error is thrown.
public void SkipWhitespaces (bool expected)
{
        bool skippedAny = false;
        while (IsWhitespace (Peek ())) {
                Read ();
                skippedAny = true;
        }
        if (expected && !skippedAny)
                throw Error ("Whitespace is expected.");
}
267
268
// Returns the next character of the input without consuming it
// (whatever the underlying TextReader.Peek reports).
private int Peek ()
{
        int next = reader.Peek ();
        return next;
}
273
// Consumes and returns the next character of the input (a negative value
// at the end of the input) and keeps the error-report position current.
//
// 'column' counts the characters consumed so far on the current line: it
// is 0 before anything is read and it restarts at 0 after a line feed, so
// the first character of every line is reported as column 1. (It used to
// restart at 1, which put the first character of every line but the first
// at column 2.) Hitting the end of the input consumes nothing and
// therefore no longer moves the position.
private int Read ()
{
        int ch = reader.Read ();
        if (ch < 0)
                return ch;

        // resetColumn records whether the character just consumed ended a line.
        resetColumn = ch == '\n';
        if (resetColumn) {
                line++;
                column = 0;
        }
        else
                column++;
        return ch;
}
288
// Consumes one character and fails unless it is 'c'. Running out of
// input is reported as an unexpected end of stream.
public void Expect (int c)
{
        int actual = Read ();
        if (actual < 0)
                throw UnexpectedEndError ();
        if (actual == c)
                return;
        throw Error (String.Format ("Expected '{0}' but got {1}", (char) c, (char) actual));
}
297
// Collects characters up to (not including) the terminator 'until', which
// is consumed. With handleReferences set, '&' starts a reference that is
// resolved through ReadReference. The shared text buffer is used as
// scratch space and is empty again on return.
private string ReadUntil (char until, bool handleReferences)
{
        for (;;) {
                if (Peek () < 0)
                        throw UnexpectedEndError ();
                char current = (char) Read ();
                if (current == until)
                        break;
                if (handleReferences && current == '&') {
                        ReadReference ();
                        continue;
                }
                buffer.Append (current);
        }
        string collected = buffer.ToString ();
        buffer.Length = 0;
        return collected;
}
315
// Reads an XML name at the current position and returns it. Throws when
// the next character cannot start a name.
public string ReadName ()
{
        int first = Peek ();
        if (first < 0 || !IsNameChar ((char) first, true))
                throw Error ("XML name start character is expected.");

        int length = 0;
        while (true) {
                int next = Peek ();
                if (next < 0)
                        break;
                char candidate = (char) next;
                if (!IsNameChar (candidate, false))
                        break;
                if (length == nameBuffer.Length) {
                        // Scratch buffer is full: continue in one twice as large.
                        char [] grown = new char [length * 2];
                        Array.Copy (nameBuffer, grown, length);
                        nameBuffer = grown;
                }
                nameBuffer [length] = candidate;
                length++;
                Read ();
        }
        if (length == 0)
                throw Error ("Valid XML name is expected.");
        return new string (nameBuffer, 0, length);
}
337
338
// Parses the whole of 'input' and reports its content to 'handler'.
//
// Throws ArgumentNullException when either argument is null and a parser
// exception (see Error) when the input is malformed, including the case
// where elements are still open at the end of the input.
//
// The parser state is reset in a finally block, so an instance that failed
// (or whose handler threw) can be reused for another document without
// inheriting a stale element stack, text buffer or position.
public void Parse (TextReader input, IContentHandler handler)
{
        if (input == null)
                throw new ArgumentNullException ("input");
        if (handler == null)
                throw new ArgumentNullException ("handler");

        this.reader = input;
        this.handler = handler;

        try {
                handler.OnStartParsing (this);

                while (Peek () >= 0)
                        ReadContent ();
                // Flush text that was still buffered when the input ended.
                HandleBufferedContent ();
                if (elementNames.Count > 0)
                        throw Error (String.Format ("Insufficient close tag: {0}", elementNames.Peek ()));

                handler.OnEndParsing (this);
        }
        finally {
                Cleanup ();
        }
}
356
// Returns the parser to its initial state so it can be used again.
private void Cleanup ()
{
        // Release the session objects.
        reader = null;
        handler = null;

        // Forget the element context.
        elementNames.Clear ();
        xmlSpaces.Clear ();
        xmlSpace = null;
        attributes.Clear ();

        // Drop pending text.
        buffer.Length = 0;
        isWhitespace = false;

        // Rewind the reported position.
        line = 1;
        column = 0;
}
370
// Reads one piece of content at the current position: a run of character
// data, a CDATA section, a comment, a processing instruction, an end tag
// or a start tag (including empty-element tags).
//
// Character data, CDATA content and whitespace are accumulated in the text
// buffer; the buffer is handed to the handler just before a processing
// instruction, an end tag or a start tag is processed. Document type
// declarations are rejected.
public void ReadContent ()
{
        string name;
        if (IsWhitespace (Peek ())) {
                // Whitespace only counts as ignorable if it opens the buffered run.
                if (buffer.Length == 0)
                        isWhitespace = true;
                HandleWhitespaces ();
        }
        if (Peek () != '<') {
                ReadCharacters ();
                return;
        }

        Read (); // '<'
        switch (Peek ()) {
        case '!': // declarations
                Read ();
                if (Peek () == '[') {
                        Read ();
                        if (ReadName () != "CDATA")
                                throw Error ("Invalid declaration markup");
                        Expect ('[');
                        ReadCDATASection ();
                        return;
                }
                else if (Peek () == '-') {
                        ReadComment ();
                        return;
                }
                else if (ReadName () != "DOCTYPE")
                        throw Error ("Invalid declaration markup.");
                else
                        throw Error ("This parser does not support document type.");
        case '?': // PIs
                HandleBufferedContent ();
                Read ();
                name = ReadName ();
                SkipWhitespaces ();
                string text = String.Empty;
                // ReadUntil consumes the '?' it stops at; the instruction
                // ends only when that '?' is directly followed by '>'.
                // Any other '?' belongs to the content. The loop also runs
                // when the content is empty ("<?name?>"), so that the
                // closing '?' is consumed before '>' is expected.
                while (true) {
                        text += ReadUntil ('?', false);
                        if (Peek () == '>')
                                break;
                        text += "?";
                }
                handler.OnProcessingInstruction (
                        name, text);
                Expect ('>');
                return;
        case '/': // end tags
                HandleBufferedContent ();
                if (elementNames.Count == 0)
                        throw UnexpectedEndError ();
                Read ();
                name = ReadName ();
                SkipWhitespaces ();
                string expected = (string) elementNames.Pop ();
                xmlSpaces.Pop ();
                // Back in the parent: its xml:space value applies again.
                if (xmlSpaces.Count > 0)
                        xmlSpace = (string) xmlSpaces.Peek ();
                else
                        xmlSpace = null;
                if (name != expected)
                        throw Error (String.Format ("End tag mismatch: expected {0} but found {1}", expected, name));
                handler.OnEndElement (name);
                Expect ('>');
                return;
        default: // start tags (including empty tags)
                HandleBufferedContent ();
                name = ReadName ();
                while (Peek () != '>' && Peek () != '/')
                        ReadAttribute (attributes);
                handler.OnStartElement (name, attributes);
                // The attribute list is reused for the next start tag.
                attributes.Clear ();
                SkipWhitespaces ();
                if (Peek () == '/') {
                        Read ();
                        handler.OnEndElement (name);
                        // An empty element is closed at once, so an
                        // xml:space attribute it carried must not apply to
                        // what follows: fall back to the enclosing
                        // element's value.
                        if (xmlSpaces.Count > 0)
                                xmlSpace = (string) xmlSpaces.Peek ();
                        else
                                xmlSpace = null;
                }
                else {
                        elementNames.Push (name);
                        xmlSpaces.Push (xmlSpace);
                }
                Expect ('>');
                return;
        }
}
459
// Hands the buffered text, if any, to the handler - as ignorable
// whitespace or as character data depending on isWhitespace - and then
// resets the buffer and the flag.
private void HandleBufferedContent ()
{
        if (buffer.Length > 0) {
                string pending = buffer.ToString ();
                if (isWhitespace)
                        handler.OnIgnorableWhitespace (pending);
                else
                        handler.OnChars (pending);
                buffer.Length = 0;
                isWhitespace = false;
        }
}
471
// Buffers character data up to the next '<' or the end of the input,
// resolving references on the way. The buffered text is no longer
// whitespace-only once this runs.
private void ReadCharacters ()
{
        isWhitespace = false;
        for (int next = Peek (); next != -1 && next != '<'; next = Peek ()) {
                if (next == '&') {
                        Read ();
                        ReadReference ();
                }
                else
                        buffer.Append ((char) Read ());
        }
}
492
// Resolves the reference whose '&' has just been consumed and appends the
// character it stands for to the text buffer.
//
// Supported are character references (&#NNN; and &#xHHH;) and the five
// predefined entities amp, quot, apos, lt and gt. Any other entity name is
// an error. The terminating ';' is consumed in every case.
private void ReadReference ()
{
        if (Peek () == '#') {
                // character reference
                Read ();
                int codePoint = ReadCharacterReference ();
                Expect (';');
                if (codePoint > 0xFFFF)
                        // Outside the BMP: needs a surrogate pair.
                        buffer.Append (Char.ConvertFromUtf32 (codePoint));
                else
                        buffer.Append ((char) codePoint);
                return;
        }

        string name = ReadName ();
        Expect (';');
        switch (name) {
        case "amp":
                buffer.Append ('&');
                break;
        case "quot":
                buffer.Append ('"');
                break;
        case "apos":
                buffer.Append ('\'');
                break;
        case "lt":
                buffer.Append ('<');
                break;
        case "gt":
                buffer.Append ('>');
                break;
        default:
                throw Error ("General non-predefined entity reference is not supported in this parser.");
        }
}

// Reads the digits of a character reference (the "&#" is already consumed)
// and returns the code point they denote. A leading 'x' selects
// hexadecimal notation, otherwise the digits are decimal. Stops in front
// of the first character that is not a digit of the selected radix; the
// caller is responsible for the terminating ';'.
//
// Throws when there is no digit at all, when the value exceeds U+10FFFF or
// when it denotes a surrogate code point (U+D800..U+DFFF).
private int ReadCharacterReference ()
{
        int radix = 10;
        if (Peek () == 'x') { // hex
                Read ();
                radix = 16;
        }

        int n = 0;
        int digitCount = 0;
        for (int i = Peek (); i >= 0; i = Peek ()) {
                int digit;
                if ('0' <= i && i <= '9')
                        digit = i - '0';
                else if ('A' <= i && i <= 'F')
                        digit = i - 'A' + 10;
                else if ('a' <= i && i <= 'f')
                        digit = i - 'a' + 10;
                else
                        break;
                if (digit >= radix)
                        break; // hex letter inside a decimal reference
                n = n * radix + digit;
                // Checking on every step also keeps n from overflowing.
                if (n > 0x10FFFF)
                        throw Error ("Character reference is out of the Unicode range.");
                digitCount++;
                Read ();
        }
        if (digitCount == 0)
                throw Error ("Character reference must contain at least one digit.");
        if (0xD800 <= n && n <= 0xDFFF)
                throw Error ("Character reference must not denote a surrogate code point.");
        return n;
}
551
// Reads one attribute (name = quoted value) of a start tag and adds it to
// 'a'. Leading whitespace is mandatory. If only whitespace precedes the
// end of the tag, nothing is added. An xml:space attribute additionally
// updates the current xml:space value.
private void ReadAttribute (AttrListImpl a)
{
        SkipWhitespaces (true);
        int next = Peek ();
        if (next == '/' || next == '>')
                // Only trailing whitespace was left in the tag.
                return;

        string name = ReadName ();
        SkipWhitespaces ();
        Expect ('=');
        SkipWhitespaces ();

        // The value runs up to the matching quote; references are resolved.
        int quote = Read ();
        if (quote != '\'' && quote != '"')
                throw Error ("Invalid attribute value markup.");
        string value = ReadUntil ((char) quote, true);

        if (name == "xml:space")
                xmlSpace = value;
        a.Add (name, value);
}
578
// Reads the content of a CDATA section (the opening "<![CDATA[" has been
// consumed) up to and including the closing "]]>", appending the content
// to the text buffer. Throws if the input ends before the section does.
private void ReadCDATASection ()
{
        // CDATA content is character data. Without this, a section that
        // follows buffered whitespace would be delivered to the handler
        // as ignorable whitespace together with that whitespace.
        isWhitespace = false;

        // Number of consecutive ']' seen but not yet copied to the buffer;
        // the last two of them may turn out to belong to the terminator.
        int nBracket = 0;
        while (true) {
                if (Peek () < 0)
                        throw UnexpectedEndError ();
                char c = (char) Read ();
                if (c == ']') {
                        nBracket++;
                        continue;
                }
                if (c == '>' && nBracket > 1) {
                        // "]]>" found: brackets in front of it are content.
                        for (int i = nBracket; i > 2; i--)
                                buffer.Append (']');
                        return;
                }
                // Not a terminator: the pending brackets were content.
                for (int i = 0; i < nBracket; i++)
                        buffer.Append (']');
                nBracket = 0;
                buffer.Append (c);
        }
}
601
// Skips a comment. On entry "<!" has been consumed, so the next two
// characters must be "--". The comment ends at "-->"; a "--" that is not
// followed by '>' is an error. Throws an unexpected-end error if the input
// runs out before the comment is closed (this used to loop forever,
// because reading at the end of the input keeps returning -1).
private void ReadComment ()
{
        Expect ('-');
        Expect ('-');
        while (true) {
                int c = Read ();
                if (c < 0)
                        throw UnexpectedEndError ();
                if (c != '-')
                        continue;

                c = Read ();
                if (c < 0)
                        throw UnexpectedEndError ();
                if (c != '-')
                        continue;

                c = Read ();
                if (c < 0)
                        throw UnexpectedEndError ();
                if (c != '>')
                        throw Error ("'--' is not allowed inside comment markup.");
                break;
        }
}
616         }
617
// Error raised by SmallXmlParser. The message is the given text followed
// by the position, which is also available through Line and Column.
#if INSIDE_CORLIB
internal
#else
[CLSCompliant(false)]
public
#endif
class SmallXmlParserException : SystemException
{
        readonly int lineNumber;
        readonly int columnNumber;

        public SmallXmlParserException (string msg, int line, int column)
                : base (String.Format ("{0}. At ({1},{2})", msg, line, column))
        {
                lineNumber = line;
                columnNumber = column;
        }

        // Line the parser was at when the error was created.
        public int Line {
                get { return lineNumber; }
        }

        // Column the parser was at when the error was created.
        public int Column {
                get { return columnNumber; }
        }
}
644 }
645