New test.
[mono.git] / mcs / class / corlib / Mono.Xml / SmallXmlParser.cs
1 //
2 // SmallXmlParser.cs
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
16 // 
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
19 // 
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 //
28
29 //
30 // Small XML parser that handles a subset of XML: DOCTYPE declarations and non-predefined entity references are rejected.
31 //
32
33 using System;
34 using System.Collections;
35 using System.Globalization;
36 using System.IO;
37 using System.Text;
38
39 namespace Mono.Xml
40 {
//
// Do-nothing implementation of SmallXmlParser.IContentHandler.
//
// Every callback is an empty virtual method, so a handler that is only
// interested in some events can derive from this class and override just
// those callbacks.
//
#if INSIDE_CORLIB
internal
#else
public
#endif
class DefaultHandler : SmallXmlParser.IContentHandler
{
	// Parsing is about to begin.
	public virtual void OnStartParsing (SmallXmlParser parser)
	{
	}

	// Parsing finished without error.
	public virtual void OnEndParsing (SmallXmlParser parser)
	{
	}

	// A start tag (or empty-element tag) named 'name' was read.
	public virtual void OnStartElement (string name, SmallXmlParser.IAttrList attrs)
	{
	}

	// The element named 'name' was closed.
	public virtual void OnEndElement (string name)
	{
	}

	// Character data was read.
	public virtual void OnChars (string s)
	{
	}

	// Whitespace-only content was read.
	public virtual void OnIgnorableWhitespace (string s)
	{
	}

	// A processing instruction with target 'name' and data 'text' was read.
	public virtual void OnProcessingInstruction (string name, string text)
	{
	}
}
76
77 #if INSIDE_CORLIB
78         internal
79 #else
80         public
81 #endif
82         class SmallXmlParser
83         {
// Callbacks through which the parser reports what it reads.
public interface IContentHandler
{
	// Invoked by Parse() before any input is consumed.
	void OnStartParsing (SmallXmlParser parser);

	// Invoked by Parse() after the whole input was consumed and
	// every element was closed.
	void OnEndParsing (SmallXmlParser parser);

	// Invoked for each start tag and each empty-element tag, after
	// its attributes were read. The parser clears 'attrs' right
	// after this call returns, so copy anything that must be kept.
	void OnStartElement (string name, IAttrList attrs);

	// Invoked for each end tag, and right after OnStartElement()
	// for an empty-element tag.
	void OnEndElement (string name);

	// Invoked for each processing instruction; 'name' is the
	// target and 'text' is the data that follows it.
	void OnProcessingInstruction (string name, string text);

	// Invoked with buffered character data.
	void OnChars (string text);

	// Invoked with buffered content that the parser classified as
	// whitespace only.
	void OnIgnorableWhitespace (string text);
}
94
// Read-only view of the attributes of one start tag, in the order
// in which they were read.
public interface IAttrList
{
	// Number of attributes.
	int Length { get; }

	// True when there is no attribute at all.
	bool IsEmpty { get; }

	// Name of the attribute at index 'i'.
	string GetName (int i);

	// Value of the attribute at index 'i'.
	string GetValue (int i);

	// Value of the attribute called 'name'.
	string GetValue (string name);

	// All attribute names.
	string [] Names { get; }

	// All attribute values.
	string [] Values { get; }
}
105
// IAttrList implementation backed by two parallel ArrayLists: the
// entry at a given index of attrNames belongs to the entry at the
// same index of attrValues.
class AttrListImpl : IAttrList
{
	ArrayList attrNames = new ArrayList ();
	ArrayList attrValues = new ArrayList ();

	public int Length {
		get { return attrNames.Count; }
	}

	public bool IsEmpty {
		get { return attrNames.Count == 0; }
	}

	public string GetName (int i)
	{
		return (string) attrNames [i];
	}

	public string GetValue (int i)
	{
		return (string) attrValues [i];
	}

	// Returns the value of the first attribute whose name equals
	// 'name', or null when there is none.
	public string GetValue (string name)
	{
		int count = attrNames.Count;
		for (int idx = 0; idx < count; idx++) {
			string candidate = (string) attrNames [idx];
			if (candidate == name)
				return (string) attrValues [idx];
		}
		return null;
	}

	// A fresh array holding every name.
	public string [] Names {
		get {
			string [] names = new string [attrNames.Count];
			attrNames.CopyTo (names);
			return names;
		}
	}

	// A fresh array holding every value.
	public string [] Values {
		get {
			string [] values = new string [attrValues.Count];
			attrValues.CopyTo (values);
			return values;
		}
	}

	// Appends one name/value pair.
	internal void Add (string name, string value)
	{
		attrNames.Add (name);
		attrValues.Add (value);
	}

	// Removes every pair so that the instance can be reused.
	internal void Clear ()
	{
		attrNames.Clear ();
		attrValues.Clear ();
	}
}
151
// Receiver of the parse events; assigned by Parse().
IContentHandler handler;
// Input being parsed; assigned by Parse().
TextReader reader;
// Names of the elements that are currently open, innermost on top.
Stack elementNames = new Stack ();
// xmlSpace value recorded for each open element.
Stack xmlSpaces = new Stack ();
// Value of the most recently read xml:space attribute; restored
// from xmlSpaces when an end tag is read.
string xmlSpace;
// Accumulates character data and attribute values.
StringBuilder buffer = new StringBuilder (200);
// Scratch storage for ReadName(); replaced by a larger array on demand.
char [] nameBuffer = new char [30];
// True while 'buffer' holds nothing but whitespace.
bool isWhitespace;

// Attribute list instance that is reused for every start tag.
AttrListImpl attributes = new AttrListImpl ();
// Position used in error messages.
int line = 1;
int column;
// Helper flag used by Read() while it updates the position.
bool resetColumn;
164
// Creates a parser. All state is set up by the field initializers;
// the input and the handler are supplied to Parse().
public SmallXmlParser ()
{
}
168
// Builds (but does not throw) a parser exception that carries 'msg'
// and the current line/column.
private Exception Error (string msg)
{
	SmallXmlParserException failure = new SmallXmlParserException (msg, line, column);
	return failure;
}
173
// Builds the exception used when the input ends too early. The
// message lists the names of the elements that are still open,
// innermost first, separated by commas.
private Exception UnexpectedEndError ()
{
	int depth = elementNames.Count;
	string [] openElements = new string [depth];
	elementNames.CopyTo (openElements, 0);
	string stackText = String.Join (",", openElements);
	return Error ("Unexpected end of stream. Element stack content is " + stackText);
}
181
182
// Tells whether 'c' may appear in an XML name. When 'start' is true
// the stricter rule for the first character of a name is applied.
private bool IsNameChar (char c, bool start)
{
	// Punctuation that is always allowed.
	if (c == ':' || c == '_')
		return true;
	// Punctuation that is allowed anywhere but in first position.
	if (c == '-' || c == '.')
		return !start;

	// Individually listed characters; the range test only skips
	// these comparisons for the low code points.
	if (c > 0x100) {
		if (c == '\u0559' || c == '\u06E5' || c == '\u06E6')
			return true;
		if (c >= '\u02BB' && c <= '\u02C1')
			return true;
	}

	UnicodeCategory category = Char.GetUnicodeCategory (c);

	bool allowedAnywhere =
		category == UnicodeCategory.LowercaseLetter ||
		category == UnicodeCategory.UppercaseLetter ||
		category == UnicodeCategory.OtherLetter ||
		category == UnicodeCategory.TitlecaseLetter ||
		category == UnicodeCategory.LetterNumber;
	if (allowedAnywhere)
		return true;

	bool allowedAfterStart =
		category == UnicodeCategory.SpacingCombiningMark ||
		category == UnicodeCategory.EnclosingMark ||
		category == UnicodeCategory.NonSpacingMark ||
		category == UnicodeCategory.ModifierLetter ||
		category == UnicodeCategory.DecimalDigitNumber;
	return allowedAfterStart && !start;
}
220
// True when 'c' is a space, a tab, a carriage return or a line feed.
// Any other value, including the -1 returned at end of input, gives
// false.
private bool IsWhitespace (int c)
{
	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
233
234
// Consumes any run of whitespace at the current position; it is not
// an error when there is none.
public void SkipWhitespaces ()
{
	bool whitespaceRequired = false;
	SkipWhitespaces (whitespaceRequired);
}
239
// Moves the run of whitespace at the current position into 'buffer'.
// When the run is followed by anything but '<' or the end of the
// input, the buffered content is no longer whitespace-only and
// 'isWhitespace' is cleared.
private void HandleWhitespaces ()
{
	for (int next = Peek (); IsWhitespace (next); next = Peek ())
		buffer.Append ((char) Read ());

	int following = Peek ();
	if (following >= 0 && following != '<')
		isWhitespace = false;
}
247
// Consumes the run of whitespace at the current position. When
// 'expected' is true and not a single whitespace character is found,
// a parser error is thrown.
public void SkipWhitespaces (bool expected)
{
	bool consumedAny = false;
	while (IsWhitespace (Peek ())) {
		Read ();
		consumedAny = true;
	}
	if (expected && !consumedAny)
		throw Error ("Whitespace is expected.");
}
266
267
// Returns what the underlying reader's Peek() returns: the next
// character without consuming it.
private int Peek ()
{
	int next = reader.Peek ();
	return next;
}
272
// Consumes one character from the reader and keeps the line/column
// used in error messages up to date. Returns the character, or the
// reader's negative end-of-input value.
//
// The position is left alone at end of input, because nothing was
// consumed. A line feed moves to the next line and sets the column
// back to 0, the same value a fresh parser starts with, so the first
// character of every line is reported as column 1.
private int Read ()
{
	int i = reader.Read ();
	if (i < 0)
		return i;
	if (i == '\n') {
		line++;
		column = 0;
	}
	else
		column++;
	return i;
}
287
// Consumes one character and requires it to be 'c'. Throws the
// "unexpected end" error when the input is exhausted and a parser
// error naming both characters when a different one is found.
public void Expect (int c)
{
	int actual = Read ();
	if (actual < 0)
		throw UnexpectedEndError ();
	if (actual == c)
		return;
	throw Error (String.Format ("Expected '{0}' but got {1}", (char) c, (char) actual));
}
296
// Reads up to and including the next occurrence of 'until' and
// returns the text before it. The text is collected in the shared
// 'buffer', which is emptied again before returning. With
// 'handleReferences' set, each '&' is handed to ReadReference()
// instead of being copied. Reaching the end of the input first is an
// error.
private string ReadUntil (char until, bool handleReferences)
{
	for (;;) {
		if (Peek () < 0)
			throw UnexpectedEndError ();
		char current = (char) Read ();
		if (current == until)
			break;
		if (handleReferences && current == '&') {
			ReadReference ();
			continue;
		}
		buffer.Append (current);
	}
	string text = buffer.ToString ();
	buffer.Length = 0;
	return text;
}
314
// Reads an XML name at the current position and returns it. The
// characters are gathered in 'nameBuffer', which is replaced by an
// array of twice the size whenever it is full.
public string ReadName ()
{
	int first = Peek ();
	if (first < 0 || !IsNameChar ((char) first, true))
		throw Error ("XML name start character is expected.");

	int length = 0;
	while (true) {
		int next = Peek ();
		if (next < 0)
			break;
		char c = (char) next;
		if (!IsNameChar (c, false))
			break;
		if (length == nameBuffer.Length) {
			char [] grown = new char [length * 2];
			Array.Copy (nameBuffer, grown, length);
			nameBuffer = grown;
		}
		nameBuffer [length] = c;
		length++;
		Read ();
	}
	if (length == 0)
		throw Error ("Valid XML name is expected.");
	return new string (nameBuffer, 0, length);
}
336
337
// Parses the whole of 'input' and reports what is read to 'handler'.
//
// Throws ArgumentNullException when either argument is null, and a
// parser exception when the input is not acceptable (including
// elements that are still open at the end of the input).
//
// The parser state is reset in a finally block. A failed parse
// therefore does not leave open-element stacks, buffered text or a
// stale position behind for the next call on the same instance.
public void Parse (TextReader input, IContentHandler handler)
{
	if (input == null)
		throw new ArgumentNullException ("input");
	if (handler == null)
		throw new ArgumentNullException ("handler");

	this.reader = input;
	this.handler = handler;

	try {
		handler.OnStartParsing (this);

		while (Peek () >= 0)
			ReadContent ();
		// Report whatever text is still buffered at the end.
		HandleBufferedContent ();
		if (elementNames.Count > 0)
			throw Error (String.Format ("Insufficient close tag: {0}", elementNames.Peek ()));

		handler.OnEndParsing (this);
	} finally {
		Cleanup ();
	}
}
355
// Puts the parser back into its initial state: drops the reader and
// the handler, empties every collection and the text buffer, and
// rewinds the position to line 1, column 0.
private void Cleanup ()
{
	// references handed in by the caller
	reader = null;
	handler = null;

	// per-document collections
	elementNames.Clear ();
	xmlSpaces.Clear ();
	attributes.Clear ();

	// buffered text and its flags
	buffer.Length = 0;
	isWhitespace = false;
	xmlSpace = null;

	// position
	line = 1;
	column = 0;
}
369
// Reads one piece of content at the current position: a run of
// character data, a CDATA section, a comment, a processing
// instruction, an end tag or a start/empty-element tag. A DOCTYPE
// declaration is rejected.
//
// Leading whitespace is buffered first. When the buffer was empty,
// the content is provisionally flagged as whitespace-only.
public void ReadContent ()
{
	string name;
	if (IsWhitespace (Peek ())) {
		if (buffer.Length == 0)
			isWhitespace = true;
		HandleWhitespaces ();
	}

	int next = Peek ();
	if (next < 0)
		// Only whitespace was left in the input. ReadCharacters()
		// must not run here: it would clear isWhitespace and the
		// trailing whitespace would be reported as character data.
		return;
	if (next != '<') {
		ReadCharacters ();
		return;
	}

	Read (); // '<'
	switch (Peek ()) {
	case '!': // declarations
		Read ();
		if (Peek () == '[') {
			Read ();
			if (ReadName () != "CDATA")
				throw Error ("Invalid declaration markup");
			Expect ('[');
			ReadCDATASection ();
			return;
		}
		else if (Peek () == '-') {
			ReadComment ();
			return;
		}
		else if (ReadName () != "DOCTYPE")
			throw Error ("Invalid declaration markup.");
		else
			throw Error ("This parser does not support document type.");
	case '?': // PIs
		HandleBufferedContent ();
		Read ();
		name = ReadName ();
		SkipWhitespaces ();
		string text = String.Empty;
		// Collect the data up to the closing "?>". A '?' that is
		// not followed by '>' belongs to the data. The loop also
		// runs when the data is empty, so that the '?' of the
		// closing "?>" is consumed in that case too.
		while (true) {
			text += ReadUntil ('?', false);
			if (Peek () == '>')
				break;
			text += "?";
		}
		handler.OnProcessingInstruction (
			name, text);
		Expect ('>');
		return;
	case '/': // end tags
		HandleBufferedContent ();
		if (elementNames.Count == 0)
			throw UnexpectedEndError ();
		Read ();
		name = ReadName ();
		SkipWhitespaces ();
		string expected = (string) elementNames.Pop ();
		// Go back to the xml:space value of the parent element.
		xmlSpaces.Pop ();
		if (xmlSpaces.Count > 0)
			xmlSpace = (string) xmlSpaces.Peek ();
		else
			xmlSpace = null;
		if (name != expected)
			throw Error (String.Format ("End tag mismatch: expected {0} but found {1}", expected, name));
		handler.OnEndElement (name);
		Expect ('>');
		return;
	default: // start tags (including empty tags)
		HandleBufferedContent ();
		name = ReadName ();
		while (Peek () != '>' && Peek () != '/')
			ReadAttribute (attributes);
		handler.OnStartElement (name, attributes);
		// The attribute list is reused for the next start tag.
		attributes.Clear ();
		SkipWhitespaces ();
		if (Peek () == '/') {
			// Empty-element tag: report the end right away.
			Read ();
			handler.OnEndElement (name);
		}
		else {
			elementNames.Push (name);
			xmlSpaces.Push (xmlSpace);
		}
		Expect ('>');
		return;
	}
}
458
// Hands the buffered text, if there is any, to the handler: through
// OnIgnorableWhitespace() when it is flagged as whitespace-only and
// through OnChars() otherwise. The buffer and the flag are reset
// afterwards.
private void HandleBufferedContent ()
{
	if (buffer.Length > 0) {
		string content = buffer.ToString ();
		if (isWhitespace)
			handler.OnIgnorableWhitespace (content);
		else
			handler.OnChars (content);
		buffer.Length = 0;
		isWhitespace = false;
	}
}
470
// Buffers character data up to the next '<' or the end of the input.
// Each '&' is consumed and passed on to ReadReference(). The
// buffered content is marked as not being whitespace-only.
private void ReadCharacters ()
{
	isWhitespace = false;
	for (int next = Peek (); next >= 0 && next != '<'; next = Peek ()) {
		if (next == '&') {
			Read ();
			ReadReference ();
		}
		else
			buffer.Append ((char) Read ());
	}
}
491
// Reads a reference whose leading '&' was already consumed and
// appends the character(s) it stands for to 'buffer'.
//
// Supported are the five predefined entities (amp, quot, apos, lt,
// gt) and numeric character references in decimal or hexadecimal
// form. Any other entity name is an error. The terminating ';' is
// consumed in every case.
private void ReadReference ()
{
	if (Peek () == '#') {
		// character reference
		Read ();
		int codePoint = ReadCharacterReference ();
		Expect (';');
		if (codePoint > 0xFFFF) {
			// Outside the BMP: emit a UTF-16 surrogate pair.
			int offset = codePoint - 0x10000;
			buffer.Append ((char) (0xD800 + (offset >> 10)));
			buffer.Append ((char) (0xDC00 + (offset & 0x3FF)));
		}
		else
			buffer.Append ((char) codePoint);
	} else {
		string name = ReadName ();
		Expect (';');
		switch (name) {
		case "amp":
			buffer.Append ('&');
			break;
		case "quot":
			buffer.Append ('"');
			break;
		case "apos":
			buffer.Append ('\'');
			break;
		case "lt":
			buffer.Append ('<');
			break;
		case "gt":
			buffer.Append ('>');
			break;
		default:
			throw Error ("General non-predefined entity reference is not supported in this parser.");
		}
	}
}

// Reads the digits of a numeric character reference (the part after
// "&#") and returns the code point. A leading 'x' selects
// hexadecimal, otherwise the digits are decimal. Reading stops at the
// first character that is not a digit; the ';' is left for the
// caller.
//
// Throws a parser error when there is no digit at all or when the
// value exceeds U+10FFFF. The range check runs after every digit, so
// the accumulator cannot overflow.
private int ReadCharacterReference ()
{
	bool hex = Peek () == 'x';
	if (hex)
		Read ();

	int n = 0;
	int digitCount = 0;
	for (int i = Peek (); i >= 0; i = Peek ()) {
		int digit;
		if ('0' <= i && i <= '9')
			digit = i - '0';
		else if (hex && 'A' <= i && i <= 'F')
			digit = i - 'A' + 10;
		else if (hex && 'a' <= i && i <= 'f')
			digit = i - 'a' + 10;
		else
			break;
		// Parenthesized on purpose: '+' binds tighter than '<<'.
		n = hex ? (n << 4) + digit : (n * 10) + digit;
		if (n > 0x10FFFF)
			throw Error ("Character reference is out of range.");
		digitCount++;
		Read ();
	}
	if (digitCount == 0)
		throw Error ("Invalid character reference.");
	return n;
}
550
// Reads one attribute (name, '=', quoted value) and adds it to 'a'.
// At least one whitespace character must precede it. When the
// whitespace is directly followed by '/' or '>', there is no
// attribute and the method returns after consuming the whitespace.
// The value of an attribute named xml:space is also stored in
// 'xmlSpace'.
private void ReadAttribute (AttrListImpl a)
{
	SkipWhitespaces (true);
	int next = Peek ();
	if (next == '/' || next == '>')
		// only trailing whitespace was left in the tag
		return;

	string name = ReadName ();
	SkipWhitespaces ();
	Expect ('=');
	SkipWhitespaces ();

	// The value runs up to the matching quote character.
	int quote = Read ();
	if (quote != '\'' && quote != '"')
		throw Error ("Invalid attribute value markup.");
	string value = ReadUntil ((char) quote, true);

	if (name == "xml:space")
		xmlSpace = value;
	a.Add (name, value);
}
577
// Reads the body of a CDATA section, whose opening marker was already
// consumed, up to and including the closing "]]>". The content is
// appended to 'buffer' verbatim.
//
// CDATA content is character data, so the buffer is marked as not
// being whitespace-only. Without this, whitespace buffered just
// before the section would make the combined text be reported as
// ignorable whitespace.
private void ReadCDATASection ()
{
	isWhitespace = false;
	// Number of consecutive ']' read that are not buffered yet.
	int nBracket = 0;
	while (true) {
		if (Peek () < 0)
			throw UnexpectedEndError ();
		char c = (char) Read ();
		if (c == ']')
			nBracket++;
		else if (c == '>' && nBracket > 1) {
			// The last two ']' belong to the terminator; any
			// extra ones before them are content.
			for (int i = nBracket; i > 2; i--)
				buffer.Append (']');
			break;
		}
		else {
			// Not a terminator: the pending ']' are content.
			for (int i = 0; i < nBracket; i++)
				buffer.Append (']');
			nBracket = 0;
			buffer.Append (c);
		}
	}
}
600
// Skips a comment. On entry the opening '<' and '!' were already
// consumed; the two '-' that follow are checked here and the comment
// is read up to and including the closing "-->". Its text is
// discarded.
//
// Throws the "unexpected end" error when the input ends inside the
// comment, and a parser error when "--" occurs without being directly
// followed by '>'.
private void ReadComment ()
{
	Expect ('-');
	Expect ('-');
	while (true) {
		int c = Read ();
		if (c < 0)
			// Read() keeps returning a negative value at the end
			// of the input, so looping on would never terminate.
			throw UnexpectedEndError ();
		if (c != '-')
			continue;
		c = Read ();
		if (c < 0)
			throw UnexpectedEndError ();
		if (c != '-')
			continue;
		c = Read ();
		if (c < 0)
			throw UnexpectedEndError ();
		if (c != '>')
			throw Error ("'--' is not allowed inside comment markup.");
		break;
	}
}
615         }
616
//
// Error raised by SmallXmlParser. It remembers the line and column
// passed to the constructor and appends them to the message.
//
#if INSIDE_CORLIB
internal
#else
[CLSCompliant(false)]    
public
#endif
class SmallXmlParserException : SystemException
{
	int line;
	int column;

	// 'msg' describes the problem; 'line' and 'column' tell where
	// it was found.
	public SmallXmlParserException (string msg, int line, int column)
		: base (FormatMessage (msg, line, column))
	{
		this.column = column;
		this.line = line;
	}

	// Builds the text "<msg>. At (<line>,<column>)".
	private static string FormatMessage (string msg, int line, int column)
	{
		object [] parts = new object [] { msg, line, column };
		return String.Format ("{0}. At ({1},{2})", parts);
	}

	// Line given to the constructor.
	public int Line {
		get { return line; }
	}

	// Column given to the constructor.
	public int Column {
		get { return column; }
	}
}
643 }
644