Merge pull request #5668 from kumpera/wasm-work-p4
[mono.git] / docs / HtmlAgilityPack / HtmlDocument.cs
1 // HtmlAgilityPack V1.0 - Simon Mourier <simon underscore mourier at hotmail dot com>\r
2 using System;\r
3 using System.Collections;\r
4 using System.Collections.Generic;\r
5 using System.IO;\r
6 using System.Text;\r
7 using System.Text.RegularExpressions;\r
8 using System.Xml;\r
9 using System.Xml.XPath;\r
10 \r
11 namespace HtmlAgilityPack\r
12 {\r
13     /// <summary>\r
14     /// Represents a complete HTML document.\r
15     /// </summary>\r
16     public class HtmlDocument : IXPathNavigable\r
17     {\r
18         #region Fields\r
19 \r
20         private int _c;\r
21         private Crc32 _crc32;\r
22         private HtmlAttribute _currentattribute;\r
23         private HtmlNode _currentnode;\r
24         private Encoding _declaredencoding;\r
25         private HtmlNode _documentnode;\r
26         private bool _fullcomment;\r
27         private int _index;\r
28         internal Hashtable _lastnodes = new Hashtable();\r
29         private HtmlNode _lastparentnode;\r
30         private int _line;\r
31         private int _lineposition, _maxlineposition;\r
32         internal Hashtable _nodesid;\r
33         private ParseState _oldstate;\r
34         private bool _onlyDetectEncoding;\r
35         internal Hashtable _openednodes;\r
36         private List<HtmlParseError> _parseerrors = new List<HtmlParseError>();\r
37         private string _remainder;\r
38         private int _remainderOffset;\r
39         private ParseState _state;\r
40         private Encoding _streamencoding;\r
41         internal string _text;\r
42 \r
43         // public props\r
44 \r
45         /// <summary>\r
46         /// Adds Debugging attributes to node. Default is false.\r
47         /// </summary>\r
48         public bool OptionAddDebuggingAttributes;\r
49 \r
50         /// <summary>\r
51         /// Defines if closing for non closed nodes must be done at the end or directly in the document.\r
52         /// Setting this to true can actually change how browsers render the page. Default is false.\r
53         /// </summary>\r
54         public bool OptionAutoCloseOnEnd; // close errors at the end\r
55 \r
56         /// <summary>\r
57         /// Defines if non closed nodes will be checked at the end of parsing. Default is true.\r
58         /// </summary>\r
59         public bool OptionCheckSyntax = true;\r
60 \r
61         /// <summary>\r
62         /// Defines if a checksum must be computed for the document while parsing. Default is false.\r
63         /// </summary>\r
64         public bool OptionComputeChecksum;\r
65 \r
66         /// <summary>\r
67         /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.\r
68         /// </summary>\r
69         public Encoding OptionDefaultStreamEncoding = Encoding.Default;\r
70 \r
71         /// <summary>\r
72         /// Defines if source text must be extracted while parsing errors.\r
73         /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.\r
74         /// Default is false.\r
75         /// </summary>\r
76         public bool OptionExtractErrorSourceText;\r
77 \r
78         // turning this on can dramatically slow performance if a lot of errors are detected\r
79 \r
80         /// <summary>\r
81         /// Defines the maximum length of source text or parse errors. Default is 100.\r
82         /// </summary>\r
83         public int OptionExtractErrorSourceTextMaxLength = 100;\r
84 \r
85         /// <summary>\r
86         /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.\r
87         /// </summary>\r
88         public bool OptionFixNestedTags; // fix li, tr, th, td tags\r
89 \r
90         /// <summary>\r
91         /// Defines if output must conform to XML, instead of HTML.\r
92         /// </summary>\r
93         public bool OptionOutputAsXml;\r
94 \r
95         /// <summary>\r
96         /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.\r
97         /// </summary>\r
98         public bool OptionOutputOptimizeAttributeValues;\r
99 \r
100         /// <summary>\r
101         /// Defines if name must be output with its original case. Useful for asp.net tags and attributes\r
102         /// </summary>\r
103         public bool OptionOutputOriginalCase;\r
104 \r
105         /// <summary>\r
106         /// Defines if name must be output in uppercase. Default is false.\r
107         /// </summary>\r
108         public bool OptionOutputUpperCase;\r
109 \r
110         /// <summary>\r
111         /// Defines if declared encoding must be read from the document.\r
112         /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.\r
113         /// Default is true.\r
114         /// </summary>\r
115         public bool OptionReadEncoding = true;\r
116 \r
117         /// <summary>\r
118         /// Defines the name of a node that will throw the StopperNodeException when found as an end node. Default is null.\r
119         /// </summary>\r
120         public string OptionStopperNodeName;\r
121 \r
122         /// <summary>\r
123         /// Defines if the 'id' attribute must be specifically used. Default is true.\r
124         /// </summary>\r
125         public bool OptionUseIdAttribute = true;\r
126 \r
127         /// <summary>\r
128         /// Defines if empty nodes must be written as closed during output. Default is false.\r
129         /// </summary>\r
130         public bool OptionWriteEmptyNodes;\r
131 \r
132         #endregion\r
133 \r
134         #region Static Members\r
135 \r
136         internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";\r
137 \r
138         internal static readonly string HtmlExceptionUseIdAttributeFalse =\r
139             "You need to set UseIdAttribute property to true to enable this feature";\r
140 \r
141         #endregion\r
142 \r
143         #region Constructors\r
144 \r
/// <summary>
/// Initializes a new HTML document whose root is a freshly created document node.
/// </summary>
public HtmlDocument()
{
    HtmlNode root = CreateNode(HtmlNodeType.Document, 0);
    _documentnode = root;
}
152 \r
153         #endregion\r
154 \r
155         #region Properties\r
156 \r
/// <summary>
/// Gets the CRC32 checksum of the document.
/// The value is 0 unless OptionComputeChecksum was set to true before parsing.
/// </summary>
public int CheckSum
{
    get
    {
        // A missing CRC accumulator is reported as 0.
        return _crc32 == null ? 0 : (int) _crc32.CheckSum;
    }
}
174 \r
/// <summary>
/// Gets the encoding declared by the document itself.
/// The declared encoding comes from the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public Encoding DeclaredEncoding
{
    get
    {
        return _declaredencoding;
    }
}
183 \r
/// <summary>
/// Gets the document's root node.
/// </summary>
public HtmlNode DocumentNode
{
    get
    {
        return _documentnode;
    }
}
191 \r
/// <summary>
/// Gets the encoding used when the document is output, as computed by GetOutEncoding.
/// </summary>
public Encoding Encoding
{
    get
    {
        Encoding output = GetOutEncoding();
        return output;
    }
}
199 \r
/// <summary>
/// Gets the parse errors recorded for the document.
/// </summary>
public IEnumerable<HtmlParseError> ParseErrors
{
    get
    {
        return _parseerrors;
    }
}
207 \r
/// <summary>
/// Gets the text that remains after parsing stopped.
/// Always null when OptionStopperNodeName is null.
/// </summary>
public string Remainder
{
    get
    {
        return _remainder;
    }
}
216 \r
/// <summary>
/// Gets the position of Remainder within the original Html text.
/// When OptionStopperNodeName is null, this is the length of the original Html text.
/// </summary>
public int RemainderOffset
{
    get
    {
        return _remainderOffset;
    }
}
225 \r
/// <summary>
/// Gets the encoding of the stream the document was read from.
/// </summary>
public Encoding StreamEncoding
{
    get
    {
        return _streamencoding;
    }
}
233 \r
234         #endregion\r
235 \r
236         #region IXPathNavigable Members\r
237 \r
/// <summary>
/// Builds an XPathNavigator over this HTML document.
/// </summary>
/// <returns>An XPathNavigator object, positioned on the root of the document.</returns>
public XPathNavigator CreateNavigator()
{
    HtmlNodeNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
246 \r
247         #endregion\r
248 \r
249         #region Public Methods\r
250 \r
/// <summary>
/// Gets a valid XML name.
/// Characters in [a-z], [0-9], '_', '-' and '.' are kept as is. Any other character is
/// replaced by the lowercase hexadecimal form of its UTF-8 bytes followed by '_', and the
/// whole result is then prefixed with '_'.
/// </summary>
/// <param name="name">Any text. May not be null.</param>
/// <returns>A string that is a valid XML name.</returns>
/// <exception cref="ArgumentNullException">name is null.</exception>
public static string GetXmlName(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // A builder avoids allocating a new string for every appended character.
    StringBuilder xmlname = new StringBuilder(name.Length);
    bool nameisok = true;
    for (int i = 0; i < name.Length; i++)
    {
        char c = name[i];

        // names are lcase
        // note: we are very limited here, too much?
        if (((c >= 'a') && (c <= 'z')) ||
            ((c >= '0') && (c <= '9')) ||
            (c == '_') || (c == '-') || (c == '.'))
        {
            xmlname.Append(c);
        }
        else
        {
            nameisok = false;

            // Each UTF-16 code unit is encoded on its own, exactly as before, so
            // existing generated names stay stable.
            byte[] bytes = Encoding.UTF8.GetBytes(new char[] {c});
            for (int j = 0; j < bytes.Length; j++)
            {
                xmlname.Append(bytes[j].ToString("x2"));
            }
            xmlname.Append('_');
        }
    }

    if (nameisok)
    {
        return xmlname.ToString();
    }
    return "_" + xmlname;
}
288 \r
// Matches an ampersand that does not already start one of the entities
// &amp; &lt; &gt; &quot; (case-insensitive). Built once and reused by HtmlEncode.
private static readonly Regex HtmlEncodeBareAmpersand =
    new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

/// <summary>
/// Applies HTML encoding to a specified string.
/// Encodes '&amp;', '&lt;', '&gt;' and '"'. An ampersand that already begins
/// one of the four produced entities is left alone so text is not encoded twice.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException">html is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }
    // replace & by &amp; but only once!
    string encoded = HtmlEncodeBareAmpersand.Replace(html, "&amp;");
    return encoded.Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
}
304 \r
/// <summary>
/// Tells whether a character code is treated as whitespace.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true if the specified character is considered as a whitespace character.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:  // tab
        case 10: // line feed
        case 13: // carriage return
        case 32: // space
            return true;
        default:
            return false;
    }
}
318 \r
/// <summary>
/// Creates a new HTML attribute and assigns it the given name.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <returns>The new HTML attribute.</returns>
public HtmlAttribute CreateAttribute(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute();
    attribute.Name = name;
    return attribute;
}
334 \r
/// <summary>
/// Creates a new HTML attribute and assigns it the given name and value.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <param name="value">The value of the attribute.</param>
/// <returns>The new HTML attribute.</returns>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // Build the named attribute first, then attach the value.
    HtmlAttribute attribute = CreateAttribute(name);
    attribute.Value = value;
    return attribute;
}
351 \r
/// <summary>
/// Creates a new HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlCommentNode commentNode = (HtmlCommentNode) CreateNode(HtmlNodeType.Comment);
    return commentNode;
}
360 \r
/// <summary>
/// Creates a new HTML comment node holding the given comment text.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment == null)
    {
        throw new ArgumentNullException("comment");
    }

    HtmlCommentNode commentNode = CreateComment();
    commentNode.Comment = comment;
    return commentNode;
}
376 \r
/// <summary>
/// Creates a new HTML element node and assigns it the given name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
public HtmlNode CreateElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element.Name = name;
    return element;
}
392 \r
/// <summary>
/// Creates a new HTML text node.
/// </summary>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlTextNode textNode = (HtmlTextNode) CreateNode(HtmlNodeType.Text);
    return textNode;
}
401 \r
/// <summary>
/// Creates a new HTML text node holding the given text.
/// </summary>
/// <param name="text">The text of the node. May not be null.</param>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    HtmlTextNode textNode = CreateTextNode();
    textNode.Text = text;
    return textNode;
}
417 \r
/// <summary>
/// Detects the encoding of an HTML stream by wrapping it in a StreamReader
/// and delegating to the TextReader overload.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding.</returns>
public Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    // The reader is not disposed here, so the caller's stream is left open.
    StreamReader reader = new StreamReader(stream);
    return DetectEncoding(reader);
}
431 \r
/// <summary>
/// Detects the encoding of an HTML file.
/// The file is opened with OptionDefaultStreamEncoding and is closed again before
/// this method returns, even when detection throws.
/// </summary>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException">path is null.</exception>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // using guarantees the file handle is released if DetectEncoding throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
448 \r
/// <summary>
/// Detects the encoding of an HTML text supplied through a TextReader.
/// The reader is consumed to its end and the document state (text, root node,
/// opened-node and id tables) is reset as part of the detection parse.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>The detected encoding, or null when parsing completes without one being found.</returns>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = true;

    // The tracking tables only exist when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    // Only a StreamReader can tell us which encoding it is reading with.
    StreamReader streamReader = reader as StreamReader;
    _streamencoding = streamReader != null ? streamReader.CurrentEncoding : null;
    _declaredencoding = null;

    _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // this is almost a hack, but it allows us not to muck with the original parsing code:
    // an EncodingFoundException raised while parsing carries the detected encoding.
    Encoding detected = null;
    try
    {
        Parse();
    }
    catch (EncodingFoundException ex)
    {
        detected = ex.Encoding;
    }
    return detected;
}
504 \r
/// <summary>
/// Detects the encoding of an HTML file, then loads that file.
/// Equivalent to calling the two-argument overload with detection enabled.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    const bool detectEncoding = true;
    DetectEncodingAndLoad(path, detectEncoding);
}
513 \r
/// <summary>
/// Optionally detects the encoding of an HTML file, then loads that file.
/// When an encoding is detected the file is loaded with it; otherwise the
/// file is loaded through the single-argument Load overload.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
public void DetectEncodingAndLoad(string path, bool detectEncoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    Encoding enc = detectEncoding ? DetectEncoding(path) : null;

    if (enc != null)
    {
        Load(path, enc);
        return;
    }

    // Nothing detected (or detection skipped): fall back to the default load.
    Load(path);
}
544 \r
/// <summary>
/// Detects the encoding of an HTML text.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException">html is null.</exception>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // using disposes the reader even if detection throws.
    using (StringReader sr = new StringReader(html))
    {
        return DetectEncoding(sr);
    }
}
561 \r
/// <summary>
/// Looks up the HTML node whose 'id' attribute has the given value.
/// The id is lower-cased before the lookup.
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }

    // The id table is only created when OptionUseIdAttribute is enabled at load time.
    if (_nodesid == null)
    {
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    string key = id.ToLower();
    HtmlNode match = _nodesid[key] as HtmlNode;
    return match;
}
580 \r
/// <summary>
/// Loads an HTML document from a stream, reading it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    StreamReader reader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(reader);
}
589 \r
/// <summary>
/// Loads an HTML document from a stream, optionally honoring byte order marks.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    StreamReader reader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(reader);
}
599 \r
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    StreamReader reader = new StreamReader(stream, encoding);
    Load(reader);
}
609 \r
/// <summary>
/// Loads an HTML document from a stream using the given character encoding,
/// optionally honoring byte order marks.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(reader);
}
620 \r
/// <summary>
/// Loads an HTML document from a stream using the given character encoding and
/// buffer size, optionally honoring byte order marks.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(reader);
}
632 \r
/// <summary>
/// Loads an HTML document from a file, reading it with OptionDefaultStreamEncoding.
/// The file is closed before this method returns, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <exception cref="ArgumentNullException">path is null.</exception>
public void Load(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // using guarantees the file handle is released if Load throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        Load(sr);
    }
}
647 \r
/// <summary>
/// Loads an HTML document from a file, optionally honoring byte order marks.
/// The file is closed before this method returns, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException">path is null.</exception>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // using guarantees the file handle is released if Load throws.
    using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
663 \r
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// The file is closed before this method returns, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException">path or encoding is null.</exception>
public void Load(string path, Encoding encoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // using guarantees the file handle is released if Load throws.
    using (StreamReader sr = new StreamReader(path, encoding))
    {
        Load(sr);
    }
}
683 \r
/// <summary>
/// Loads an HTML document from a file using the given character encoding,
/// optionally honoring byte order marks.
/// The file is closed before this method returns, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException">path or encoding is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // using guarantees the file handle is released if Load throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
704 \r
/// <summary>
/// Loads an HTML document from a file using the given character encoding and
/// buffer size, optionally honoring byte order marks.
/// The file is closed before this method returns, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
/// <exception cref="ArgumentNullException">path or encoding is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // using guarantees the file handle is released if Load throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(sr);
    }
}
726 \r
/// <summary>
/// Loads the HTML document from the specified TextReader.
/// The reader is consumed to its end, the document state is reset and the text is parsed.
/// When OptionCheckSyntax is set, every node still recorded as opened after parsing
/// is reported as a TagNotClosed parse error.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
public void Load(TextReader reader)
{
    // all Load methods pass down to this one
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = false;

    // The tracking tables only exist when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    // Only a StreamReader can tell us which encoding it is reading with.
    Encoding readerEncoding = null;
    StreamReader streamReader = reader as StreamReader;
    if (streamReader != null)
    {
        try
        {
            // trigger bom read if needed
            streamReader.Peek();
        }
        catch (Exception)
        {
            // void on purpose
        }
        readerEncoding = streamReader.CurrentEncoding;
    }
    _streamencoding = readerEncoding;
    _declaredencoding = null;

    _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    Parse();

    if (!OptionCheckSyntax)
    {
        return;
    }

    foreach (HtmlNode unclosed in _openednodes.Values)
    {
        // Nodes without the start-tag flag were already reported.
        if (unclosed._starttag)
        {
            string snippet = string.Empty;
            if (OptionExtractErrorSourceText)
            {
                snippet = unclosed.OuterHtml;
                if (snippet.Length > OptionExtractErrorSourceTextMaxLength)
                {
                    snippet = snippet.Substring(0, OptionExtractErrorSourceTextMaxLength);
                }
            }

            AddError(
                HtmlParseErrorCode.TagNotClosed,
                unclosed._line, unclosed._lineposition,
                unclosed._streamposition, snippet,
                "End tag </" + unclosed.Name + "> was not found");
        }
    }

    // we don't need this anymore
    _openednodes.Clear();
}
818 \r
819         /// <summary>\r
820         /// Loads the HTML document from the specified string.\r
821         /// </summary>\r
822         /// <param name="html">String containing the HTML document to load. May not be null.</param>\r
823         public void LoadHtml(string html)\r
824         {\r
825             if (html == null)\r
826             {\r
827                 throw new ArgumentNullException("html");\r
828             }\r
829             StringReader sr = new StringReader(html);\r
830             Load(sr);\r
831             sr.Close();\r
832         }\r
833 \r
834         /// <summary>\r
835         /// Saves the HTML document to the specified stream.\r
836         /// </summary>\r
837         /// <param name="outStream">The stream to which you want to save.</param>\r
838         public void Save(Stream outStream)\r
839         {\r
840             StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());\r
841             Save(sw);\r
842         }\r
843 \r
844         /// <summary>\r
845         /// Saves the HTML document to the specified stream.\r
846         /// </summary>\r
847         /// <param name="outStream">The stream to which you want to save. May not be null.</param>\r
848         /// <param name="encoding">The character encoding to use. May not be null.</param>\r
849         public void Save(Stream outStream, Encoding encoding)\r
850         {\r
851             if (outStream == null)\r
852             {\r
853                 throw new ArgumentNullException("outStream");\r
854             }\r
855             if (encoding == null)\r
856             {\r
857                 throw new ArgumentNullException("encoding");\r
858             }\r
859             StreamWriter sw = new StreamWriter(outStream, encoding);\r
860             Save(sw);\r
861         }\r
862 \r
863         /// <summary>\r
864         /// Saves the mixed document to the specified file.\r
865         /// </summary>\r
866         /// <param name="filename">The location of the file where you want to save the document.</param>\r
867         public void Save(string filename)\r
868         {\r
869             StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding());\r
870             Save(sw);\r
871             sw.Close();\r
872         }\r
873 \r
874         /// <summary>\r
875         /// Saves the mixed document to the specified file.\r
876         /// </summary>\r
877         /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>\r
878         /// <param name="encoding">The character encoding to use. May not be null.</param>\r
879         public void Save(string filename, Encoding encoding)\r
880         {\r
881             if (filename == null)\r
882             {\r
883                 throw new ArgumentNullException("filename");\r
884             }\r
885             if (encoding == null)\r
886             {\r
887                 throw new ArgumentNullException("encoding");\r
888             }\r
889             StreamWriter sw = new StreamWriter(filename, false, encoding);\r
890             Save(sw);\r
891             sw.Close();\r
892         }\r
893 \r
894         /// <summary>\r
895         /// Saves the HTML document to the specified StreamWriter.\r
896         /// </summary>\r
897         /// <param name="writer">The StreamWriter to which you want to save.</param>\r
898         public void Save(StreamWriter writer)\r
899         {\r
900             Save((TextWriter) writer);\r
901         }\r
902 \r
903         /// <summary>\r
904         /// Saves the HTML document to the specified TextWriter.\r
905         /// </summary>\r
906         /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>\r
907         public void Save(TextWriter writer)\r
908         {\r
909             if (writer == null)\r
910             {\r
911                 throw new ArgumentNullException("writer");\r
912             }\r
913             DocumentNode.WriteTo(writer);\r
914         }\r
915 \r
916         /// <summary>\r
917         /// Saves the HTML document to the specified XmlWriter.\r
918         /// </summary>\r
919         /// <param name="writer">The XmlWriter to which you want to save.</param>\r
920         public void Save(XmlWriter writer)\r
921         {\r
922             DocumentNode.WriteTo(writer);\r
923             writer.Flush();\r
924         }\r
925 \r
926         #endregion\r
927 \r
928         #region Internal Methods\r
929 \r
930         internal HtmlAttribute CreateAttribute()\r
931         {\r
932             return new HtmlAttribute(this);\r
933         }\r
934 \r
935         internal HtmlNode CreateNode(HtmlNodeType type)\r
936         {\r
937             return CreateNode(type, -1);\r
938         }\r
939 \r
940         internal HtmlNode CreateNode(HtmlNodeType type, int index)\r
941         {\r
942             switch (type)\r
943             {\r
944                 case HtmlNodeType.Comment:\r
945                     return new HtmlCommentNode(this, index);\r
946 \r
947                 case HtmlNodeType.Text:\r
948                     return new HtmlTextNode(this, index);\r
949 \r
950                 default:\r
951                     return new HtmlNode(type, this, index);\r
952             }\r
953         }\r
954 \r
955         internal Encoding GetOutEncoding()\r
956         {\r
957             // when unspecified, use the stream encoding first\r
958             if (_declaredencoding != null)\r
959             {\r
960                 return _declaredencoding;\r
961             }\r
962             else\r
963             {\r
964                 if (_streamencoding != null)\r
965                 {\r
966                     return _streamencoding;\r
967                 }\r
968             }\r
969             return OptionDefaultStreamEncoding;\r
970         }\r
971 \r
972         internal HtmlNode GetXmlDeclaration()\r
973         {\r
974             if (!_documentnode.HasChildNodes)\r
975             {\r
976                 return null;\r
977             }\r
978 \r
979             foreach (HtmlNode node in _documentnode._childnodes)\r
980             {\r
981                 if (node.Name == "?xml") // it's ok, names are case sensitive\r
982                 {\r
983                     return node;\r
984                 }\r
985             }\r
986             return null;\r
987         }\r
988 \r
989         internal void SetIdForNode(HtmlNode node, string id)\r
990         {\r
991             if (!OptionUseIdAttribute)\r
992             {\r
993                 return;\r
994             }\r
995 \r
996             if ((_nodesid == null) || (id == null))\r
997             {\r
998                 return;\r
999             }\r
1000 \r
1001             if (node == null)\r
1002             {\r
1003                 _nodesid.Remove(id.ToLower());\r
1004             }\r
1005             else\r
1006             {\r
1007                 _nodesid[id.ToLower()] = node;\r
1008             }\r
1009         }\r
1010 \r
1011         internal void UpdateLastParentNode()\r
1012         {\r
1013             do\r
1014             {\r
1015                 if (_lastparentnode.Closed)\r
1016                 {\r
1017                     _lastparentnode = _lastparentnode.ParentNode;\r
1018                 }\r
1019             } while ((_lastparentnode != null) && (_lastparentnode.Closed));\r
1020             if (_lastparentnode == null)\r
1021             {\r
1022                 _lastparentnode = _documentnode;\r
1023             }\r
1024         }\r
1025 \r
1026         #endregion\r
1027 \r
1028         #region Private Methods\r
1029 \r
1030         private HtmlParseError AddError(\r
1031             HtmlParseErrorCode code,\r
1032             int line,\r
1033             int linePosition,\r
1034             int streamPosition,\r
1035             string sourceText,\r
1036             string reason)\r
1037         {\r
1038             HtmlParseError err = new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);\r
1039             _parseerrors.Add(err);\r
1040             return err;\r
1041         }\r
1042 \r
        /// <summary>
        /// Processes _currentnode as a closing tag: pairs it with the most recent
        /// node of the same name recorded in _lastnodes, or, when no such node
        /// exists, applies one of several fallbacks (closed elements, overlapping
        /// elements, error reporting). On success it may also move _lastparentnode
        /// up the tree.
        /// NOTE(review): the callers are outside this view; presumably this runs
        /// when the parser has finished reading a tag — confirm.
        /// </summary>
        private void CloseCurrentNode()
        {
            if (_currentnode.Closed) // text or document are by def closed
                return;

            // Set when the end tag is rejected; suppresses the parent update at the bottom.
            bool error = false;

            // find last node of this kind
            HtmlNode prev = (HtmlNode) _lastnodes[_currentnode.Name];
            if (prev == null)
            {
                // No node with this name is recorded in _lastnodes.
                if (HtmlNode.IsClosedElement(_currentnode.Name))
                {
                    // </br> will be seen as <br>
                    _currentnode.CloseNode(_currentnode);

                    // add to parent node
                    if (_lastparentnode != null)
                    {
                        // Walk the parent's children from last to first, looking for a
                        // childless node with the same name. Every node passed on the
                        // way is remembered on a stack.
                        HtmlNode foundNode = null;
                        Stack futureChild = new Stack();
                        for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
                        {
                            if ((node.Name == _currentnode.Name) && (!node.HasChildNodes))
                            {
                                foundNode = node;
                                break;
                            }
                            futureChild.Push(node);
                        }
                        if (foundNode != null)
                        {
                            // Re-parent the remembered nodes under the found node. Popping
                            // the stack yields them in their original document order.
                            HtmlNode node = null;
                            while (futureChild.Count != 0)
                            {
                                node = (HtmlNode) futureChild.Pop();
                                _lastparentnode.RemoveChild(node);
                                foundNode.AppendChild(node);
                            }
                        }
                        else
                        {
                            // No matching sibling: the current node itself becomes a child.
                            _lastparentnode.AppendChild(_currentnode);
                        }
                    }
                }
                else
                {
                    // node has no parent
                    // node is not a closed node

                    if (HtmlNode.CanOverlapElement(_currentnode.Name))
                    {
                        // this is a hack: add it as a text node
                        // The text node covers the same outer span as the current node,
                        // and its text is lower-cased.
                        HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
                        closenode._outerlength = _currentnode._outerlength;
                        ((HtmlTextNode) closenode).Text = ((HtmlTextNode) closenode).Text.ToLower();
                        if (_lastparentnode != null)
                        {
                            _lastparentnode.AppendChild(closenode);
                        }
                    }
                    else
                    {
                        if (HtmlNode.IsEmptyElement(_currentnode.Name))
                        {
                            // Recorded as a parse error, but 'error' stays false, so the
                            // parent update at the bottom still runs.
                            AddError(
                                HtmlParseErrorCode.EndTagNotRequired,
                                _currentnode._line, _currentnode._lineposition,
                                _currentnode._streamposition, _currentnode.OuterHtml,
                                "End tag </" + _currentnode.Name + "> is not required");
                        }
                        else
                        {
                            // node cannot overlap, node is not empty
                            AddError(
                                HtmlParseErrorCode.TagNotOpened,
                                _currentnode._line, _currentnode._lineposition,
                                _currentnode._streamposition, _currentnode.OuterHtml,
                                "Start tag <" + _currentnode.Name + "> was not found");
                            error = true;
                        }
                    }
                }
            }
            else
            {
                // A node with this name is recorded in _lastnodes.
                if (OptionFixNestedTags)
                {
                    // Reject the end tag when FindResetterNodes reports a resetter node
                    // for this name relative to 'prev'.
                    if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
                    {
                        AddError(
                            HtmlParseErrorCode.EndTagInvalidHere,
                            _currentnode._line, _currentnode._lineposition,
                            _currentnode._streamposition, _currentnode.OuterHtml,
                            "End tag </" + _currentnode.Name + "> invalid here");
                        error = true;
                    }
                }

                if (!error)
                {
                    // Replace the _lastnodes entry with prev's previous same-name node,
                    // then close prev with the current node.
                    _lastnodes[_currentnode.Name] = prev._prevwithsamename;
                    prev.CloseNode(_currentnode);
                }
            }


            // we close this node, get grandparent
            if (!error)
            {
                // Skipped for closed elements unless the current node is a start tag.
                if ((_lastparentnode != null) &&
                    ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
                     (_currentnode._starttag)))
                {
                    UpdateLastParentNode();
                }
            }
        }
1162 \r
1163         private string CurrentAttributeName()\r
1164         {\r
1165             return _text.Substring(_currentattribute._namestartindex, _currentattribute._namelength);\r
1166         }\r
1167 \r
1168         private string CurrentAttributeValue()\r
1169         {\r
1170             return _text.Substring(_currentattribute._valuestartindex, _currentattribute._valuelength);\r
1171         }\r
1172 \r
1173         private string CurrentNodeInner()\r
1174         {\r
1175             return _text.Substring(_currentnode._innerstartindex, _currentnode._innerlength);\r
1176         }\r
1177 \r
1178         private string CurrentNodeName()\r
1179         {\r
1180             return _text.Substring(_currentnode._namestartindex, _currentnode._namelength);\r
1181         }\r
1182 \r
1183         private string CurrentNodeOuter()\r
1184         {\r
1185             return _text.Substring(_currentnode._outerstartindex, _currentnode._outerlength);\r
1186         }\r
1187 \r
1188 \r
1189         private void DecrementPosition()\r
1190         {\r
1191             _index--;\r
1192             if (_lineposition == 1)\r
1193             {\r
1194                 _lineposition = _maxlineposition;\r
1195                 _line--;\r
1196             }\r
1197             else\r
1198             {\r
1199                 _lineposition--;\r
1200             }\r
1201         }\r
1202 \r
1203         private HtmlNode FindResetterNode(HtmlNode node, string name)\r
1204         {\r
1205             HtmlNode resetter = (HtmlNode) _lastnodes[name];\r
1206             if (resetter == null)\r
1207                 return null;\r
1208             if (resetter.Closed)\r
1209             {\r
1210                 return null;\r
1211             }\r
1212             if (resetter._streamposition < node._streamposition)\r
1213             {\r
1214                 return null;\r
1215             }\r
1216             return resetter;\r
1217         }\r
1218 \r
1219         private bool FindResetterNodes(HtmlNode node, string[] names)\r
1220         {\r
1221             if (names == null)\r
1222             {\r
1223                 return false;\r
1224             }\r
1225             for (int i = 0; i < names.Length; i++)\r
1226             {\r
1227                 if (FindResetterNode(node, names[i]) != null)\r
1228                 {\r
1229                     return true;\r
1230                 }\r
1231             }\r
1232             return false;\r
1233         }\r
1234 \r
1235         private void FixNestedTag(string name, string[] resetters)\r
1236         {\r
1237             if (resetters == null)\r
1238                 return;\r
1239 \r
1240             HtmlNode prev;\r
1241 \r
1242             // if we find a previous unclosed same name node, without a resetter node between, we must close it\r
1243             prev = (HtmlNode) _lastnodes[name];\r
1244             if ((prev != null) && (!prev.Closed))\r
1245             {\r
1246                 // try to find a resetter node, if found, we do nothing\r
1247                 if (FindResetterNodes(prev, resetters))\r
1248                 {\r
1249                     return;\r
1250                 }\r
1251 \r
1252                 // ok we need to close the prev now\r
1253                 // create a fake closer node\r
1254                 HtmlNode close = new HtmlNode(prev.NodeType, this, -1);\r
1255                 close._endnode = close;\r
1256                 prev.CloseNode(close);\r
1257             }\r
1258         }\r
1259 \r
1260         private void FixNestedTags()\r
1261         {\r
1262             // we are only interested by start tags, not closing tags\r
1263             if (!_currentnode._starttag)\r
1264                 return;\r
1265 \r
1266             string name = CurrentNodeName();\r
1267             FixNestedTag(name, GetResetters(name));\r
1268         }\r
1269 \r
1270         private string[] GetResetters(string name)\r
1271         {\r
1272             switch (name)\r
1273             {\r
1274                 case "li":\r
1275                     return new string[] {"ul"};\r
1276 \r
1277                 case "tr":\r
1278                     return new string[] {"table"};\r
1279 \r
1280                 case "th":\r
1281                 case "td":\r
1282                     return new string[] {"tr", "table"};\r
1283 \r
1284                 default:\r
1285                     return null;\r
1286             }\r
1287         }\r
1288 \r
1289         private void IncrementPosition()\r
1290         {\r
1291             if (_crc32 != null)\r
1292             {\r
1293                 // REVIEW: should we add some checksum code in DecrementPosition too?\r
1294                 _crc32.AddToCRC32(_c);\r
1295             }\r
1296 \r
1297             _index++;\r
1298             _maxlineposition = _lineposition;\r
1299             if (_c == 10)\r
1300             {\r
1301                 _lineposition = 1;\r
1302                 _line++;\r
1303             }\r
1304             else\r
1305             {\r
1306                 _lineposition++;\r
1307             }\r
1308         }\r
1309 \r
        /// <summary>
        /// Checks whether the character just read (_c) is '&lt;' and, if so, begins
        /// whatever construct follows: a "&lt;%" server-side block, a "&lt;!"
        /// comment, or an element.
        /// Parse assigns _c from _text[_index] and then calls IncrementPosition, so
        /// at this point _index - 1 is the position of _c and _text[_index] is the
        /// character after it.
        /// </summary>
        /// <returns>
        /// false when _c is not '&lt;'; true in every other case, including when
        /// parsing has been stopped by moving _index to the end of the text.
        /// </returns>
        private bool NewCheck()
        {
            if (_c != '<')
            {
                return false;
            }
            if (_index < _text.Length)
            {
                // "<%": switch to the server-side code state.
                if (_text[_index] == '%')
                {
                    // Depending on the current state, first mark the '<' position as
                    // the start of an attribute value, an attribute name or a node name.
                    switch (_state)
                    {
                        case ParseState.AttributeAfterEquals:
                            PushAttributeValueStart(_index - 1);
                            break;

                        case ParseState.BetweenAttributes:
                            PushAttributeNameStart(_index - 1);
                            break;

                        case ParseState.WhichTag:
                            PushNodeNameStart(true, _index - 1);
                            _state = ParseState.Tag;
                            break;
                    }
                    // Save the state in _oldstate before entering server-side code.
                    // NOTE(review): presumably restored when the block ends — the code
                    // that does so is not visible here.
                    _oldstate = _state;
                    _state = ParseState.ServerSideCode;
                    return true;
                }
            }

            // End the node in progress at the '<' position.
            if (!PushNodeEnd(_index - 1, true))
            {
                // stop parsing
                _index = _text.Length;
                return true;
            }
            _state = ParseState.WhichTag;
            // True when at least one character follows the '<'.
            if ((_index - 1) <= (_text.Length - 2))
            {
                // "<!": start a comment node whose name begins at the '!'.
                if (_text[_index] == '!')
                {
                    PushNodeStart(HtmlNodeType.Comment, _index - 1);
                    PushNodeNameStart(true, _index);
                    PushNodeNameEnd(_index + 1);
                    _state = ParseState.Comment;
                    // "<!--" sets _fullcomment; any other "<!" clears it. When fewer
                    // than two characters follow the '!', _fullcomment is left unchanged.
                    if (_index < (_text.Length - 2))
                    {
                        if ((_text[_index + 1] == '-') &&
                            (_text[_index + 2] == '-'))
                        {
                            _fullcomment = true;
                        }
                        else
                        {
                            _fullcomment = false;
                        }
                    }
                    return true;
                }
            }
            // Anything else after '<' starts an element node at the '<' position.
            PushNodeStart(HtmlNodeType.Element, _index - 1);
            return true;
        }
1374 \r
1375         private void Parse()\r
1376         {\r
1377             int lastquote = 0;\r
1378             if (OptionComputeChecksum)\r
1379             {\r
1380                 _crc32 = new Crc32();\r
1381             }\r
1382 \r
1383             _lastnodes = new Hashtable();\r
1384             _c = 0;\r
1385             _fullcomment = false;\r
1386             _parseerrors = new List<HtmlParseError>();\r
1387             _line = 1;\r
1388             _lineposition = 1;\r
1389             _maxlineposition = 1;\r
1390 \r
1391             _state = ParseState.Text;\r
1392             _oldstate = _state;\r
1393             _documentnode._innerlength = _text.Length;\r
1394             _documentnode._outerlength = _text.Length;\r
1395             _remainderOffset = _text.Length;\r
1396 \r
1397             _lastparentnode = _documentnode;\r
1398             _currentnode = CreateNode(HtmlNodeType.Text, 0);\r
1399             _currentattribute = null;\r
1400 \r
1401             _index = 0;\r
1402             PushNodeStart(HtmlNodeType.Text, 0);\r
1403             while (_index < _text.Length)\r
1404             {\r
1405                 _c = _text[_index];\r
1406                 IncrementPosition();\r
1407 \r
1408                 switch (_state)\r
1409                 {\r
1410                     case ParseState.Text:\r
1411                         if (NewCheck())\r
1412                             continue;\r
1413                         break;\r
1414 \r
1415                     case ParseState.WhichTag:\r
1416                         if (NewCheck())\r
1417                             continue;\r
1418                         if (_c == '/')\r
1419                         {\r
1420                             PushNodeNameStart(false, _index);\r
1421                         }\r
1422                         else\r
1423                         {\r
1424                             PushNodeNameStart(true, _index - 1);\r
1425                             DecrementPosition();\r
1426                         }\r
1427                         _state = ParseState.Tag;\r
1428                         break;\r
1429 \r
1430                     case ParseState.Tag:\r
1431                         if (NewCheck())\r
1432                             continue;\r
1433                         if (IsWhiteSpace(_c))\r
1434                         {\r
1435                             PushNodeNameEnd(_index - 1);\r
1436                             if (_state != ParseState.Tag)\r
1437                                 continue;\r
1438                             _state = ParseState.BetweenAttributes;\r
1439                             continue;\r
1440                         }\r
1441                         if (_c == '/')\r
1442                         {\r
1443                             PushNodeNameEnd(_index - 1);\r
1444                             if (_state != ParseState.Tag)\r
1445                                 continue;\r
1446                             _state = ParseState.EmptyTag;\r
1447                             continue;\r
1448                         }\r
1449                         if (_c == '>')\r
1450                         {\r
1451                             PushNodeNameEnd(_index - 1);\r
1452                             if (_state != ParseState.Tag)\r
1453                                 continue;\r
1454                             if (!PushNodeEnd(_index, false))\r
1455                             {\r
1456                                 // stop parsing\r
1457                                 _index = _text.Length;\r
1458                                 break;\r
1459                             }\r
1460                             if (_state != ParseState.Tag)\r
1461                                 continue;\r
1462                             _state = ParseState.Text;\r
1463                             PushNodeStart(HtmlNodeType.Text, _index);\r
1464                         }\r
1465                         break;\r
1466 \r
1467                     case ParseState.BetweenAttributes:\r
1468                         if (NewCheck())\r
1469                             continue;\r
1470 \r
1471                         if (IsWhiteSpace(_c))\r
1472                             continue;\r
1473 \r
1474                         if ((_c == '/') || (_c == '?'))\r
1475                         {\r
1476                             _state = ParseState.EmptyTag;\r
1477                             continue;\r
1478                         }\r
1479 \r
1480                         if (_c == '>')\r
1481                         {\r
1482                             if (!PushNodeEnd(_index, false))\r
1483                             {\r
1484                                 // stop parsing\r
1485                                 _index = _text.Length;\r
1486                                 break;\r
1487                             }\r
1488 \r
1489                             if (_state != ParseState.BetweenAttributes)\r
1490                                 continue;\r
1491                             _state = ParseState.Text;\r
1492                             PushNodeStart(HtmlNodeType.Text, _index);\r
1493                             continue;\r
1494                         }\r
1495 \r
1496                         PushAttributeNameStart(_index - 1);\r
1497                         _state = ParseState.AttributeName;\r
1498                         break;\r
1499 \r
1500                     case ParseState.EmptyTag:\r
1501                         if (NewCheck())\r
1502                             continue;\r
1503 \r
1504                         if (_c == '>')\r
1505                         {\r
1506                             if (!PushNodeEnd(_index, true))\r
1507                             {\r
1508                                 // stop parsing\r
1509                                 _index = _text.Length;\r
1510                                 break;\r
1511                             }\r
1512 \r
1513                             if (_state != ParseState.EmptyTag)\r
1514                                 continue;\r
1515                             _state = ParseState.Text;\r
1516                             PushNodeStart(HtmlNodeType.Text, _index);\r
1517                             continue;\r
1518                         }\r
1519                         _state = ParseState.BetweenAttributes;\r
1520                         break;\r
1521 \r
1522                     case ParseState.AttributeName:\r
1523                         if (NewCheck())\r
1524                             continue;\r
1525 \r
1526                         if (IsWhiteSpace(_c))\r
1527                         {\r
1528                             PushAttributeNameEnd(_index - 1);\r
1529                             _state = ParseState.AttributeBeforeEquals;\r
1530                             continue;\r
1531                         }\r
1532                         if (_c == '=')\r
1533                         {\r
1534                             PushAttributeNameEnd(_index - 1);\r
1535                             _state = ParseState.AttributeAfterEquals;\r
1536                             continue;\r
1537                         }\r
1538                         if (_c == '>')\r
1539                         {\r
1540                             PushAttributeNameEnd(_index - 1);\r
1541                             if (!PushNodeEnd(_index, false))\r
1542                             {\r
1543                                 // stop parsing\r
1544                                 _index = _text.Length;\r
1545                                 break;\r
1546                             }\r
1547                             if (_state != ParseState.AttributeName)\r
1548                                 continue;\r
1549                             _state = ParseState.Text;\r
1550                             PushNodeStart(HtmlNodeType.Text, _index);\r
1551                             continue;\r
1552                         }\r
1553                         break;\r
1554 \r
1555                     case ParseState.AttributeBeforeEquals:\r
1556                         if (NewCheck())\r
1557                             continue;\r
1558 \r
1559                         if (IsWhiteSpace(_c))\r
1560                             continue;\r
1561                         if (_c == '>')\r
1562                         {\r
1563                             if (!PushNodeEnd(_index, false))\r
1564                             {\r
1565                                 // stop parsing\r
1566                                 _index = _text.Length;\r
1567                                 break;\r
1568                             }\r
1569                             if (_state != ParseState.AttributeBeforeEquals)\r
1570                                 continue;\r
1571                             _state = ParseState.Text;\r
1572                             PushNodeStart(HtmlNodeType.Text, _index);\r
1573                             continue;\r
1574                         }\r
1575                         if (_c == '=')\r
1576                         {\r
1577                             _state = ParseState.AttributeAfterEquals;\r
1578                             continue;\r
1579                         }\r
1580                         // no equals, no whitespace, it's a new attribute starting\r
1581                         _state = ParseState.BetweenAttributes;\r
1582                         DecrementPosition();\r
1583                         break;\r
1584 \r
1585                     case ParseState.AttributeAfterEquals:\r
1586                         if (NewCheck())\r
1587                             continue;\r
1588 \r
1589                         if (IsWhiteSpace(_c))\r
1590                             continue;\r
1591 \r
1592                         if ((_c == '\'') || (_c == '"'))\r
1593                         {\r
1594                             _state = ParseState.QuotedAttributeValue;\r
1595                             PushAttributeValueStart(_index, _c);\r
1596                             lastquote = _c;\r
1597                             continue;\r
1598                         }\r
1599                         if (_c == '>')\r
1600                         {\r
1601                             if (!PushNodeEnd(_index, false))\r
1602                             {\r
1603                                 // stop parsing\r
1604                                 _index = _text.Length;\r
1605                                 break;\r
1606                             }\r
1607                             if (_state != ParseState.AttributeAfterEquals)\r
1608                                 continue;\r
1609                             _state = ParseState.Text;\r
1610                             PushNodeStart(HtmlNodeType.Text, _index);\r
1611                             continue;\r
1612                         }\r
1613                         PushAttributeValueStart(_index - 1);\r
1614                         _state = ParseState.AttributeValue;\r
1615                         break;\r
1616 \r
1617                     case ParseState.AttributeValue:\r
1618                         if (NewCheck())\r
1619                             continue;\r
1620 \r
1621                         if (IsWhiteSpace(_c))\r
1622                         {\r
1623                             PushAttributeValueEnd(_index - 1);\r
1624                             _state = ParseState.BetweenAttributes;\r
1625                             continue;\r
1626                         }\r
1627 \r
1628                         if (_c == '>')\r
1629                         {\r
1630                             PushAttributeValueEnd(_index - 1);\r
1631                             if (!PushNodeEnd(_index, false))\r
1632                             {\r
1633                                 // stop parsing\r
1634                                 _index = _text.Length;\r
1635                                 break;\r
1636                             }\r
1637                             if (_state != ParseState.AttributeValue)\r
1638                                 continue;\r
1639                             _state = ParseState.Text;\r
1640                             PushNodeStart(HtmlNodeType.Text, _index);\r
1641                             continue;\r
1642                         }\r
1643                         break;\r
1644 \r
1645                     case ParseState.QuotedAttributeValue:\r
1646                         if (_c == lastquote)\r
1647                         {\r
1648                             PushAttributeValueEnd(_index - 1);\r
1649                             _state = ParseState.BetweenAttributes;\r
1650                             continue;\r
1651                         }\r
1652                         if (_c == '<')\r
1653                         {\r
1654                             if (_index < _text.Length)\r
1655                             {\r
1656                                 if (_text[_index] == '%')\r
1657                                 {\r
1658                                     _oldstate = _state;\r
1659                                     _state = ParseState.ServerSideCode;\r
1660                                     continue;\r
1661                                 }\r
1662                             }\r
1663                         }\r
1664                         break;\r
1665 \r
1666                     case ParseState.Comment:\r
1667                         if (_c == '>')\r
1668                         {\r
1669                             if (_fullcomment)\r
1670                             {\r
1671                                 if ((_text[_index - 2] != '-') ||\r
1672                                     (_text[_index - 3] != '-'))\r
1673                                 {\r
1674                                     continue;\r
1675                                 }\r
1676                             }\r
1677                             if (!PushNodeEnd(_index, false))\r
1678                             {\r
1679                                 // stop parsing\r
1680                                 _index = _text.Length;\r
1681                                 break;\r
1682                             }\r
1683                             _state = ParseState.Text;\r
1684                             PushNodeStart(HtmlNodeType.Text, _index);\r
1685                             continue;\r
1686                         }\r
1687                         break;\r
1688 \r
1689                     case ParseState.ServerSideCode:\r
1690                         if (_c == '%')\r
1691                         {\r
1692                             if (_index < _text.Length)\r
1693                             {\r
1694                                 if (_text[_index] == '>')\r
1695                                 {\r
1696                                     switch (_oldstate)\r
1697                                     {\r
1698                                         case ParseState.AttributeAfterEquals:\r
1699                                             _state = ParseState.AttributeValue;\r
1700                                             break;\r
1701 \r
1702                                         case ParseState.BetweenAttributes:\r
1703                                             PushAttributeNameEnd(_index + 1);\r
1704                                             _state = ParseState.BetweenAttributes;\r
1705                                             break;\r
1706 \r
1707                                         default:\r
1708                                             _state = _oldstate;\r
1709                                             break;\r
1710                                     }\r
1711                                     IncrementPosition();\r
1712                                 }\r
1713                             }\r
1714                         }\r
1715                         break;\r
1716 \r
1717                     case ParseState.PcData:\r
1718                         // look for </tag + 1 char\r
1719 \r
1720                         // check buffer end\r
1721                         if ((_currentnode._namelength + 3) <= (_text.Length - (_index - 1)))\r
1722                         {\r
1723                             if (string.Compare(_text.Substring(_index - 1, _currentnode._namelength + 2),\r
1724                                                "</" + _currentnode.Name, true) == 0)\r
1725                             {\r
1726                                 int c = _text[_index - 1 + 2 + _currentnode.Name.Length];\r
1727                                 if ((c == '>') || (IsWhiteSpace(c)))\r
1728                                 {\r
1729                                     // add the script as a text node\r
1730                                     HtmlNode script = CreateNode(HtmlNodeType.Text,\r
1731                                                                  _currentnode._outerstartindex +\r
1732                                                                  _currentnode._outerlength);\r
1733                                     script._outerlength = _index - 1 - script._outerstartindex;\r
1734                                     _currentnode.AppendChild(script);\r
1735 \r
1736 \r
1737                                     PushNodeStart(HtmlNodeType.Element, _index - 1);\r
1738                                     PushNodeNameStart(false, _index - 1 + 2);\r
1739                                     _state = ParseState.Tag;\r
1740                                     IncrementPosition();\r
1741                                 }\r
1742                             }\r
1743                         }\r
1744                         break;\r
1745                 }\r
1746             }\r
1747 \r
1748             // finish the current work\r
1749             if (_currentnode._namestartindex > 0)\r
1750             {\r
1751                 PushNodeNameEnd(_index);\r
1752             }\r
1753             PushNodeEnd(_index, false);\r
1754 \r
1755             // we don't need this anymore\r
1756             _lastnodes.Clear();\r
1757         }\r
1758 \r
/// <summary>
/// Closes the name of the attribute currently being parsed and attaches
/// that attribute to the current node.
/// </summary>
/// <param name="index">Position in the text at which the attribute name ends.</param>
private void PushAttributeNameEnd(int index)
{
    // the name length is measured from the start index recorded earlier
    int nameStart = _currentattribute._namestartindex;
    _currentattribute._namelength = index - nameStart;

    _currentnode.Attributes.Append(_currentattribute);
}
1764 \r
/// <summary>
/// Begins a new attribute: creates it, makes it the current attribute and
/// stamps it with the name start index and the current line information.
/// </summary>
/// <param name="index">Position in the text at which the attribute name starts.</param>
private void PushAttributeNameStart(int index)
{
    HtmlAttribute started = CreateAttribute();
    _currentattribute = started;

    // positional information
    started._namestartindex = index;
    started._streamposition = index;

    // line information taken from the parser's current position
    started.Line = _line;
    started._lineposition = _lineposition;
}
1773 \r
/// <summary>
/// Records the length of the current attribute's value, measured from the
/// value start index to <paramref name="index"/>.
/// </summary>
/// <param name="index">Position in the text at which the attribute value ends.</param>
private void PushAttributeValueEnd(int index)
{
    int valueStart = _currentattribute._valuestartindex;
    _currentattribute._valuelength = index - valueStart;
}
1778 \r
/// <summary>
/// Marks the start of an attribute value that is not introduced by a quote
/// character (forwards to the two-argument overload with quote 0).
/// </summary>
/// <param name="index">Position in the text at which the attribute value starts.</param>
private void PushAttributeValueStart(int index)
{
    const int noQuote = 0;
    PushAttributeValueStart(index, noQuote);
}
1783 \r
/// <summary>
/// Marks the start of the current attribute's value and, when the value is
/// introduced by a single quote, records that quote type on the attribute.
/// </summary>
/// <param name="index">Position in the text at which the attribute value starts.</param>
/// <param name="quote">The quote character introducing the value, or 0 when there is none.</param>
private void PushAttributeValueStart(int index, int quote)
{
    _currentattribute._valuestartindex = index;

    // only the single quote is recorded explicitly; any other value leaves QuoteType untouched
    bool singleQuoted = quote == '\'';
    if (!singleQuoted)
    {
        return;
    }
    _currentattribute.QuoteType = AttributeValueQuote.SingleQuote;
}
1790 \r
/// <summary>
/// Finishes the node currently being parsed: fixes its outer length, attaches
/// it to the last parent node when appropriate and closes it if required.
/// </summary>
/// <param name="index">Position in the text at which the node ends.</param>
/// <param name="close">True to request that the node be closed.</param>
/// <returns>
/// False when the node matches <c>OptionStopperNodeName</c> and parsing must stop;
/// true in every other case.
/// </returns>
private bool PushNodeEnd(int index, bool close)
{
    _currentnode._outerlength = index - _currentnode._outerstartindex;

    bool textOrComment = (_currentnode._nodetype == HtmlNodeType.Text) ||
                         (_currentnode._nodetype == HtmlNodeType.Comment);

    if (textOrComment)
    {
        // forget about void nodes
        if (_currentnode._outerlength > 0)
        {
            // for text and comments the inner span equals the outer span
            _currentnode._innerlength = _currentnode._outerlength;
            _currentnode._innerstartindex = _currentnode._outerstartindex;
            if (_lastparentnode != null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
        }
    }
    else if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
    {
        // add to parent node
        if (_lastparentnode != null)
        {
            _lastparentnode.AppendChild(_currentnode);
        }

        ReadDocumentEncoding(_currentnode);

        // remember last node of this kind
        string nodeName = _currentnode.Name;
        _currentnode._prevwithsamename = (HtmlNode) _lastnodes[nodeName];
        _lastnodes[nodeName] = _currentnode;

        // change parent?
        HtmlNodeType kind = _currentnode.NodeType;
        if ((kind == HtmlNodeType.Document) || (kind == HtmlNodeType.Element))
        {
            _lastparentnode = _currentnode;
        }

        // CDATA-style elements switch the parser to PcData and are left open
        if (HtmlNode.IsCDataElement(CurrentNodeName()))
        {
            _state = ParseState.PcData;
            return true;
        }

        // closed or empty elements are closed right away
        if ((HtmlNode.IsClosedElement(nodeName)) || (HtmlNode.IsEmptyElement(nodeName)))
        {
            close = true;
        }
    }

    // a start tag stays open unless a close was requested or implied above
    if ((!close) && (_currentnode._starttag))
    {
        return true;
    }

    bool reachedStopper = (OptionStopperNodeName != null) && (_remainder == null) &&
                          (string.Compare(_currentnode.Name, OptionStopperNodeName, true) == 0);
    if (reachedStopper)
    {
        // keep everything from this position on as the remainder
        _remainderOffset = index;
        _remainder = _text.Substring(_remainderOffset);
    }

    CloseCurrentNode();
    return !reachedStopper; // false means: stop parsing
}
1861 \r
/// <summary>
/// Records the length of the current node's name and, when
/// <c>OptionFixNestedTags</c> is set, runs <c>FixNestedTags</c>.
/// </summary>
/// <param name="index">Position in the text at which the node name ends.</param>
private void PushNodeNameEnd(int index)
{
    int nameStart = _currentnode._namestartindex;
    _currentnode._namelength = index - nameStart;

    if (!OptionFixNestedTags)
    {
        return;
    }
    FixNestedTags();
}
1870 \r
/// <summary>
/// Marks where the current node's name begins and whether the node is a start tag.
/// </summary>
/// <param name="starttag">Value stored in the node's start-tag flag.</param>
/// <param name="index">Position in the text at which the node name starts.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    _currentnode._namestartindex = index;
    _currentnode._starttag = starttag;
}
1876 \r
/// <summary>
/// Creates a node of the given type starting at <paramref name="index"/>,
/// stamps it with the current line information and makes it the current node.
/// </summary>
/// <param name="type">Type of the node to create.</param>
/// <param name="index">Position in the text at which the node starts.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    HtmlNode started = CreateNode(type, index);

    started._line = _line;
    // element nodes are recorded one position earlier than the current line position
    started._lineposition = (type == HtmlNodeType.Element) ? _lineposition - 1 : _lineposition;
    started._streamposition = index;

    _currentnode = started;
}
1888 \r
/// <summary>
/// Looks for a declaration of the form
/// <c>&lt;meta http-equiv="content-type" content="text/html;charset=iso-8859-1" /&gt;</c>
/// on the given node and, when one is found, stores the declared encoding.
/// Does nothing when <c>OptionReadEncoding</c> is false.
/// </summary>
/// <remarks>
/// A charset name that the runtime cannot resolve is ignored: the declared
/// encoding is left unchanged and parsing continues, instead of letting the
/// <see cref="ArgumentException"/> from <see cref="Encoding.GetEncoding(string)"/>
/// abort the whole parse.
/// </remarks>
/// <param name="node">The node whose attributes are examined.</param>
/// <exception cref="EncodingFoundException">
/// Thrown when only encoding detection was requested and a charset was resolved.
/// </exception>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
        return;

    // when we append a child, we are in node end, so attributes are already populated
    if (node._namelength != 4) // quick check, avoids string alloc
        return;
    if (node.Name != "meta") // all nodes names are lowercase
        return;

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
        return;

    // protocol token: compare ordinally, not with the current culture
    if (!string.Equals(att.Value, "content-type", StringComparison.OrdinalIgnoreCase))
        return;

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
        return;

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
        return;
    charset = charset.Trim();
    if (charset.Length == 0)
        return;

    Encoding declared;
    try
    {
        declared = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // unknown or unsupported charset name in the document: ignore the declaration
        return;
    }
    _declaredencoding = declared;

    if (_onlyDetectEncoding)
    {
        // detection mode: report the encoding by unwinding the parse
        throw new EncodingFoundException(_declaredencoding);
    }

    if (_streamencoding == null)
        return;
    if (_declaredencoding.WindowsCodePage == _streamencoding.WindowsCodePage)
        return;

    AddError(
        HtmlParseErrorCode.CharsetMismatch,
        _line, _lineposition,
        _index, node.OuterHtml,
        "Encoding mismatch between StreamEncoding: " +
        _streamencoding.WebName + " and DeclaredEncoding: " +
        _declaredencoding.WebName);
}
1938 \r
1939         #endregion\r
1940 \r
1941         #region Nested type: ParseState\r
1942 \r
/// <summary>
/// States of the parsing state machine. The numeric values are the ones the
/// compiler would assign implicitly; they are spelled out for readability only.
/// </summary>
private enum ParseState
{
    // plain text; entered again after a node end, together with a new text node
    Text = 0,
    WhichTag = 1,
    Tag = 2,
    BetweenAttributes = 3,
    EmptyTag = 4,
    // inside an attribute name; '=' moves to AttributeAfterEquals, '>' ends the node
    AttributeName = 5,
    // skipping whitespace, waiting for '=' or '>'
    AttributeBeforeEquals = 6,
    // after '=', waiting for a quote, '>' or the first character of an unquoted value
    AttributeAfterEquals = 7,
    // unquoted attribute value; ends at whitespace or '>'
    AttributeValue = 8,
    // inside a comment; ends at '>' (which must follow "--" for a full comment)
    Comment = 9,
    // quoted attribute value; ends at the matching quote
    QuotedAttributeValue = 10,
    // inside a server-side block; "%>" leaves it, using the state saved on entry
    ServerSideCode = 11,
    // content of a CDATA-style element; scanned for the closing tag of the current node
    PcData = 12
}
1959 \r
1960         #endregion\r
1961     }\r
1962 }