1 // HtmlAgilityPack V1.0 - Simon Mourier <simon underscore mourier at hotmail dot com>
\r
3 using System.Collections;
\r
4 using System.Collections.Generic;
\r
7 using System.Text.RegularExpressions;
\r
9 using System.Xml.XPath;
\r
11 namespace HtmlAgilityPack
\r
14 /// Represents a complete HTML document.
\r
16 public class HtmlDocument : IXPathNavigable
\r
21 private Crc32 _crc32;
\r
22 private HtmlAttribute _currentattribute;
\r
23 private HtmlNode _currentnode;
\r
24 private Encoding _declaredencoding;
\r
25 private HtmlNode _documentnode;
\r
26 private bool _fullcomment;
\r
28 internal Hashtable _lastnodes = new Hashtable();
\r
29 private HtmlNode _lastparentnode;
\r
31 private int _lineposition, _maxlineposition;
\r
32 internal Hashtable _nodesid;
\r
33 private ParseState _oldstate;
\r
34 private bool _onlyDetectEncoding;
\r
35 internal Hashtable _openednodes;
\r
36 private List<HtmlParseError> _parseerrors = new List<HtmlParseError>();
\r
37 private string _remainder;
\r
38 private int _remainderOffset;
\r
39 private ParseState _state;
\r
40 private Encoding _streamencoding;
\r
41 internal string _text;
\r
46 /// Adds Debugging attributes to node. Default is false.
\r
48 public bool OptionAddDebuggingAttributes;
\r
51 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
\r
52 /// Setting this to true can actually change how browsers render the page. Default is false.
\r
54 public bool OptionAutoCloseOnEnd; // close errors at the end
\r
57 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
\r
59 public bool OptionCheckSyntax = true;
\r
62 /// Defines if a checksum must be computed for the document while parsing. Default is false.
\r
64 public bool OptionComputeChecksum;
\r
67 /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
\r
69 public Encoding OptionDefaultStreamEncoding = Encoding.Default;
\r
72 /// Defines if source text must be extracted while parsing errors.
\r
73 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
\r
74 /// Default is false.
\r
76 public bool OptionExtractErrorSourceText;
\r
78 // turning this on can dramatically slow performance if a lot of errors are detected
\r
81 /// Defines the maximum length of the source text extracted for parse errors. Default is 100.
\r
83 public int OptionExtractErrorSourceTextMaxLength = 100;
\r
86 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
\r
88 public bool OptionFixNestedTags; // fix li, tr, th, td tags
\r
91 /// Defines if output must conform to XML, instead of HTML.
\r
93 public bool OptionOutputAsXml;
\r
96 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
\r
98 public bool OptionOutputOptimizeAttributeValues;
\r
101 /// Defines if name must be output with its original case. Useful for asp.net tags and attributes.
\r
103 public bool OptionOutputOriginalCase;
\r
106 /// Defines if name must be output in uppercase. Default is false.
\r
108 public bool OptionOutputUpperCase;
\r
111 /// Defines if declared encoding must be read from the document.
\r
112 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
\r
113 /// Default is true.
\r
115 public bool OptionReadEncoding = true;
\r
118 /// Defines the name of a node that will throw the StopperNodeException when found as an end node. Default is null.
\r
120 public string OptionStopperNodeName;
\r
123 /// Defines if the 'id' attribute must be specifically used. Default is true.
\r
125 public bool OptionUseIdAttribute = true;
\r
128 /// Defines if empty nodes must be written as closed during output. Default is false.
\r
130 public bool OptionWriteEmptyNodes;
\r
134 #region Static Members
\r
136 internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
\r
138 internal static readonly string HtmlExceptionUseIdAttributeFalse =
\r
139 "You need to set UseIdAttribute property to true to enable this feature";
\r
143 #region Constructors
\r
146 /// Creates an instance of an HTML document.
\r
148 public HtmlDocument()
\r
150 _documentnode = CreateNode(HtmlNodeType.Document, 0);
\r
158 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
\r
160 public int CheckSum
\r
164 if (_crc32 == null)
\r
170 return (int) _crc32.CheckSum;
\r
176 /// Gets the document's declared encoding.
\r
177 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
\r
179 public Encoding DeclaredEncoding
\r
181 get { return _declaredencoding; }
\r
185 /// Gets the root node of the document.
\r
187 public HtmlNode DocumentNode
\r
189 get { return _documentnode; }
\r
193 /// Gets the document's output encoding.
\r
195 public Encoding Encoding
\r
197 get { return GetOutEncoding(); }
\r
201 /// Gets a list of parse errors found in the document.
\r
203 public IEnumerable<HtmlParseError> ParseErrors
\r
205 get { return _parseerrors; }
\r
209 /// Gets the remaining text.
\r
210 /// Will always be null if OptionStopperNodeName is null.
\r
212 public string Remainder
\r
214 get { return _remainder; }
\r
218 /// Gets the offset of Remainder in the original Html text.
\r
219 /// If OptionStopperNodeName is null, this will return the length of the original Html text.
\r
221 public int RemainderOffset
\r
223 get { return _remainderOffset; }
\r
227 /// Gets the document's stream encoding.
\r
229 public Encoding StreamEncoding
\r
231 get { return _streamencoding; }
\r
236 #region IXPathNavigable Members
\r
239 /// Creates a new XPathNavigator object for navigating this HTML document.
\r
241 /// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
\r
242 public XPathNavigator CreateNavigator()
\r
244 return new HtmlNodeNavigator(this, _documentnode);
\r
249 #region Public Methods
\r
252 /// Gets a valid XML name.
\r
254 /// <param name="name">Any text.</param>
\r
255 /// <returns>A string that is a valid XML name.</returns>
\r
256 public static string GetXmlName(string name)
\r
258 string xmlname = string.Empty;
\r
259 bool nameisok = true;
\r
260 for (int i = 0; i < name.Length; i++)
\r
263 // note: we are very limited here, too much?
\r
264 if (((name[i] >= 'a') && (name[i] <= 'z')) ||
\r
265 ((name[i] >= '0') && (name[i] <= '9')) ||
\r
266 // (name[i]==':') || (name[i]=='_') || (name[i]=='-') || (name[i]=='.')) // these are bad in fact
\r
267 (name[i] == '_') || (name[i] == '-') || (name[i] == '.'))
\r
269 xmlname += name[i];
\r
274 byte[] bytes = Encoding.UTF8.GetBytes(new char[] {name[i]});
\r
275 for (int j = 0; j < bytes.Length; j++)
\r
277 xmlname += bytes[j].ToString("x2");
\r
286 return "_" + xmlname;
\r
290 /// Applies HTML encoding to a specified string.
\r
292 /// <param name="html">The input string to encode. May not be null.</param>
\r
293 /// <returns>The encoded string.</returns>
\r
294 public static string HtmlEncode(string html)
\r
298 throw new ArgumentNullException("html");
\r
300 // replace & by &amp;, but only once: an & that already starts &amp;, &lt;, &gt; or &quot; is left alone
\r
301 Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);
\r
302 return rx.Replace(html, "&").Replace("<", "<").Replace(">", ">").Replace("\"", """);
\r
306 /// Determines if the specified character is considered as a whitespace character.
\r
308 /// <param name="c">The character to check.</param>
\r
309 /// <returns>true if the specified character is considered as a whitespace character.</returns>
\r
310 public static bool IsWhiteSpace(int c)
\r
312 if ((c == 10) || (c == 13) || (c == 32) || (c == 9))
\r
320 /// Creates an HTML attribute with the specified name.
\r
322 /// <param name="name">The name of the attribute. May not be null.</param>
\r
323 /// <returns>The new HTML attribute.</returns>
\r
324 public HtmlAttribute CreateAttribute(string name)
\r
328 throw new ArgumentNullException("name");
\r
330 HtmlAttribute att = CreateAttribute();
\r
336 /// Creates an HTML attribute with the specified name.
\r
338 /// <param name="name">The name of the attribute. May not be null.</param>
\r
339 /// <param name="value">The value of the attribute.</param>
\r
340 /// <returns>The new HTML attribute.</returns>
\r
341 public HtmlAttribute CreateAttribute(string name, string value)
\r
345 throw new ArgumentNullException("name");
\r
347 HtmlAttribute att = CreateAttribute(name);
\r
353 /// Creates an HTML comment node.
\r
355 /// <returns>The new HTML comment node.</returns>
\r
356 public HtmlCommentNode CreateComment()
\r
358 return (HtmlCommentNode) CreateNode(HtmlNodeType.Comment);
\r
362 /// Creates an HTML comment node with the specified comment text.
\r
364 /// <param name="comment">The comment text. May not be null.</param>
\r
365 /// <returns>The new HTML comment node.</returns>
\r
366 public HtmlCommentNode CreateComment(string comment)
\r
368 if (comment == null)
\r
370 throw new ArgumentNullException("comment");
\r
372 HtmlCommentNode c = CreateComment();
\r
373 c.Comment = comment;
\r
378 /// Creates an HTML element node with the specified name.
\r
380 /// <param name="name">The qualified name of the element. May not be null.</param>
\r
381 /// <returns>The new HTML node.</returns>
\r
382 public HtmlNode CreateElement(string name)
\r
386 throw new ArgumentNullException("name");
\r
388 HtmlNode node = CreateNode(HtmlNodeType.Element);
\r
394 /// Creates an HTML text node.
\r
396 /// <returns>The new HTML text node.</returns>
\r
397 public HtmlTextNode CreateTextNode()
\r
399 return (HtmlTextNode) CreateNode(HtmlNodeType.Text);
\r
403 /// Creates an HTML text node with the specified text.
\r
405 /// <param name="text">The text of the node. May not be null.</param>
\r
406 /// <returns>The new HTML text node.</returns>
\r
407 public HtmlTextNode CreateTextNode(string text)
\r
411 throw new ArgumentNullException("text");
\r
413 HtmlTextNode t = CreateTextNode();
\r
419 /// Detects the encoding of an HTML stream.
\r
421 /// <param name="stream">The input stream. May not be null.</param>
\r
422 /// <returns>The detected encoding.</returns>
\r
423 public Encoding DetectEncoding(Stream stream)
\r
425 if (stream == null)
\r
427 throw new ArgumentNullException("stream");
\r
429 return DetectEncoding(new StreamReader(stream));
\r
433 /// Detects the encoding of an HTML file.
\r
435 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
\r
436 /// <returns>The detected encoding.</returns>
\r
437 public Encoding DetectEncoding(string path)
\r
441 throw new ArgumentNullException("path");
\r
443 StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
\r
444 Encoding encoding = DetectEncoding(sr);
\r
450 /// Detects the encoding of an HTML text provided on a TextReader.
\r
452 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
\r
453 /// <returns>The detected encoding.</returns>
\r
454 public Encoding DetectEncoding(TextReader reader)
\r
456 if (reader == null)
\r
458 throw new ArgumentNullException("reader");
\r
460 _onlyDetectEncoding = true;
\r
461 if (OptionCheckSyntax)
\r
463 _openednodes = new Hashtable();
\r
467 _openednodes = null;
\r
470 if (OptionUseIdAttribute)
\r
472 _nodesid = new Hashtable();
\r
479 StreamReader sr = reader as StreamReader;
\r
482 _streamencoding = sr.CurrentEncoding;
\r
486 _streamencoding = null;
\r
488 _declaredencoding = null;
\r
490 _text = reader.ReadToEnd();
\r
491 _documentnode = CreateNode(HtmlNodeType.Document, 0);
\r
493 // this is almost a hack, but it allows us not to muck with the original parsing code
\r
498 catch (EncodingFoundException ex)
\r
500 return ex.Encoding;
\r
506 /// Detects the encoding of an HTML document from a file first, and then loads the file.
\r
508 /// <param name="path">The complete file path to be read.</param>
\r
509 public void DetectEncodingAndLoad(string path)
\r
511 DetectEncodingAndLoad(path, true);
\r
515 /// Detects the encoding of an HTML document from a file first, and then loads the file.
\r
517 /// <param name="path">The complete file path to be read. May not be null.</param>
\r
518 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
\r
519 public void DetectEncodingAndLoad(string path, bool detectEncoding)
\r
523 throw new ArgumentNullException("path");
\r
526 if (detectEncoding)
\r
528 enc = DetectEncoding(path);
\r
546 /// Detects the encoding of an HTML text.
\r
548 /// <param name="html">The input html text. May not be null.</param>
\r
549 /// <returns>The detected encoding.</returns>
\r
550 public Encoding DetectEncodingHtml(string html)
\r
554 throw new ArgumentNullException("html");
\r
556 StringReader sr = new StringReader(html);
\r
557 Encoding encoding = DetectEncoding(sr);
\r
563 /// Gets the HTML node with the specified 'id' attribute value.
\r
565 /// <param name="id">The attribute id to match. May not be null.</param>
\r
566 /// <returns>The HTML node with the matching id or null if not found.</returns>
\r
567 public HtmlNode GetElementbyId(string id)
\r
571 throw new ArgumentNullException("id");
\r
573 if (_nodesid == null)
\r
575 throw new Exception(HtmlExceptionUseIdAttributeFalse);
\r
578 return _nodesid[id.ToLower()] as HtmlNode;
\r
582 /// Loads an HTML document from a stream.
\r
584 /// <param name="stream">The input stream.</param>
\r
585 public void Load(Stream stream)
\r
587 Load(new StreamReader(stream, OptionDefaultStreamEncoding));
\r
591 /// Loads an HTML document from a stream.
\r
593 /// <param name="stream">The input stream.</param>
\r
594 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
\r
595 public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
\r
597 Load(new StreamReader(stream, detectEncodingFromByteOrderMarks));
\r
601 /// Loads an HTML document from a stream.
\r
603 /// <param name="stream">The input stream.</param>
\r
604 /// <param name="encoding">The character encoding to use.</param>
\r
605 public void Load(Stream stream, Encoding encoding)
\r
607 Load(new StreamReader(stream, encoding));
\r
611 /// Loads an HTML document from a stream.
\r
613 /// <param name="stream">The input stream.</param>
\r
614 /// <param name="encoding">The character encoding to use.</param>
\r
615 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
\r
616 public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
\r
618 Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks));
\r
622 /// Loads an HTML document from a stream.
\r
624 /// <param name="stream">The input stream.</param>
\r
625 /// <param name="encoding">The character encoding to use.</param>
\r
626 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
\r
627 /// <param name="buffersize">The minimum buffer size.</param>
\r
628 public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
\r
630 Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize));
\r
634 /// Loads an HTML document from a file.
\r
636 /// <param name="path">The complete file path to be read. May not be null.</param>
\r
637 public void Load(string path)
\r
641 throw new ArgumentNullException("path");
\r
643 StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
\r
649 /// Loads an HTML document from a file.
\r
651 /// <param name="path">The complete file path to be read. May not be null.</param>
\r
652 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
\r
653 public void Load(string path, bool detectEncodingFromByteOrderMarks)
\r
657 throw new ArgumentNullException("path");
\r
659 StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks);
\r
665 /// Loads an HTML document from a file.
\r
667 /// <param name="path">The complete file path to be read. May not be null.</param>
\r
668 /// <param name="encoding">The character encoding to use. May not be null.</param>
\r
669 public void Load(string path, Encoding encoding)
\r
673 throw new ArgumentNullException("path");
\r
675 if (encoding == null)
\r
677 throw new ArgumentNullException("encoding");
\r
679 StreamReader sr = new StreamReader(path, encoding);
\r
685 /// Loads an HTML document from a file.
\r
687 /// <param name="path">The complete file path to be read. May not be null.</param>
\r
688 /// <param name="encoding">The character encoding to use. May not be null.</param>
\r
689 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
\r
690 public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
\r
694 throw new ArgumentNullException("path");
\r
696 if (encoding == null)
\r
698 throw new ArgumentNullException("encoding");
\r
700 StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks);
\r
706 /// Loads an HTML document from a file.
\r
708 /// <param name="path">The complete file path to be read. May not be null.</param>
\r
709 /// <param name="encoding">The character encoding to use. May not be null.</param>
\r
710 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
\r
711 /// <param name="buffersize">The minimum buffer size.</param>
\r
712 public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
\r
716 throw new ArgumentNullException("path");
\r
718 if (encoding == null)
\r
720 throw new ArgumentNullException("encoding");
\r
722 StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize);
\r
728 /// Loads the HTML document from the specified TextReader.
\r
730 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
\r
731 public void Load(TextReader reader)
\r
733 // all Load methods pass down to this one
\r
734 if (reader == null)
\r
736 throw new ArgumentNullException("reader");
\r
739 _onlyDetectEncoding = false;
\r
741 if (OptionCheckSyntax)
\r
743 _openednodes = new Hashtable();
\r
747 _openednodes = null;
\r
750 if (OptionUseIdAttribute)
\r
752 _nodesid = new Hashtable();
\r
759 StreamReader sr = reader as StreamReader;
\r
764 // trigger bom read if needed
\r
767 // ReSharper disable EmptyGeneralCatchClause
\r
769 // ReSharper restore EmptyGeneralCatchClause
\r
773 _streamencoding = sr.CurrentEncoding;
\r
777 _streamencoding = null;
\r
779 _declaredencoding = null;
\r
781 _text = reader.ReadToEnd();
\r
782 _documentnode = CreateNode(HtmlNodeType.Document, 0);
\r
785 if (OptionCheckSyntax)
\r
787 foreach (HtmlNode node in _openednodes.Values)
\r
789 if (!node._starttag) // already reported
\r
795 if (OptionExtractErrorSourceText)
\r
797 html = node.OuterHtml;
\r
798 if (html.Length > OptionExtractErrorSourceTextMaxLength)
\r
800 html = html.Substring(0, OptionExtractErrorSourceTextMaxLength);
\r
805 html = string.Empty;
\r
808 HtmlParseErrorCode.TagNotClosed,
\r
809 node._line, node._lineposition,
\r
810 node._streamposition, html,
\r
811 "End tag </" + node.Name + "> was not found");
\r
814 // we don't need this anymore
\r
815 _openednodes.Clear();
\r
820 /// Loads the HTML document from the specified string.
\r
822 /// <param name="html">String containing the HTML document to load. May not be null.</param>
\r
823 public void LoadHtml(string html)
\r
827 throw new ArgumentNullException("html");
\r
829 StringReader sr = new StringReader(html);
\r
835 /// Saves the HTML document to the specified stream.
\r
837 /// <param name="outStream">The stream to which you want to save.</param>
\r
838 public void Save(Stream outStream)
\r
840 StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
\r
845 /// Saves the HTML document to the specified stream.
\r
847 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
\r
848 /// <param name="encoding">The character encoding to use. May not be null.</param>
\r
849 public void Save(Stream outStream, Encoding encoding)
\r
851 if (outStream == null)
\r
853 throw new ArgumentNullException("outStream");
\r
855 if (encoding == null)
\r
857 throw new ArgumentNullException("encoding");
\r
859 StreamWriter sw = new StreamWriter(outStream, encoding);
\r
864 /// Saves the HTML document to the specified file.
\r
866 /// <param name="filename">The location of the file where you want to save the document.</param>
\r
867 public void Save(string filename)
\r
869 StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding());
\r
875 /// Saves the HTML document to the specified file.
\r
877 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
\r
878 /// <param name="encoding">The character encoding to use. May not be null.</param>
\r
879 public void Save(string filename, Encoding encoding)
\r
881 if (filename == null)
\r
883 throw new ArgumentNullException("filename");
\r
885 if (encoding == null)
\r
887 throw new ArgumentNullException("encoding");
\r
889 StreamWriter sw = new StreamWriter(filename, false, encoding);
\r
895 /// Saves the HTML document to the specified StreamWriter.
\r
897 /// <param name="writer">The StreamWriter to which you want to save.</param>
\r
898 public void Save(StreamWriter writer)
\r
900 Save((TextWriter) writer);
\r
904 /// Saves the HTML document to the specified TextWriter.
\r
906 /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
\r
907 public void Save(TextWriter writer)
\r
909 if (writer == null)
\r
911 throw new ArgumentNullException("writer");
\r
913 DocumentNode.WriteTo(writer);
\r
917 /// Saves the HTML document to the specified XmlWriter.
\r
919 /// <param name="writer">The XmlWriter to which you want to save.</param>
\r
920 public void Save(XmlWriter writer)
\r
922 DocumentNode.WriteTo(writer);
\r
928 #region Internal Methods
\r
930 internal HtmlAttribute CreateAttribute()
\r
932 return new HtmlAttribute(this);
\r
935 internal HtmlNode CreateNode(HtmlNodeType type)
\r
937 return CreateNode(type, -1);
\r
940 internal HtmlNode CreateNode(HtmlNodeType type, int index)
\r
944 case HtmlNodeType.Comment:
\r
945 return new HtmlCommentNode(this, index);
\r
947 case HtmlNodeType.Text:
\r
948 return new HtmlTextNode(this, index);
\r
951 return new HtmlNode(type, this, index);
\r
955 internal Encoding GetOutEncoding()
\r
957 // prefer the declared encoding; when it is unspecified, use the stream encoding, then the default stream encoding
\r
958 if (_declaredencoding != null)
\r
960 return _declaredencoding;
\r
964 if (_streamencoding != null)
\r
966 return _streamencoding;
\r
969 return OptionDefaultStreamEncoding;
\r
972 internal HtmlNode GetXmlDeclaration()
\r
974 if (!_documentnode.HasChildNodes)
\r
979 foreach (HtmlNode node in _documentnode._childnodes)
\r
981 if (node.Name == "?xml") // it's ok, names are case sensitive
\r
989 internal void SetIdForNode(HtmlNode node, string id)
\r
991 if (!OptionUseIdAttribute)
\r
996 if ((_nodesid == null) || (id == null))
\r
1003 _nodesid.Remove(id.ToLower());
\r
1007 _nodesid[id.ToLower()] = node;
\r
1011 internal void UpdateLastParentNode()
\r
1015 if (_lastparentnode.Closed)
\r
1017 _lastparentnode = _lastparentnode.ParentNode;
\r
1019 } while ((_lastparentnode != null) && (_lastparentnode.Closed));
\r
1020 if (_lastparentnode == null)
\r
1022 _lastparentnode = _documentnode;
\r
1028 #region Private Methods
\r
1030 private HtmlParseError AddError(
\r
1031 HtmlParseErrorCode code,
\r
1034 int streamPosition,
\r
1035 string sourceText,
\r
1038 HtmlParseError err = new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
\r
1039 _parseerrors.Add(err);
\r
1043 private void CloseCurrentNode()
\r
1045 if (_currentnode.Closed) // text or document are by def closed
\r
1048 bool error = false;
\r
1050 // find last node of this kind
\r
1051 HtmlNode prev = (HtmlNode) _lastnodes[_currentnode.Name];
\r
1054 if (HtmlNode.IsClosedElement(_currentnode.Name))
\r
1056 // </br> will be seen as <br>
\r
1057 _currentnode.CloseNode(_currentnode);
\r
1059 // add to parent node
\r
1060 if (_lastparentnode != null)
\r
1062 HtmlNode foundNode = null;
\r
1063 Stack futureChild = new Stack();
\r
1064 for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
\r
1066 if ((node.Name == _currentnode.Name) && (!node.HasChildNodes))
\r
1071 futureChild.Push(node);
\r
1073 if (foundNode != null)
\r
1075 HtmlNode node = null;
\r
1076 while (futureChild.Count != 0)
\r
1078 node = (HtmlNode) futureChild.Pop();
\r
1079 _lastparentnode.RemoveChild(node);
\r
1080 foundNode.AppendChild(node);
\r
1085 _lastparentnode.AppendChild(_currentnode);
\r
1091 // node has no parent
\r
1092 // node is not a closed node
\r
1094 if (HtmlNode.CanOverlapElement(_currentnode.Name))
\r
1096 // this is a hack: add it as a text node
\r
1097 HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
\r
1098 closenode._outerlength = _currentnode._outerlength;
\r
1099 ((HtmlTextNode) closenode).Text = ((HtmlTextNode) closenode).Text.ToLower();
\r
1100 if (_lastparentnode != null)
\r
1102 _lastparentnode.AppendChild(closenode);
\r
1107 if (HtmlNode.IsEmptyElement(_currentnode.Name))
\r
1110 HtmlParseErrorCode.EndTagNotRequired,
\r
1111 _currentnode._line, _currentnode._lineposition,
\r
1112 _currentnode._streamposition, _currentnode.OuterHtml,
\r
1113 "End tag </" + _currentnode.Name + "> is not required");
\r
1117 // node cannot overlap, node is not empty
\r
1119 HtmlParseErrorCode.TagNotOpened,
\r
1120 _currentnode._line, _currentnode._lineposition,
\r
1121 _currentnode._streamposition, _currentnode.OuterHtml,
\r
1122 "Start tag <" + _currentnode.Name + "> was not found");
\r
1130 if (OptionFixNestedTags)
\r
1132 if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
\r
1135 HtmlParseErrorCode.EndTagInvalidHere,
\r
1136 _currentnode._line, _currentnode._lineposition,
\r
1137 _currentnode._streamposition, _currentnode.OuterHtml,
\r
1138 "End tag </" + _currentnode.Name + "> invalid here");
\r
1145 _lastnodes[_currentnode.Name] = prev._prevwithsamename;
\r
1146 prev.CloseNode(_currentnode);
\r
1151 // we close this node, get grandparent
\r
1154 if ((_lastparentnode != null) &&
\r
1155 ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
\r
1156 (_currentnode._starttag)))
\r
1158 UpdateLastParentNode();
\r
1163 private string CurrentAttributeName()
\r
1165 return _text.Substring(_currentattribute._namestartindex, _currentattribute._namelength);
\r
1168 private string CurrentAttributeValue()
\r
1170 return _text.Substring(_currentattribute._valuestartindex, _currentattribute._valuelength);
\r
1173 private string CurrentNodeInner()
\r
1175 return _text.Substring(_currentnode._innerstartindex, _currentnode._innerlength);
\r
1178 private string CurrentNodeName()
\r
1180 return _text.Substring(_currentnode._namestartindex, _currentnode._namelength);
\r
1183 private string CurrentNodeOuter()
\r
1185 return _text.Substring(_currentnode._outerstartindex, _currentnode._outerlength);
\r
1189 private void DecrementPosition()
\r
1192 if (_lineposition == 1)
\r
1194 _lineposition = _maxlineposition;
\r
1203 private HtmlNode FindResetterNode(HtmlNode node, string name)
\r
1205 HtmlNode resetter = (HtmlNode) _lastnodes[name];
\r
1206 if (resetter == null)
\r
1208 if (resetter.Closed)
\r
1212 if (resetter._streamposition < node._streamposition)
\r
1219 private bool FindResetterNodes(HtmlNode node, string[] names)
\r
1221 if (names == null)
\r
1225 for (int i = 0; i < names.Length; i++)
\r
1227 if (FindResetterNode(node, names[i]) != null)
\r
1235 private void FixNestedTag(string name, string[] resetters)
\r
1237 if (resetters == null)
\r
1242 // if we find a previous unclosed same name node, without a resetter node between, we must close it
\r
1243 prev = (HtmlNode) _lastnodes[name];
\r
1244 if ((prev != null) && (!prev.Closed))
\r
1246 // try to find a resetter node, if found, we do nothing
\r
1247 if (FindResetterNodes(prev, resetters))
\r
1252 // ok we need to close the prev now
\r
1253 // create a fake closer node
\r
1254 HtmlNode close = new HtmlNode(prev.NodeType, this, -1);
\r
1255 close._endnode = close;
\r
1256 prev.CloseNode(close);
\r
1260 private void FixNestedTags()
\r
1262 // we are only interested in start tags, not closing tags
\r
1263 if (!_currentnode._starttag)
\r
1266 string name = CurrentNodeName();
\r
1267 FixNestedTag(name, GetResetters(name));
\r
1270 private string[] GetResetters(string name)
\r
1275 return new string[] {"ul"};
\r
1278 return new string[] {"table"};
\r
1282 return new string[] {"tr", "table"};
\r
1289 private void IncrementPosition()
\r
1291 if (_crc32 != null)
\r
1293 // REVIEW: should we add some checksum code in DecrementPosition too?
\r
1294 _crc32.AddToCRC32(_c);
\r
1298 _maxlineposition = _lineposition;
\r
1301 _lineposition = 1;
\r
1310 private bool NewCheck()
\r
1316 if (_index < _text.Length)
\r
1318 if (_text[_index] == '%')
\r
1322 case ParseState.AttributeAfterEquals:
\r
1323 PushAttributeValueStart(_index - 1);
\r
1326 case ParseState.BetweenAttributes:
\r
1327 PushAttributeNameStart(_index - 1);
\r
1330 case ParseState.WhichTag:
\r
1331 PushNodeNameStart(true, _index - 1);
\r
1332 _state = ParseState.Tag;
\r
1335 _oldstate = _state;
\r
1336 _state = ParseState.ServerSideCode;
\r
1341 if (!PushNodeEnd(_index - 1, true))
\r
1344 _index = _text.Length;
\r
1347 _state = ParseState.WhichTag;
\r
1348 if ((_index - 1) <= (_text.Length - 2))
\r
1350 if (_text[_index] == '!')
\r
1352 PushNodeStart(HtmlNodeType.Comment, _index - 1);
\r
1353 PushNodeNameStart(true, _index);
\r
1354 PushNodeNameEnd(_index + 1);
\r
1355 _state = ParseState.Comment;
\r
1356 if (_index < (_text.Length - 2))
\r
1358 if ((_text[_index + 1] == '-') &&
\r
1359 (_text[_index + 2] == '-'))
\r
1361 _fullcomment = true;
\r
1365 _fullcomment = false;
\r
1371 PushNodeStart(HtmlNodeType.Element, _index - 1);
\r
// Parses the whole of _text with a character-driven state machine (ParseState), creating nodes
// and attributes through the Push* helpers as tags, attributes, comments and text are recognized.
// Per-parse bookkeeping (last-nodes table, parse errors, line position, state) is reset first;
// after the main loop the node still in progress is finalized.
// NOTE(review): only part of this method's statements are visible here (the dispatch on _state and
// several short branch statements are not shown), so the comments below describe visible lines only.
1375 private void Parse()
\r
// compared with the current character in the QuotedAttributeValue state to detect the end of the value
1377 int lastquote = 0;
\r
// a new checksum accumulator is created only when OptionComputeChecksum is set
1378 if (OptionComputeChecksum)
\r
1380 _crc32 = new Crc32();
\r
// reset per-parse bookkeeping
1383 _lastnodes = new Hashtable();
\r
1385 _fullcomment = false;
\r
1386 _parseerrors = new List<HtmlParseError>();
\r
1388 _lineposition = 1;
\r
1389 _maxlineposition = 1;
\r
1391 _state = ParseState.Text;
\r
1392 _oldstate = _state;
\r
// the document node covers the whole text
1393 _documentnode._innerlength = _text.Length;
\r
1394 _documentnode._outerlength = _text.Length;
\r
1395 _remainderOffset = _text.Length;
\r
// parsing starts under the document node, with an initial text node at offset 0
1397 _lastparentnode = _documentnode;
\r
1398 _currentnode = CreateNode(HtmlNodeType.Text, 0);
\r
1399 _currentattribute = null;
\r
1402 PushNodeStart(HtmlNodeType.Text, 0);
\r
// main loop: one iteration per character; _c holds the character read at _index
1403 while (_index < _text.Length)
\r
1405 _c = _text[_index];
\r
// presumably advances _index together with the line tracking, since the branches below
// address the character just read as _index - 1 - confirm against IncrementPosition
1406 IncrementPosition();
\r
1410 case ParseState.Text:
\r
// the node name start is recorded either as a non-start tag at _index or as a start tag
// at _index - 1 (the latter followed by DecrementPosition); the Tag state then reads the name
1415 case ParseState.WhichTag:
\r
1420 PushNodeNameStart(false, _index);
\r
1424 PushNodeNameStart(true, _index - 1);
\r
1425 DecrementPosition();
\r
1427 _state = ParseState.Tag;
\r
// reading the tag name: each branch ends the name at _index - 1, then moves to
// BetweenAttributes (after whitespace), to EmptyTag, or ends the node and resumes Text
1430 case ParseState.Tag:
\r
1433 if (IsWhiteSpace(_c))
\r
1435 PushNodeNameEnd(_index - 1);
\r
1436 if (_state != ParseState.Tag)
\r
1438 _state = ParseState.BetweenAttributes;
\r
1443 PushNodeNameEnd(_index - 1);
\r
1444 if (_state != ParseState.Tag)
\r
1446 _state = ParseState.EmptyTag;
\r
1451 PushNodeNameEnd(_index - 1);
\r
1452 if (_state != ParseState.Tag)
\r
// when PushNodeEnd returns false the index is moved to the end of the text,
// which makes the main loop condition fail (same pattern in the states below)
1454 if (!PushNodeEnd(_index, false))
\r
1457 _index = _text.Length;
\r
1460 if (_state != ParseState.Tag)
\r
1462 _state = ParseState.Text;
\r
1463 PushNodeStart(HtmlNodeType.Text, _index);
\r
// between attributes: '/' or '?' leads to EmptyTag; otherwise the node may end here
// or a new attribute name starts at _index - 1
1467 case ParseState.BetweenAttributes:
\r
1471 if (IsWhiteSpace(_c))
\r
1474 if ((_c == '/') || (_c == '?'))
\r
1476 _state = ParseState.EmptyTag;
\r
1482 if (!PushNodeEnd(_index, false))
\r
1485 _index = _text.Length;
\r
1489 if (_state != ParseState.BetweenAttributes)
\r
1491 _state = ParseState.Text;
\r
1492 PushNodeStart(HtmlNodeType.Text, _index);
\r
1496 PushAttributeNameStart(_index - 1);
\r
1497 _state = ParseState.AttributeName;
\r
// the node is ended with close == true, then text parsing resumes;
// the other visible path goes back to BetweenAttributes
1500 case ParseState.EmptyTag:
\r
1506 if (!PushNodeEnd(_index, true))
\r
1509 _index = _text.Length;
\r
1513 if (_state != ParseState.EmptyTag)
\r
1515 _state = ParseState.Text;
\r
1516 PushNodeStart(HtmlNodeType.Text, _index);
\r
1519 _state = ParseState.BetweenAttributes;
\r
// reading an attribute name: the name ends at _index - 1, followed by AttributeBeforeEquals
// (after whitespace), AttributeAfterEquals, or the end of the node
1522 case ParseState.AttributeName:
\r
1526 if (IsWhiteSpace(_c))
\r
1528 PushAttributeNameEnd(_index - 1);
\r
1529 _state = ParseState.AttributeBeforeEquals;
\r
1534 PushAttributeNameEnd(_index - 1);
\r
1535 _state = ParseState.AttributeAfterEquals;
\r
1540 PushAttributeNameEnd(_index - 1);
\r
1541 if (!PushNodeEnd(_index, false))
\r
1544 _index = _text.Length;
\r
1547 if (_state != ParseState.AttributeName)
\r
1549 _state = ParseState.Text;
\r
1550 PushNodeStart(HtmlNodeType.Text, _index);
\r
// after an attribute name, before a possible equals sign
1555 case ParseState.AttributeBeforeEquals:
\r
1559 if (IsWhiteSpace(_c))
\r
1563 if (!PushNodeEnd(_index, false))
\r
1566 _index = _text.Length;
\r
1569 if (_state != ParseState.AttributeBeforeEquals)
\r
1571 _state = ParseState.Text;
\r
1572 PushNodeStart(HtmlNodeType.Text, _index);
\r
1577 _state = ParseState.AttributeAfterEquals;
\r
1580 // no equals, no whitespace, it's a new attribute starting
\r
1581 _state = ParseState.BetweenAttributes;
\r
// step back so the character is seen again in the BetweenAttributes state
1582 DecrementPosition();
\r
// after the equals sign: a single or double quote starts a quoted value at _index,
// anything else that does not end the node starts an unquoted value at _index - 1
1585 case ParseState.AttributeAfterEquals:
\r
1589 if (IsWhiteSpace(_c))
\r
1592 if ((_c == '\'') || (_c == '"'))
\r
1594 _state = ParseState.QuotedAttributeValue;
\r
1595 PushAttributeValueStart(_index, _c);
\r
1601 if (!PushNodeEnd(_index, false))
\r
1604 _index = _text.Length;
\r
1607 if (_state != ParseState.AttributeAfterEquals)
\r
1609 _state = ParseState.Text;
\r
1610 PushNodeStart(HtmlNodeType.Text, _index);
\r
1613 PushAttributeValueStart(_index - 1);
\r
1614 _state = ParseState.AttributeValue;
\r
// unquoted attribute value: ends at _index - 1 on whitespace or at the end of the node
1617 case ParseState.AttributeValue:
\r
1621 if (IsWhiteSpace(_c))
\r
1623 PushAttributeValueEnd(_index - 1);
\r
1624 _state = ParseState.BetweenAttributes;
\r
1630 PushAttributeValueEnd(_index - 1);
\r
1631 if (!PushNodeEnd(_index, false))
\r
1634 _index = _text.Length;
\r
1637 if (_state != ParseState.AttributeValue)
\r
1639 _state = ParseState.Text;
\r
1640 PushNodeStart(HtmlNodeType.Text, _index);
\r
// quoted attribute value: ends when the current character equals lastquote
1645 case ParseState.QuotedAttributeValue:
\r
1646 if (_c == lastquote)
\r
1648 PushAttributeValueEnd(_index - 1);
\r
1649 _state = ParseState.BetweenAttributes;
\r
// a '%' at _index switches to ServerSideCode; the state to come back to is kept in _oldstate
1654 if (_index < _text.Length)
\r
1656 if (_text[_index] == '%')
\r
1658 _oldstate = _state;
\r
1659 _state = ParseState.ServerSideCode;
\r
// inside a comment: the two characters before the current one are tested for '-'
// (presumably only for full comments, see _fullcomment - confirm) before the comment
// node is ended and text parsing resumes
1666 case ParseState.Comment:
\r
1671 if ((_text[_index - 2] != '-') ||
\r
1672 (_text[_index - 3] != '-'))
\r
1677 if (!PushNodeEnd(_index, false))
\r
1680 _index = _text.Length;
\r
1683 _state = ParseState.Text;
\r
1684 PushNodeStart(HtmlNodeType.Text, _index);
\r
// inside server-side code: on '>' at _index the next state is chosen from _oldstate
1689 case ParseState.ServerSideCode:
\r
1692 if (_index < _text.Length)
\r
1694 if (_text[_index] == '>')
\r
1696 switch (_oldstate)
\r
1698 case ParseState.AttributeAfterEquals:
\r
1699 _state = ParseState.AttributeValue;
\r
1702 case ParseState.BetweenAttributes:
\r
1703 PushAttributeNameEnd(_index + 1);
\r
1704 _state = ParseState.BetweenAttributes;
\r
1708 _state = _oldstate;
\r
1711 IncrementPosition();
\r
// raw content of the current node: nothing is interpreted until the matching closing tag
1717 case ParseState.PcData:
\r
1718 // look for </tag + 1 char
\r
1720 // check buffer end
\r
1721 if ((_currentnode._namelength + 3) <= (_text.Length - (_index - 1)))
\r
// case-insensitive match of "</" followed by the current node name
1723 if (string.Compare(_text.Substring(_index - 1, _currentnode._namelength + 2),
\r
1724 "</" + _currentnode.Name, true) == 0)
\r
// the character right after the name must be '>' or whitespace
1726 int c = _text[_index - 1 + 2 + _currentnode.Name.Length];
\r
1727 if ((c == '>') || (IsWhiteSpace(c)))
\r
1729 // add the script as a text node
\r
// the text node starts right after the current node's outer range and ends before the closing tag
1730 HtmlNode script = CreateNode(HtmlNodeType.Text,
\r
1731 _currentnode._outerstartindex +
\r
1732 _currentnode._outerlength);
\r
1733 script._outerlength = _index - 1 - script._outerstartindex;
\r
1734 _currentnode.AppendChild(script);
\r
// the closing tag itself is then parsed as an element node, name starting after "</"
1737 PushNodeStart(HtmlNodeType.Element, _index - 1);
\r
1738 PushNodeNameStart(false, _index - 1 + 2);
\r
1739 _state = ParseState.Tag;
\r
1740 IncrementPosition();
\r
1748 // finish the current work
\r
// a node whose name start was recorded gets its name ended at the final index
1749 if (_currentnode._namestartindex > 0)
\r
1751 PushNodeNameEnd(_index);
\r
1753 PushNodeEnd(_index, false);
\r
1755 // we don't need this anymore
\r
1756 _lastnodes.Clear();
\r
/// <summary>
/// Completes the name of the attribute currently being parsed and appends that
/// attribute to the attribute collection of the current node.
/// </summary>
/// <param name="index">Index in the document text taken as the end of the attribute name.</param>
private void PushAttributeNameEnd(int index)
{
    HtmlAttribute attribute = _currentattribute;

    // the name length is measured from the start index recorded when the name began
    int nameStart = attribute._namestartindex;
    attribute._namelength = index - nameStart;

    _currentnode.Attributes.Append(attribute);
}
\r
/// <summary>
/// Creates a new attribute, makes it the current attribute, and records where its
/// name starts together with the current line, line position and stream position.
/// </summary>
/// <param name="index">Index in the document text where the attribute name starts.</param>
private void PushAttributeNameStart(int index)
{
    HtmlAttribute attribute = CreateAttribute();
    _currentattribute = attribute;

    attribute._namestartindex = index;
    attribute.Line = _line;
    attribute._lineposition = _lineposition;
    attribute._streamposition = index;
}
\r
/// <summary>
/// Records the length of the current attribute's value, measured from the
/// value start index that was stored earlier.
/// </summary>
/// <param name="index">Index in the document text taken as the end of the attribute value.</param>
private void PushAttributeValueEnd(int index)
{
    HtmlAttribute attribute = _currentattribute;
    int valueStart = attribute._valuestartindex;
    attribute._valuelength = index - valueStart;
}
\r
/// <summary>
/// Records where the value of the current attribute starts, passing 0 as the
/// quote character to the two-argument overload.
/// </summary>
/// <param name="index">Index in the document text where the attribute value starts.</param>
private void PushAttributeValueStart(int index)
{
    const int noQuote = 0;
    PushAttributeValueStart(index, noQuote);
}
\r
/// <summary>
/// Records where the value of the current attribute starts and, when the value
/// is introduced by a single quote, marks the attribute as single-quoted.
/// </summary>
/// <param name="index">Index in the document text where the attribute value starts.</param>
/// <param name="quote">Character code of the opening quote; any value other than a single quote leaves the quote type untouched.</param>
private void PushAttributeValueStart(int index, int quote)
{
    _currentattribute._valuestartindex = index;

    if (quote != '\'')
    {
        return;
    }

    _currentattribute.QuoteType = AttributeValueQuote.SingleQuote;
}
\r
// Finalizes the node currently being parsed: its outer length is computed from index, the node
// is attached to the last parent node where applicable, and it is closed when requested.
// index: position in the document text taken as the end of the node.
// close: when true the node is closed here (see the final condition).
// Returns false when parsing must stop because the stopper node (OptionStopperNodeName) was reached.
// NOTE(review): the other return statements and a few short branch statements are not visible
// here; presumably the method returns true otherwise - confirm.
1791 private bool PushNodeEnd(int index, bool close)
\r
1793 _currentnode._outerlength = index - _currentnode._outerstartindex;
\r
// text and comment nodes: the inner range is identical to the outer range
1795 if ((_currentnode._nodetype == HtmlNodeType.Text) ||
\r
1796 (_currentnode._nodetype == HtmlNodeType.Comment))
\r
1798 // forget about void nodes
\r
1799 if (_currentnode._outerlength > 0)
\r
1801 _currentnode._innerlength = _currentnode._outerlength;
\r
1802 _currentnode._innerstartindex = _currentnode._outerstartindex;
\r
1803 if (_lastparentnode != null)
\r
1805 _lastparentnode.AppendChild(_currentnode);
\r
// a start tag that is not itself the last parent node
1811 if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
\r
1813 // add to parent node
\r
1814 if (_lastparentnode != null)
\r
1816 _lastparentnode.AppendChild(_currentnode);
\r
// the node is inspected for a charset declaration once it has been attached
1819 ReadDocumentEncoding(_currentnode);
\r
1821 // remember last node of this kind
\r
// link to the previous node with the same name, then make this node the latest one for that name
1822 HtmlNode prev = (HtmlNode) _lastnodes[_currentnode.Name];
\r
1823 _currentnode._prevwithsamename = prev;
\r
1824 _lastnodes[_currentnode.Name] = _currentnode;
\r
// document and element nodes become the parent of what follows
1827 if ((_currentnode.NodeType == HtmlNodeType.Document) ||
\r
1828 (_currentnode.NodeType == HtmlNodeType.Element))
\r
1830 _lastparentnode = _currentnode;
\r
// CDATA-style elements switch the parser to the PcData state
1833 if (HtmlNode.IsCDataElement(CurrentNodeName()))
\r
1835 _state = ParseState.PcData;
\r
// NOTE(review): the statement guarded by this condition is not visible here;
// presumably closed/empty elements are forced to close - confirm
1839 if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
\r
1840 (HtmlNode.IsEmptyElement(_currentnode.Name)))
\r
// closing is performed for an explicit close request or for a node that is not a start tag
1847 if ((close) || (!_currentnode._starttag))
\r
// stopper node reached for the first time (case-insensitive name match, no remainder stored yet)
1849 if ((OptionStopperNodeName != null) && (_remainder == null) &&
\r
1850 (string.Compare(_currentnode.Name, OptionStopperNodeName, true) == 0))
\r
// keep the text from index onward as the remainder
1852 _remainderOffset = index;
\r
1853 _remainder = _text.Substring(_remainderOffset);
\r
1854 CloseCurrentNode();
\r
1855 return false; // stop parsing
\r
1857 CloseCurrentNode();
\r
// Records the length of the current node's name, measured from the name start index
// stored earlier up to index.
// index: position in the document text taken as the end of the node name.
1862 private void PushNodeNameEnd(int index)
\r
1864 _currentnode._namelength = index - _currentnode._namestartindex;
\r
// NOTE(review): the statement guarded by this condition is not visible here; presumably
// it runs the nested-tag fix-up when OptionFixNestedTags is enabled - confirm
1865 if (OptionFixNestedTags)
\r
/// <summary>
/// Stores on the current node whether it is a start tag and where its name begins.
/// </summary>
/// <param name="starttag">Value written to the current node's start-tag flag.</param>
/// <param name="index">Index in the document text where the node name starts.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    HtmlNode node = _currentnode;
    node._starttag = starttag;
    node._namestartindex = index;
}
\r
/// <summary>
/// Creates a node of the given type at the given index, makes it the current node,
/// and stamps it with the current line, line position and stream position.
/// </summary>
/// <param name="type">Type of the node to create.</param>
/// <param name="index">Index in the document text where the node starts.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    HtmlNode node = CreateNode(type, index);
    _currentnode = node;

    node._line = _line;

    // element nodes are recorded one position earlier than the current line position
    bool isElement = type == HtmlNodeType.Element;
    node._lineposition = isElement ? _lineposition - 1 : _lineposition;

    node._streamposition = index;
}
\r
/// <summary>
/// Looks at a parsed node for a charset declaration of the form
/// <c>&lt;meta http-equiv="content-type" content="text/html;charset=iso-8859-1" /&gt;</c>
/// and, when a usable one is found, stores it as the declared encoding of the document.
/// A charset name that the platform does not recognize is ignored instead of aborting the parse.
/// </summary>
/// <param name="node">
/// The node to inspect. It is read at node end, so its attributes are already populated.
/// </param>
/// <exception cref="EncodingFoundException">
/// Thrown when only encoding detection was requested and a usable charset declaration is found.
/// </exception>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
    {
        return;
    }

    // quick length check first: avoids allocating the name string for most nodes
    if (node._namelength != 4)
    {
        return;
    }

    if (node.Name != "meta") // all node names are lowercase
    {
        return;
    }

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
    {
        return;
    }

    if (string.Compare(att.Value, "content-type", true) != 0)
    {
        return;
    }

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
    {
        return;
    }

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
    {
        return;
    }

    charset = charset.Trim();
    if (charset.Length == 0)
    {
        return;
    }

    Encoding declared;
    try
    {
        declared = Encoding.GetEncoding(charset);
    }
    catch (System.ArgumentException)
    {
        // The markup names a charset that is invalid or unsupported on this platform.
        // A bad declaration in the document must not abort parsing, so it is ignored
        // and the previously known declared encoding (if any) is left untouched.
        return;
    }

    _declaredencoding = declared;
    if (_onlyDetectEncoding)
    {
        // detection-only mode: the result is handed back to the caller through the exception
        throw new EncodingFoundException(_declaredencoding);
    }

    if (_streamencoding == null)
    {
        return;
    }

    if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
    {
        // NOTE(review): the name of the call target was not visible in the reviewed text;
        // AddError is assumed from the argument list - confirm against the rest of the class.
        AddError(
            HtmlParseErrorCode.CharsetMismatch,
            _line, _lineposition,
            _index, node.OuterHtml,
            "Encoding mismatch between StreamEncoding: " +
            _streamencoding.WebName + " and DeclaredEncoding: " +
            _declaredencoding.WebName);
    }
}
\r
1941 #region Nested type: ParseState
\r
1943 private enum ParseState
\r
1948 BetweenAttributes,
\r
1951 AttributeBeforeEquals,
\r
1952 AttributeAfterEquals,
\r
1955 QuotedAttributeValue,
\r