// HtmlAgilityPack V1.0 - Simon Mourier
using System;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Xml;
using System.Xml.XPath;

namespace HtmlAgilityPack
{
    /// <summary>
    /// Represents an HTML navigator on an HTML document seen as a data store.
    /// </summary>
    /// <remarks>
    /// The navigator position is the pair (<c>_currentnode</c>, <c>_attindex</c>):
    /// <c>_attindex</c> is -1 when positioned on the node itself, otherwise it is the index
    /// of the current attribute inside <c>_currentnode.Attributes</c>.
    /// </remarks>
    public class HtmlNodeNavigator : XPathNavigator
    {
        #region Fields

        // Index of the current attribute of _currentnode, or -1 when not positioned on an attribute.
        private int _attindex;
        private HtmlNode _currentnode;

        // Assigned by every constructor. They are intentionally NOT initialized inline so that
        // Clone() and the (doc, node) constructor do not allocate objects they immediately discard.
        private readonly HtmlDocument _doc;
        private readonly HtmlNameTable _nametable;

        internal bool Trace;

        #endregion

        #region Constructors

        /// <summary>
        /// Initializes a navigator over a new, empty document, positioned on its document node.
        /// </summary>
        internal HtmlNodeNavigator()
        {
            _doc = new HtmlDocument();
            _nametable = new HtmlNameTable();
            Reset();
        }

        /// <summary>
        /// Initializes a navigator over an existing document, positioned on the given node.
        /// </summary>
        /// <param name="doc">The document that owns <paramref name="currentNode"/>.</param>
        /// <param name="currentNode">The node to position the navigator on.</param>
        /// <exception cref="ArgumentNullException"><paramref name="currentNode"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="currentNode"/> is not owned by <paramref name="doc"/>.</exception>
        internal HtmlNodeNavigator(HtmlDocument doc, HtmlNode currentNode)
        {
            if (currentNode == null)
            {
                throw new ArgumentNullException("currentNode");
            }
            if (currentNode.OwnerDocument != doc)
            {
                throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);
            }
            InternalTrace(null);

            _doc = doc;
            _nametable = new HtmlNameTable();
            Reset();
            _currentnode = currentNode;
        }

        /// <summary>
        /// Copy constructor used by <see cref="Clone"/>: shares the document and name table
        /// of <paramref name="nav"/> and copies its position.
        /// </summary>
        /// <param name="nav">The navigator to copy.</param>
        /// <exception cref="ArgumentNullException"><paramref name="nav"/> is null.</exception>
        private HtmlNodeNavigator(HtmlNodeNavigator nav)
        {
            if (nav == null)
            {
                throw new ArgumentNullException("nav");
            }
            InternalTrace(null);

            _doc = nav._doc;
            _currentnode = nav._currentnode;
            _attindex = nav._attindex;
            _nametable = nav._nametable; // REVIEW: should we do this?
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        public HtmlNodeNavigator(Stream stream)
            : this()
        {
            _doc.Load(stream);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
        public HtmlNodeNavigator(Stream stream, bool detectEncodingFromByteOrderMarks)
            : this()
        {
            _doc.Load(stream, detectEncodingFromByteOrderMarks);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        public HtmlNodeNavigator(Stream stream, Encoding encoding)
            : this()
        {
            _doc.Load(stream, encoding);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
        public HtmlNodeNavigator(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
            : this()
        {
            _doc.Load(stream, encoding, detectEncodingFromByteOrderMarks);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
        /// <param name="buffersize">The minimum buffer size.</param>
        public HtmlNodeNavigator(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
            : this()
        {
            _doc.Load(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a TextReader.
        /// </summary>
        /// <param name="reader">The TextReader used to feed the HTML data into the document.</param>
        public HtmlNodeNavigator(TextReader reader)
            : this()
        {
            _doc.Load(reader);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        public HtmlNodeNavigator(string path)
            : this()
        {
            _doc.Load(path);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        public HtmlNodeNavigator(string path, bool detectEncodingFromByteOrderMarks)
            : this()
        {
            _doc.Load(path, detectEncodingFromByteOrderMarks);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        public HtmlNodeNavigator(string path, Encoding encoding)
            : this()
        {
            _doc.Load(path, encoding);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        public HtmlNodeNavigator(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
            : this()
        {
            _doc.Load(path, encoding, detectEncodingFromByteOrderMarks);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        /// <param name="buffersize">The minimum buffer size.</param>
        public HtmlNodeNavigator(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
            : this()
        {
            _doc.Load(path, encoding, detectEncodingFromByteOrderMarks, buffersize);
            Reset();
        }

        #endregion

        #region Properties

        /// <summary>
        /// Gets the base URI for the current node.
        /// Always returns string.Empty in the case of HtmlNavigator implementation.
        /// </summary>
        public override string BaseURI
        {
            get
            {
                InternalTrace(">");
                return _nametable.GetOrAdd(string.Empty);
            }
        }

        /// <summary>
        /// Gets the current HTML document.
        /// </summary>
        public HtmlDocument CurrentDocument
        {
            get { return _doc; }
        }

        /// <summary>
        /// Gets the current HTML node. When the navigator is positioned on an attribute,
        /// this is the node that carries the attribute.
        /// </summary>
        public HtmlNode CurrentNode
        {
            get { return _currentnode; }
        }

        /// <summary>
        /// Gets a value indicating whether the current node has attributes.
        /// </summary>
        public override bool HasAttributes
        {
            get
            {
                InternalTrace(">" + (_currentnode.Attributes.Count > 0));
                return (_currentnode.Attributes.Count > 0);
            }
        }

        /// <summary>
        /// Gets a value indicating whether the current node has child nodes.
        /// </summary>
        public override bool HasChildren
        {
            get
            {
                InternalTrace(">" + (_currentnode.ChildNodes.Count > 0));
                return (_currentnode.ChildNodes.Count > 0);
            }
        }

        /// <summary>
        /// Gets a value indicating whether the current node is an empty element.
        /// </summary>
        public override bool IsEmptyElement
        {
            get
            {
                InternalTrace(">" + !HasChildren);
                // REVIEW: is this ok?
                return !HasChildren;
            }
        }

        /// <summary>
        /// Gets the name of the current HTML node without the namespace prefix.
        /// Returns the attribute name when positioned on an attribute.
        /// </summary>
        public override string LocalName
        {
            get
            {
                if (_attindex != -1)
                {
                    InternalTrace("att>" + _currentnode.Attributes[_attindex].Name);
                    return _nametable.GetOrAdd(_currentnode.Attributes[_attindex].Name);
                }
                InternalTrace("node>" + _currentnode.Name);
                return _nametable.GetOrAdd(_currentnode.Name);
            }
        }

        /// <summary>
        /// Gets the qualified name of the current node.
        /// Returns the attribute name when positioned on an attribute, consistently with
        /// <see cref="LocalName"/> (no prefixes are exposed by this implementation).
        /// </summary>
        public override string Name
        {
            get
            {
                if (_attindex != -1)
                {
                    InternalTrace("att>" + _currentnode.Attributes[_attindex].Name);
                    return _nametable.GetOrAdd(_currentnode.Attributes[_attindex].Name);
                }
                InternalTrace(">" + _currentnode.Name);
                return _nametable.GetOrAdd(_currentnode.Name);
            }
        }

        /// <summary>
        /// Gets the namespace URI (as defined in the W3C Namespace Specification) of the current node.
        /// Always returns string.Empty in the case of HtmlNavigator implementation.
        /// </summary>
        public override string NamespaceURI
        {
            get
            {
                InternalTrace(">");
                return _nametable.GetOrAdd(string.Empty);
            }
        }

        /// <summary>
        /// Gets the <see cref="XmlNameTable"/> associated with this implementation.
        /// </summary>
        public override XmlNameTable NameTable
        {
            get
            {
                InternalTrace(null);
                return _nametable;
            }
        }

        /// <summary>
        /// Gets the type of the current node.
        /// </summary>
        /// <exception cref="NotImplementedException">The current node has an unhandled HtmlNodeType.</exception>
        public override XPathNodeType NodeType
        {
            get
            {
                switch (_currentnode.NodeType)
                {
                    case HtmlNodeType.Comment:
                        InternalTrace(">" + XPathNodeType.Comment);
                        return XPathNodeType.Comment;

                    case HtmlNodeType.Document:
                        InternalTrace(">" + XPathNodeType.Root);
                        return XPathNodeType.Root;

                    case HtmlNodeType.Text:
                        InternalTrace(">" + XPathNodeType.Text);
                        return XPathNodeType.Text;

                    case HtmlNodeType.Element:
                        {
                            if (_attindex != -1)
                            {
                                InternalTrace(">" + XPathNodeType.Attribute);
                                return XPathNodeType.Attribute;
                            }
                            InternalTrace(">" + XPathNodeType.Element);
                            return XPathNodeType.Element;
                        }

                    default:
                        throw new NotImplementedException("Internal error: Unhandled HtmlNodeType: " +
                                                          _currentnode.NodeType);
                }
            }
        }

        /// <summary>
        /// Gets the prefix associated with the current node.
        /// Always returns string.Empty in the case of HtmlNavigator implementation.
        /// </summary>
        public override string Prefix
        {
            get
            {
                InternalTrace(null);
                return _nametable.GetOrAdd(string.Empty);
            }
        }

        /// <summary>
        /// Gets the text value of the current node: the comment text, the text node content,
        /// the attribute value when positioned on an attribute, or the inner text of an element.
        /// Returns an empty string for the document node.
        /// </summary>
        /// <exception cref="NotImplementedException">The current node has an unhandled HtmlNodeType.</exception>
        public override string Value
        {
            get
            {
                InternalTrace("nt=" + _currentnode.NodeType);
                switch (_currentnode.NodeType)
                {
                    case HtmlNodeType.Comment:
                        InternalTrace(">" + ((HtmlCommentNode) _currentnode).Comment);
                        return ((HtmlCommentNode) _currentnode).Comment;

                    case HtmlNodeType.Document:
                        InternalTrace(">");
                        return "";

                    case HtmlNodeType.Text:
                        InternalTrace(">" + ((HtmlTextNode) _currentnode).Text);
                        return ((HtmlTextNode) _currentnode).Text;

                    case HtmlNodeType.Element:
                        {
                            if (_attindex != -1)
                            {
                                InternalTrace(">" + _currentnode.Attributes[_attindex].Value);
                                return _currentnode.Attributes[_attindex].Value;
                            }
                            return _currentnode.InnerText;
                        }

                    default:
                        throw new NotImplementedException("Internal error: Unhandled HtmlNodeType: " +
                                                          _currentnode.NodeType);
                }
            }
        }

        /// <summary>
        /// Gets the xml:lang scope for the current node.
        /// Always returns string.Empty in the case of HtmlNavigator implementation.
        /// </summary>
        public override string XmlLang
        {
            get
            {
                InternalTrace(null);
                return _nametable.GetOrAdd(string.Empty);
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Creates a new HtmlNavigator positioned at the same node as this HtmlNavigator.
        /// </summary>
        /// <returns>A new HtmlNavigator object positioned at the same node as the original HtmlNavigator.</returns>
        public override XPathNavigator Clone()
        {
            InternalTrace(null);
            return new HtmlNodeNavigator(this);
        }

        /// <summary>
        /// Gets the value of the HTML attribute with the specified LocalName and NamespaceURI.
        /// </summary>
        /// <param name="localName">The local name of the HTML attribute.</param>
        /// <param name="namespaceURI">The namespace URI of the attribute. Unsupported with the HtmlNavigator implementation.</param>
        /// <returns>The value of the specified HTML attribute, or null if a matching attribute is not found on the current node.</returns>
        public override string GetAttribute(string localName, string namespaceURI)
        {
            InternalTrace("localName=" + localName + ", namespaceURI=" + namespaceURI);
            HtmlAttribute att = _currentnode.Attributes[localName];
            if (att == null)
            {
                InternalTrace(">null");
                return null;
            }
            InternalTrace(">" + att.Value);
            return att.Value;
        }

        /// <summary>
        /// Returns the value of the namespace node corresponding to the specified local name.
        /// Always returns string.Empty for the HtmlNavigator implementation.
        /// </summary>
        /// <param name="name">The local name of the namespace node.</param>
        /// <returns>Always returns string.Empty for the HtmlNavigator implementation.</returns>
        public override string GetNamespace(string name)
        {
            InternalTrace("name=" + name);
            return string.Empty;
        }

        /// <summary>
        /// Determines whether the current HtmlNavigator is at the same position as the specified HtmlNavigator.
        /// </summary>
        /// <param name="other">The HtmlNavigator that you want to compare against.</param>
        /// <returns>true if the two navigators have the same position, otherwise, false.</returns>
        public override bool IsSamePosition(XPathNavigator other)
        {
            HtmlNodeNavigator nav = other as HtmlNodeNavigator;
            if (nav == null)
            {
                InternalTrace(">false");
                return false;
            }

            // A position is (node, attribute index): an element and one of its attributes,
            // or two different attributes of the same element, are distinct positions.
            bool same = (nav._currentnode == _currentnode) && (nav._attindex == _attindex);
            InternalTrace(">" + same);
            return same;
        }

        /// <summary>
        /// Moves to the same position as the specified HtmlNavigator.
        /// </summary>
        /// <param name="other">The HtmlNavigator positioned on the node that you want to move to.</param>
        /// <returns>true if successful, otherwise false. If false, the position of the navigator is unchanged.</returns>
        public override bool MoveTo(XPathNavigator other)
        {
            HtmlNodeNavigator nav = other as HtmlNodeNavigator;
            if (nav == null)
            {
                InternalTrace(">false (nav is not an HtmlNodeNavigator)");
                return false;
            }
            InternalTrace("moveto oid=" + nav.GetHashCode()
                          + ", n:" + nav._currentnode.Name
                          + ", a:" + nav._attindex);

            if (nav._doc == _doc)
            {
                _currentnode = nav._currentnode;
                _attindex = nav._attindex;
                InternalTrace(">true");
                return true;
            }
            // we don't know how to handle that
            InternalTrace(">false (???)");
            return false;
        }

        /// <summary>
        /// Moves to the HTML attribute with matching LocalName and NamespaceURI.
        /// </summary>
        /// <param name="localName">The local name of the HTML attribute.</param>
        /// <param name="namespaceURI">The namespace URI of the attribute. Unsupported with the HtmlNavigator implementation.</param>
        /// <returns>true if the HTML attribute is found, otherwise, false. If false, the position of the navigator does not change.</returns>
        public override bool MoveToAttribute(string localName, string namespaceURI)
        {
            InternalTrace("localName=" + localName + ", namespaceURI=" + namespaceURI);
            int index = _currentnode.Attributes.GetAttributeIndex(localName);
            if (index == -1)
            {
                InternalTrace(">false");
                return false;
            }
            _attindex = index;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the first sibling of the current node.
        /// </summary>
        /// <returns>true if the navigator is successful moving to the first sibling node, false if there is no first sibling or if the navigator is currently positioned on an attribute node.</returns>
        public override bool MoveToFirst()
        {
            // Attributes have no siblings; moving would leave a stale attribute index on another node.
            if (_attindex != -1)
            {
                InternalTrace(">false");
                return false;
            }
            if (_currentnode.ParentNode == null)
            {
                InternalTrace(">false");
                return false;
            }
            if (_currentnode.ParentNode.FirstChild == null)
            {
                InternalTrace(">false");
                return false;
            }
            _currentnode = _currentnode.ParentNode.FirstChild;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the first HTML attribute.
        /// </summary>
        /// <returns>true if the navigator is successful moving to the first HTML attribute, otherwise, false.</returns>
        public override bool MoveToFirstAttribute()
        {
            if (!HasAttributes)
            {
                InternalTrace(">false");
                return false;
            }
            _attindex = 0;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the first child of the current node.
        /// </summary>
        /// <returns>true if there is a first child node, otherwise false. Always false when positioned on an attribute.</returns>
        public override bool MoveToFirstChild()
        {
            // An attribute has no children; moving would leave a stale attribute index on the child.
            if (_attindex != -1)
            {
                InternalTrace(">false");
                return false;
            }
            if (!_currentnode.HasChildNodes)
            {
                InternalTrace(">false");
                return false;
            }
            _currentnode = _currentnode.ChildNodes[0];
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves the XPathNavigator to the first namespace node of the current element.
        /// Always returns false for the HtmlNavigator implementation.
        /// </summary>
        /// <param name="scope">An XPathNamespaceScope value describing the namespace scope.</param>
        /// <returns>Always returns false for the HtmlNavigator implementation.</returns>
        public override bool MoveToFirstNamespace(XPathNamespaceScope scope)
        {
            InternalTrace(null);
            return false;
        }

        /// <summary>
        /// Moves to the node that has an attribute of type ID whose value matches the specified string.
        /// </summary>
        /// <param name="id">A string representing the ID value of the node to which you want to move. This argument does not need to be atomized.</param>
        /// <returns>true if the move was successful, otherwise false. If false, the position of the navigator is unchanged.</returns>
        public override bool MoveToId(string id)
        {
            InternalTrace("id=" + id);
            HtmlNode node = _doc.GetElementbyId(id);
            if (node == null)
            {
                InternalTrace(">false");
                return false;
            }
            _currentnode = node;
            // The target is the element itself, not one of its attributes.
            _attindex = -1;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves the XPathNavigator to the namespace node with the specified local name.
        /// Always returns false for the HtmlNavigator implementation.
        /// </summary>
        /// <param name="name">The local name of the namespace node.</param>
        /// <returns>Always returns false for the HtmlNavigator implementation.</returns>
        public override bool MoveToNamespace(string name)
        {
            InternalTrace("name=" + name);
            return false;
        }

        /// <summary>
        /// Moves to the next sibling of the current node.
        /// </summary>
        /// <returns>true if the navigator is successful moving to the next sibling node, false if there are no more siblings or if the navigator is currently positioned on an attribute node. If false, the position of the navigator is unchanged.</returns>
        public override bool MoveToNext()
        {
            // Attributes have no siblings; moving would leave a stale attribute index on another node.
            if (_attindex != -1)
            {
                InternalTrace(">false");
                return false;
            }
            if (_currentnode.NextSibling == null)
            {
                InternalTrace(">false");
                return false;
            }

            // Building the trace text clones two nodes; only pay for that when tracing is actually on.
            if (Trace)
            {
                InternalTrace("_c=" + _currentnode.CloneNode(false).OuterHtml);
                InternalTrace("_n=" + _currentnode.NextSibling.CloneNode(false).OuterHtml);
            }

            _currentnode = _currentnode.NextSibling;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the next HTML attribute.
        /// </summary>
        /// <returns>true if the attribute index was advanced, false if the current attribute index is already the last one of the current node.</returns>
        public override bool MoveToNextAttribute()
        {
            InternalTrace(null);
            if (_attindex >= (_currentnode.Attributes.Count - 1))
            {
                InternalTrace(">false");
                return false;
            }
            _attindex++;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves the XPathNavigator to the next namespace node.
        /// Always returns false for the HtmlNavigator implementation.
        /// </summary>
        /// <param name="scope">An XPathNamespaceScope value describing the namespace scope.</param>
        /// <returns>Always returns false for the HtmlNavigator implementation.</returns>
        public override bool MoveToNextNamespace(XPathNamespaceScope scope)
        {
            InternalTrace(null);
            return false;
        }

        /// <summary>
        /// Moves to the parent of the current node. When positioned on an attribute,
        /// moves to the element that carries the attribute.
        /// </summary>
        /// <returns>true if there is a parent node, otherwise false.</returns>
        public override bool MoveToParent()
        {
            if (_attindex != -1)
            {
                // The parent of an attribute is its owning element, which is _currentnode.
                _attindex = -1;
                InternalTrace(">true");
                return true;
            }
            if (_currentnode.ParentNode == null)
            {
                InternalTrace(">false");
                return false;
            }
            _currentnode = _currentnode.ParentNode;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the previous sibling of the current node.
        /// </summary>
        /// <returns>true if the navigator is successful moving to the previous sibling node, false if there is no previous sibling or if the navigator is currently positioned on an attribute node.</returns>
        public override bool MoveToPrevious()
        {
            // Attributes have no siblings; moving would leave a stale attribute index on another node.
            if (_attindex != -1)
            {
                InternalTrace(">false");
                return false;
            }
            if (_currentnode.PreviousSibling == null)
            {
                InternalTrace(">false");
                return false;
            }
            _currentnode = _currentnode.PreviousSibling;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the root node to which the current node belongs.
        /// </summary>
        public override void MoveToRoot()
        {
            _currentnode = _doc.DocumentNode;
            // The document node has no current attribute; clear any attribute position.
            _attindex = -1;
            InternalTrace(null);
        }

        #endregion

        #region Internal Methods

        /// <summary>
        /// Writes a trace line describing the navigator state and the calling member.
        /// Calls are compiled only when the TRACE symbol is defined, and nothing is written
        /// unless the <c>Trace</c> field is true.
        /// </summary>
        /// <param name="traceValue">An additional value appended to the trace line; may be null.</param>
        [Conditional("TRACE")]
        internal void InternalTrace(object traceValue)
        {
            if (!Trace)
            {
                return;
            }

            // Frame 1 is the direct caller of InternalTrace.
            StackFrame sf = new StackFrame(1, true);
            string name = sf.GetMethod().Name;
            string nodename = _currentnode == null ? "(null)" : _currentnode.Name;
            string nodevalue;
            if (_currentnode == null)
            {
                nodevalue = "(null)";
            }
            else
            {
                switch (_currentnode.NodeType)
                {
                    case HtmlNodeType.Comment:
                        nodevalue = ((HtmlCommentNode) _currentnode).Comment;
                        break;

                    case HtmlNodeType.Document:
                        nodevalue = "";
                        break;

                    case HtmlNodeType.Text:
                        nodevalue = ((HtmlTextNode) _currentnode).Text;
                        break;

                    default:
                        nodevalue = _currentnode.CloneNode(false).OuterHtml;
                        break;
                }
            }

            System.Diagnostics.Trace.WriteLine(
                string.Format("oid={0},n={1},a={2},v={3},{4}", GetHashCode(), nodename, _attindex, nodevalue, traceValue),
                "N!" + name);
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Positions the navigator on the document node, outside of any attribute.
        /// </summary>
        private void Reset()
        {
            InternalTrace(null);
            _currentnode = _doc.DocumentNode;
            _attindex = -1;
        }

        #endregion
    }
}