// HtmlAgilityPack V1.0 - Simon Mourier
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Xml;
using System.Xml.Serialization;
using System.Xml.Xsl;
using Microsoft.Win32;

namespace HtmlAgilityPack
{
    /// <summary>
    /// A utility class to get HTML document from HTTP.
    /// </summary>
    public class HtmlWeb
    {
        #region Delegates

        /// <summary>
        /// Represents the method that will handle the PostResponse event.
        /// </summary>
        /// <param name="request">The request that was executed.</param>
        /// <param name="response">The response received for the request.</param>
        public delegate void PostResponseHandler(HttpWebRequest request, HttpWebResponse response);

        /// <summary>
        /// Represents the method that will handle the PreHandleDocument event.
        /// </summary>
        /// <param name="document">The document that has just been loaded.</param>
        public delegate void PreHandleDocumentHandler(HtmlDocument document);

        /// <summary>
        /// Represents the method that will handle the PreRequest event.
        /// </summary>
        /// <param name="request">The request about to be executed. The handler may modify it.</param>
        /// <returns>true to let the request proceed; false to cancel it before it is sent.</returns>
        public delegate bool PreRequestHandler(HttpWebRequest request);

        #endregion

        #region Fields

        private bool _autoDetectEncoding = true;
        private bool _cacheOnly;
        private string _cachePath;
        private bool _fromCache;
        private int _requestDuration;
        private Uri _responseUri;
        private HttpStatusCode _statusCode = HttpStatusCode.OK;
        private int _streamBufferSize = 1024;
        private bool _useCookies;
        private bool _usingCache;

        /// <summary>
        /// Occurs after an HTTP request has been executed.
        /// </summary>
        public PostResponseHandler PostResponse;

        /// <summary>
        /// Occurs before an HTML document is handled.
        /// </summary>
        public PreHandleDocumentHandler PreHandleDocument;

        /// <summary>
        /// Occurs before an HTTP request is executed.
        /// </summary>
        public PreRequestHandler PreRequest;

        #endregion

        #region Properties

        /// <summary>
        /// Gets or Sets a value indicating if document encoding must be automatically detected.
        /// </summary>
        public bool AutoDetectEncoding
        {
            get { return _autoDetectEncoding; }
            set { _autoDetectEncoding = value; }
        }

        /// <summary>
        /// Gets or Sets a value indicating whether to get document only from the cache.
        /// If this is set to true and document is not found in the cache, nothing will be loaded.
        /// </summary>
        /// <exception cref="HtmlWebException">Set to true while <see cref="UsingCache"/> is false.</exception>
        public bool CacheOnly
        {
            get { return _cacheOnly; }
            set
            {
                if ((value) && !UsingCache)
                {
                    throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
                }
                _cacheOnly = value;
            }
        }

        /// <summary>
        /// Gets or Sets the cache path. If null, no caching mechanism will be used.
        /// </summary>
        public string CachePath
        {
            get { return _cachePath; }
            set { _cachePath = value; }
        }

        /// <summary>
        /// Gets a value indicating if the last document was retrieved from the cache.
        /// </summary>
        public bool FromCache
        {
            get { return _fromCache; }
        }

        /// <summary>
        /// Gets the last request duration in milliseconds.
        /// </summary>
        public int RequestDuration
        {
            get { return _requestDuration; }
        }

        /// <summary>
        /// Gets the URI of the Internet resource that actually responded to the request.
        /// </summary>
        public Uri ResponseUri
        {
            get { return _responseUri; }
        }

        /// <summary>
        /// Gets the last request status.
        /// </summary>
        public HttpStatusCode StatusCode
        {
            get { return _statusCode; }
        }

        /// <summary>
        /// Gets or Sets the size of the buffer used for memory operations.
        /// </summary>
        /// <exception cref="ArgumentException">The assigned value is zero or negative.</exception>
        public int StreamBufferSize
        {
            get { return _streamBufferSize; }
            set
            {
                // Validate the incoming value, not the value currently stored in the field.
                if (value <= 0)
                {
                    throw new ArgumentException("Size must be greater than zero.");
                }
                _streamBufferSize = value;
            }
        }

        /// <summary>
        /// Gets or Sets a value indicating if cookies will be stored.
        /// </summary>
        public bool UseCookies
        {
            get { return _useCookies; }
            set { _useCookies = value; }
        }

        /// <summary>
        /// Gets or Sets a value indicating whether the caching mechanisms should be used or not.
        /// Always reads as false while <see cref="CachePath"/> is null.
        /// </summary>
        /// <exception cref="HtmlWebException">Set to true while <see cref="CachePath"/> is null.</exception>
        public bool UsingCache
        {
            get
            {
                if (_cachePath == null)
                {
                    return false;
                }
                return _usingCache;
            }
            set
            {
                if ((value) && (_cachePath == null))
                {
                    throw new HtmlWebException("You need to define a CachePath first.");
                }
                _usingCache = value;
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Gets the MIME content type for a given path extension.
        /// </summary>
        /// <param name="extension">The input path extension.</param>
        /// <param name="def">The default content type to return if any error occurs.</param>
        /// <returns>
        /// The path extension's MIME content type. Returns <paramref name="def"/> when the extension is
        /// null or empty, when the registry lookup fails, or when the extension key has no
        /// "Content Type" value; returns an empty string when the extension has no registry key.
        /// </returns>
        public static string GetContentTypeForExtension(string extension, string def)
        {
            if (string.IsNullOrEmpty(extension))
            {
                return def;
            }

            string contentType = "";
            try
            {
                RegistryKey reg = Registry.ClassesRoot.OpenSubKey(extension, false);
                if (reg != null)
                {
                    try
                    {
                        // The MIME type is stored in the "Content Type" value; the key's default
                        // value holds the ProgID (e.g. "htmlfile"), not a content type.
                        contentType = (string)reg.GetValue("Content Type", def);
                    }
                    finally
                    {
                        reg.Close();
                    }
                }
            }
            catch (Exception)
            {
                contentType = def;
            }
            return contentType;
        }

        /// <summary>
        /// Gets the path extension for a given MIME content type.
        /// </summary>
        /// <param name="contentType">The input MIME content type.</param>
        /// <param name="def">The default path extension to return if any error occurs.</param>
        /// <returns>
        /// The MIME content type's path extension. Returns <paramref name="def"/> when the content type
        /// is null or empty, when the registry lookup fails, or when the key has no "Extension" value;
        /// returns an empty string when the content type has no registry key.
        /// </returns>
        public static string GetExtensionForContentType(string contentType, string def)
        {
            if (string.IsNullOrEmpty(contentType))
            {
                return def;
            }

            string ext = "";
            try
            {
                RegistryKey reg = Registry.ClassesRoot.OpenSubKey(@"MIME\Database\Content Type\" + contentType, false);
                if (reg != null)
                {
                    try
                    {
                        ext = (string)reg.GetValue("Extension", def);
                    }
                    finally
                    {
                        reg.Close();
                    }
                }
            }
            catch (Exception)
            {
                ext = def;
            }
            return ext;
        }

        /// <summary>
        /// Creates an instance of the given type from the specified Internet resource.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="type">The requested type.</param>
        /// <returns>An newly created instance.</returns>
        public object CreateInstance(string url, Type type)
        {
            return CreateInstance(url, null, null, type);
        }

        /// <summary>
        /// Creates an instance of the given type from the specified Internet resource.
        /// </summary>
        /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
        /// <param name="xsltArgs">An <see cref="XsltArgumentList"/> containing the namespace-qualified arguments used as input to the transform.</param>
        /// <param name="type">The requested type.</param>
        /// <returns>An newly created instance.</returns>
        public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type)
        {
            return CreateInstance(htmlUrl, xsltUrl, xsltArgs, type, null);
        }

        /// <summary>
        /// Creates an instance of the given type from the specified Internet resource.
        /// </summary>
        /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load. If null, no transformation is applied.</param>
        /// <param name="xsltArgs">An <see cref="XsltArgumentList"/> containing the namespace-qualified arguments used as input to the transform.</param>
        /// <param name="type">The requested type.</param>
        /// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes. Only used when <paramref name="xsltUrl"/> is not null.</param>
        /// <returns>An newly created instance.</returns>
        /// <exception cref="Exception">
        /// Deserialization failed. The message contains the original error and the XML that was being
        /// deserialized; the original exception is available as the inner exception.
        /// </exception>
        public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type, string xmlPath)
        {
            StringWriter sw = new StringWriter();
            XmlTextWriter writer = new XmlTextWriter(sw);
            if (xsltUrl == null)
            {
                LoadHtmlAsXml(htmlUrl, writer);
            }
            else
            {
                // A null xmlPath simply skips the debug dump, so one call covers both cases.
                LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, xmlPath);
            }
            writer.Flush();

            string xml = sw.ToString();
            XmlTextReader reader = new XmlTextReader(new StringReader(xml));
            XmlSerializer serializer = new XmlSerializer(type);
            try
            {
                return serializer.Deserialize(reader);
            }
            catch (InvalidOperationException ex)
            {
                // Same exception type and message as before; the cause is now preserved as InnerException.
                throw new Exception(ex + ", --- xml:" + xml, ex);
            }
        }

        /// <summary>
        /// Gets an HTML document from an Internet resource and saves it to the specified file.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="path">The location of the file where you want to save the document.</param>
        public void Get(string url, string path)
        {
            Get(url, path, "GET");
        }

        /// <summary>
        /// Gets an HTML document from an Internet resource and saves it to the specified file. - Proxy aware
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="path">The location of the file where you want to save the document.</param>
        /// <param name="proxy">The proxy to use with this request.</param>
        /// <param name="credentials">The credentials to use when authenticating.</param>
        public void Get(string url, string path, WebProxy proxy, NetworkCredential credentials)
        {
            Get(url, path, proxy, credentials, "GET");
        }

        /// <summary>
        /// Gets an HTML document from an Internet resource and saves it to the specified file.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="path">The location of the file where you want to save the document.</param>
        /// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
        /// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
        public void Get(string url, string path, string method)
        {
            Get(url, path, null, null, method);
        }

        /// <summary>
        /// Gets an HTML document from an Internet resource and saves it to the specified file. Understands Proxies
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="path">The location of the file where you want to save the document.</param>
        /// <param name="proxy">The proxy to use with this request. May be null.</param>
        /// <param name="credentials">The credentials to use when authenticating. May be null.</param>
        /// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
        /// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
        public void Get(string url, string path, WebProxy proxy, NetworkCredential credentials, string method)
        {
            Uri uri = new Uri(url);
            if ((uri.Scheme == Uri.UriSchemeHttps) || (uri.Scheme == Uri.UriSchemeHttp))
            {
                Get(uri, method, path, null, proxy, credentials);
            }
            else
            {
                throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
            }
        }

        /// <summary>
        /// Gets the cache file path for a specified url.
        /// </summary>
        /// <param name="uri">The url fo which to retrieve the cache path. May not be null.</param>
        /// <returns>The cache file path.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        /// <exception cref="HtmlWebException">The cache is not enabled.</exception>
        public string GetCachePath(Uri uri)
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }
            if (!UsingCache)
            {
                throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
            }

            // NOTE(review): for a root URL ("/") the host name is not part of the result, so the root
            // documents of different hosts share one cache file; the query string is never part of the
            // path either. Left unchanged because fixing it would change the on-disk cache layout.
            string cachePath;
            if (uri.AbsolutePath == "/")
            {
                cachePath = Path.Combine(_cachePath, ".htm");
            }
            else
            {
                cachePath = Path.Combine(_cachePath, (uri.Host + uri.AbsolutePath).Replace('/', '\\'));
            }
            return cachePath;
        }

        /// <summary>
        /// Gets an HTML document from an Internet resource.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <returns>A new HTML document.</returns>
        public HtmlDocument Load(string url)
        {
            return Load(url, "GET");
        }

        /// <summary>
        /// Gets an HTML document from an Internet resource.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="proxyHost">Host to use for Proxy</param>
        /// <param name="proxyPort">Port the Proxy is on</param>
        /// <param name="userId">User Id for Authentication</param>
        /// <param name="password">Password for Authentication</param>
        /// <returns>A new HTML document.</returns>
        public HtmlDocument Load(string url, string proxyHost, int proxyPort, string userId, string password)
        {
            // Create my proxy
            WebProxy myProxy = new WebProxy(proxyHost, proxyPort);
            myProxy.BypassProxyOnLocal = true;

            // Create my credentials; they are only built when both parts are supplied.
            NetworkCredential myCreds = null;
            if ((userId != null) && (password != null))
            {
                myCreds = new NetworkCredential(userId, password);
            }

            return Load(url, "GET", myProxy, myCreds);
        }

        /// <summary>
        /// Loads an HTML document from an Internet resource.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
        /// <returns>A new HTML document.</returns>
        /// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
        public HtmlDocument Load(string url, string method)
        {
            return Load(url, method, null, null);
        }

        /// <summary>
        /// Loads an HTML document from an Internet resource.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
        /// <param name="proxy">Proxy to use with this request</param>
        /// <param name="credentials">Credentials to use when authenticating</param>
        /// <returns>A new HTML document.</returns>
        /// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
        public HtmlDocument Load(string url, string method, WebProxy proxy, NetworkCredential credentials)
        {
            Uri uri = new Uri(url);
            HtmlDocument doc;
            if ((uri.Scheme == Uri.UriSchemeHttps) || (uri.Scheme == Uri.UriSchemeHttp))
            {
                doc = LoadUrl(uri, method, proxy, credentials);
            }
            else if (uri.Scheme == Uri.UriSchemeFile)
            {
                // Same parsing options as LoadUrl uses for HTTP documents.
                doc = new HtmlDocument();
                doc.OptionAutoCloseOnEnd = false;
                doc.OptionFixNestedTags = true;
                doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
            }
            else
            {
                throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
            }

            if (PreHandleDocument != null)
            {
                PreHandleDocument(doc);
            }
            return doc;
        }

        /// <summary>
        /// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter.
        /// </summary>
        /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="writer">The XmlTextWriter to which you want to save.</param>
        public void LoadHtmlAsXml(string htmlUrl, XmlTextWriter writer)
        {
            HtmlDocument doc = Load(htmlUrl);
            doc.Save(writer);
        }

        /// <summary>
        /// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter, after an XSLT transformation.
        /// </summary>
        /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
        /// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
        /// <param name="writer">The XmlTextWriter to which you want to save.</param>
        public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer)
        {
            LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, null);
        }

        /// <summary>
        /// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter, after an XSLT transformation.
        /// </summary>
        /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp". May not be null.</param>
        /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
        /// <param name="xsltArgs">
        /// An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.
        /// If null, a new list is created. The "url", "requestDuration" and "fromCache" parameters are
        /// added to the list that is used.
        /// </param>
        /// <param name="writer">The XmlTextWriter to which you want to save.</param>
        /// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes.</param>
        /// <exception cref="ArgumentNullException"><paramref name="htmlUrl"/> is null.</exception>
        public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer, string xmlPath)
        {
            if (htmlUrl == null)
            {
                throw new ArgumentNullException("htmlUrl");
            }

            HtmlDocument doc = Load(htmlUrl);

            if (xmlPath != null)
            {
                XmlTextWriter w = new XmlTextWriter(xmlPath, doc.Encoding);
                try
                {
                    doc.Save(w);
                }
                finally
                {
                    // Release the file handle even when saving fails.
                    w.Close();
                }
            }

            if (xsltArgs == null)
            {
                xsltArgs = new XsltArgumentList();
            }

            // add some useful variables to the xslt doc
            xsltArgs.AddParam("url", "", htmlUrl);
            xsltArgs.AddParam("requestDuration", "", RequestDuration);
            xsltArgs.AddParam("fromCache", "", FromCache);

            XslCompiledTransform xslt = new XslCompiledTransform();
            xslt.Load(xsltUrl);
            xslt.Transform(doc, xsltArgs, writer);
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Prepares <paramref name="target"/> for being written: clears the read-only attribute of an
        /// existing file, or creates the parent directory of a file that does not exist yet.
        /// </summary>
        private static void FilePreparePath(string target)
        {
            if (File.Exists(target))
            {
                FileAttributes atts = File.GetAttributes(target);
                File.SetAttributes(target, atts & ~FileAttributes.ReadOnly);
            }
            else
            {
                // A bare file name has no directory part; there is nothing to create in that case.
                string dir = Path.GetDirectoryName(target);
                if (!string.IsNullOrEmpty(dir) && !Directory.Exists(dir))
                {
                    Directory.CreateDirectory(dir);
                }
            }
        }

        /// <summary>
        /// Returns <paramref name="t"/> truncated to whole seconds, keeping its <see cref="DateTime.Kind"/>.
        /// </summary>
        private static DateTime RemoveMilliseconds(DateTime t)
        {
            return new DateTime(t.Year, t.Month, t.Day, t.Hour, t.Minute, t.Second, t.Kind);
        }

        /// <summary>
        /// Copies the cached file to <paramref name="path"/> and gives the copy the cached file's
        /// last-write time. Does nothing when <paramref name="path"/> is null.
        /// </summary>
        private static void CopyFromCache(string cachePath, string path)
        {
            if (path != null)
            {
                IOLibrary.CopyAlways(cachePath, path);
                // touch the file
                File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
            }
        }

        /// <summary>
        /// Writes the whole content of <paramref name="stream"/> to the file at <paramref name="path"/>,
        /// closes the stream, and sets the file's last-write time to <paramref name="touchDate"/>.
        /// </summary>
        /// <returns>The number of bytes written.</returns>
        // ReSharper disable UnusedMethodReturnValue.Local
        private static long SaveStream(Stream stream, string path, DateTime touchDate, int streamBufferSize)
        // ReSharper restore UnusedMethodReturnValue.Local
        {
            long len = 0;
            try
            {
                FilePreparePath(path);
                using (FileStream fs = new FileStream(path, FileMode.Create, FileAccess.Write))
                {
                    byte[] buffer = new byte[streamBufferSize];
                    int read;
                    while ((read = stream.Read(buffer, 0, buffer.Length)) > 0)
                    {
                        fs.Write(buffer, 0, read);
                        len += read;
                    }
                }
            }
            finally
            {
                // The input stream is always closed, including when the target file cannot be opened.
                stream.Close();
            }

            File.SetLastWriteTime(path, touchDate);
            return len;
        }

        /// <summary>
        /// Executes the HTTP request and, depending on the configuration, stores the response in the
        /// cache, copies it to <paramref name="path"/>, or loads it into <paramref name="doc"/>.
        /// </summary>
        /// <param name="uri">The resource to request.</param>
        /// <param name="method">The HTTP method.</param>
        /// <param name="path">Optional file that receives a copy of the cached document. May be null.</param>
        /// <param name="doc">Optional document loaded in memory when the cache is not used. May be null.</param>
        /// <param name="proxy">Optional proxy. May be null.</param>
        /// <param name="creds">Optional credentials; default credentials are used with a proxy when null.</param>
        /// <returns>
        /// The response status code; <see cref="HttpStatusCode.NotModified"/> when the cached file is
        /// used; <see cref="HttpStatusCode.ResetContent"/> when the PreRequest handler cancelled the request.
        /// </returns>
        private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc, IWebProxy proxy, ICredentials creds)
        {
            string cachePath = null;
            bool oldFile = false;

            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
            req.Method = method;

            if (proxy != null)
            {
                if (creds != null)
                {
                    proxy.Credentials = creds;
                    req.Credentials = creds;
                }
                else
                {
                    proxy.Credentials = CredentialCache.DefaultCredentials;
                    req.Credentials = CredentialCache.DefaultCredentials;
                }
                req.Proxy = proxy;
            }

            _fromCache = false;
            _requestDuration = 0;
            int tc = Environment.TickCount;

            if (UsingCache)
            {
                cachePath = GetCachePath(req.RequestUri);
                if (File.Exists(cachePath))
                {
                    // SaveStream stamps the cached file's last-write time with the server's
                    // Last-Modified date, so that is the timestamp to send back to the server.
                    req.IfModifiedSince = File.GetLastWriteTime(cachePath);
                    oldFile = true;
                }
            }

            if (_cacheOnly)
            {
                if (!File.Exists(cachePath))
                {
                    throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
                }

                CopyFromCache(cachePath, path);
                _fromCache = true;
                return HttpStatusCode.NotModified;
            }

            if (_useCookies)
            {
                req.CookieContainer = new CookieContainer();
            }

            if (PreRequest != null)
            {
                // allow our user to change the request at will
                if (!PreRequest(req))
                {
                    return HttpStatusCode.ResetContent;
                }
            }

            HttpWebResponse resp;
            try
            {
                resp = (HttpWebResponse)req.GetResponse();
            }
            catch (WebException we)
            {
                _requestDuration = Environment.TickCount - tc;
                resp = (HttpWebResponse)we.Response;
                if (resp == null)
                {
                    if (oldFile)
                    {
                        // No response at all: fall back to the copy we already have.
                        CopyFromCache(cachePath, path);
                        _fromCache = true;
                        return HttpStatusCode.NotModified;
                    }
                    throw;
                }
            }
            catch (Exception)
            {
                _requestDuration = Environment.TickCount - tc;
                throw;
            }

            try
            {
                // allow our user to get some info from the response
                if (PostResponse != null)
                {
                    PostResponse(req, resp);
                }

                _requestDuration = Environment.TickCount - tc;
                _responseUri = resp.ResponseUri;

                // Read the status before the response is closed in the finally block.
                HttpStatusCode status = resp.StatusCode;
                bool html = IsHtmlContent(resp.ContentType);

                Encoding respenc = null;
                if ((resp.ContentEncoding != null) && (resp.ContentEncoding.Length > 0))
                {
                    try
                    {
                        respenc = Encoding.GetEncoding(resp.ContentEncoding);
                    }
                    catch (ArgumentException)
                    {
                        // The Content-Encoding header may name a content coding such as "gzip",
                        // which is not a character encoding: fall back to encoding auto-detection
                        // instead of failing the whole request.
                        respenc = null;
                    }
                }

                if (status == HttpStatusCode.NotModified)
                {
                    if (!UsingCache)
                    {
                        // this should *never* happen...
                        throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
                    }

                    _fromCache = true;
                    CopyFromCache(cachePath, path);
                    return status;
                }

                Stream s = resp.GetResponseStream();
                if (s != null)
                {
                    if (UsingCache)
                    {
                        // NOTE(review): the document passed in 'doc' is not populated on this path;
                        // only a NotModified result makes LoadUrl read the cached file. Confirm
                        // whether a fresh download should also be loaded into 'doc'.

                        // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                        SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);

                        // save headers
                        SaveCacheHeaders(req.RequestUri, resp);

                        // copy and touch the file
                        CopyFromCache(cachePath, path);
                    }
                    else if ((doc != null) && (html))
                    {
                        // NOTE(review): without the cache, 'path' is never written, so the public
                        // Get(url, path, ...) overloads save nothing in that configuration.

                        // try to work in-memory
                        if (respenc != null)
                        {
                            doc.Load(s, respenc);
                        }
                        else
                        {
                            doc.Load(s, true);
                        }
                    }
                }

                return status;
            }
            finally
            {
                // Release the connection on every path, including NotModified and failures.
                resp.Close();
            }
        }

        /// <summary>
        /// Reads one response header value from the cached headers file of a request.
        /// </summary>
        /// <param name="requestUri">The request whose cached headers are read.</param>
        /// <param name="name">The header name; matched case-insensitively.</param>
        /// <param name="def">The value returned when the header is not found.</param>
        /// <returns>The cached header value, or <paramref name="def"/>.</returns>
        private string GetCacheHeader(Uri requestUri, string name, string def)
        {
            // note: some headers are collection (ex: www-authenticate)
            // we don't handle that here
            XmlDocument doc = new XmlDocument();
            doc.Load(GetCacheHeadersPath(requestUri));

            // Compare in code rather than building an XPath literal from 'name', which would break
            // on names containing a quote.
            foreach (XmlNode node in doc.SelectNodes("//h"))
            {
                XmlAttribute headerName = node.Attributes["n"];
                if ((headerName != null) && string.Equals(headerName.Value, name, StringComparison.OrdinalIgnoreCase))
                {
                    // SaveCacheHeaders stores the header name in "n" and its value in "v".
                    XmlAttribute headerValue = node.Attributes["v"];
                    return headerValue != null ? headerValue.Value : def;
                }
            }
            return def;
        }

        /// <summary>
        /// Gets the path of the file holding the cached response headers for <paramref name="uri"/>.
        /// </summary>
        private string GetCacheHeadersPath(Uri uri)
        {
            return GetCachePath(uri) + ".h.xml";
        }

        /// <summary>
        /// Indicates whether the extension of <paramref name="path"/> is registered with an HTML content type.
        /// </summary>
        private bool IsCacheHtmlContent(string path)
        {
            string ct = GetContentTypeForExtension(Path.GetExtension(path), null);
            return IsHtmlContent(ct);
        }

        /// <summary>
        /// Indicates whether <paramref name="contentType"/> starts with "text/html" (case-insensitive).
        /// A null content type is not HTML.
        /// </summary>
        private bool IsHtmlContent(string contentType)
        {
            return (contentType != null)
                && contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Requests <paramref name="uri"/> into a new document and records the status code. When the
        /// request reports NotModified, the document is loaded from the cached file instead.
        /// </summary>
        private HtmlDocument LoadUrl(Uri uri, string method, WebProxy proxy, NetworkCredential creds)
        {
            HtmlDocument doc = new HtmlDocument();
            doc.OptionAutoCloseOnEnd = false;
            doc.OptionFixNestedTags = true;

            _statusCode = Get(uri, method, null, doc, proxy, creds);
            if (_statusCode == HttpStatusCode.NotModified)
            {
                // read cached encoding
                doc.DetectEncodingAndLoad(GetCachePath(uri));
            }
            return doc;
        }

        /// <summary>
        /// Saves the response headers next to the cached document, as an XML file with one
        /// <c>h</c> element per header (attribute <c>n</c> = name, <c>v</c> = value).
        /// </summary>
        private void SaveCacheHeaders(Uri requestUri, HttpWebResponse resp)
        {
            // we cache the original headers aside the cached document.
            string file = GetCacheHeadersPath(requestUri);

            XmlDocument doc = new XmlDocument();
            // LoadXml requires a root element; an empty string always fails to parse.
            doc.LoadXml("<c></c>");
            XmlNode cache = doc.FirstChild;

            foreach (string header in resp.Headers)
            {
                XmlElement entry = doc.CreateElement("h");
                entry.SetAttribute("n", header);
                entry.SetAttribute("v", resp.Headers[header]);
                cache.AppendChild(entry);
            }

            doc.Save(file);
        }

        #endregion
    }
}