1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
6 using System.Xml.Serialization;
11 namespace HtmlAgilityPack
14 /// A utility class to get HTML document from HTTP.
19 /// Represents the method that will handle the PreRequest event.
/// The handler receives the request before it is sent and may modify it;
/// when it returns false the request is not sent.
21 public delegate bool PreRequestHandler(HttpWebRequest request);
24 /// Represents the method that will handle the PostResponse event.
/// The handler receives the request together with the response obtained for it.
26 public delegate void PostResponseHandler(HttpWebRequest request, HttpWebResponse response);
29 /// Represents the method that will handle the PreHandleDocument event.
/// The handler receives the document that Load has just loaded.
31 public delegate void PreHandleDocumentHandler(HtmlDocument document);
// Buffer size handed to SaveStream when a response is written to the cache.
33 private int _streamBufferSize = 1024;
// Directory under which GetCachePath builds cache file names.
34 private string _cachePath;
// NOTE(review): presumably the backing fields of UsingCache, FromCache,
// CacheOnly and UseCookies - confirm against the property bodies.
35 private bool _usingCache;
36 private bool _fromCache;
37 private bool _cacheOnly;
38 private bool _useCookies;
// Duration of the last request, computed as a difference of Environment.TickCount values.
39 private int _requestDuration;
// Passed to HtmlDocument.DetectEncodingAndLoad when a local file is loaded.
40 private bool _autoDetectEncoding = true;
// Status stored by LoadUrl after each request.
41 private HttpStatusCode _statusCode = HttpStatusCode.OK;
// ResponseUri of the last response received.
42 private Uri _responseUri;
45 /// Occurs before an HTTP request is executed.
// Declared as a public delegate field rather than a C# event.
47 public PreRequestHandler PreRequest;
50 /// Occurs after an HTTP request has been executed.
// Declared as a public delegate field rather than a C# event.
52 public PostResponseHandler PostResponse;
55 /// Occurs before an HTML document is handled.
// Declared as a public delegate field rather than a C# event.
57 public PreHandleDocumentHandler PreHandleDocument;
60 /// Creates an instance of an HtmlWeb class.
/// <summary>
/// Downloads the document at the given URL using the HTTP GET method and
/// saves it to the specified file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
public void Get(string url, string path)
{
    const string defaultMethod = "GET";
    Get(url, path, defaultMethod);
}
/// <summary>
/// Gets an HTML document from an Internet resource and saves it to the specified file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
public void Get(string url, string path, string method)
{
    Uri uri = new Uri(url);
    bool isHttp = (uri.Scheme == Uri.UriSchemeHttp) || (uri.Scheme == Uri.UriSchemeHttps);
    if (!isHttp)
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    // Keep the request outcome, as LoadUrl does, so that StatusCode describes
    // this request instead of whatever request happened to run before it.
    _statusCode = Get(uri, method, path, null);
}
/// <summary>
/// Loads an HTML document from an Internet resource using the HTTP GET method.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <returns>A new HTML document.</returns>
public HtmlDocument Load(string url)
{
    HtmlDocument document = Load(url, "GET");
    return document;
}
/// <summary>
/// Loads an HTML document from an Internet resource (http/https) or from a
/// local file (file scheme).
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <returns>A new HTML document.</returns>
/// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
public HtmlDocument Load(string url, string method)
{
    Uri uri = new Uri(url);
    HtmlDocument doc;
    if ((uri.Scheme == Uri.UriSchemeHttps) ||
        (uri.Scheme == Uri.UriSchemeHttp))
    {
        doc = LoadUrl(uri, method);
    }
    else if (uri.Scheme == Uri.UriSchemeFile)
    {
        // Same parser options as LoadUrl. The previous code assigned
        // OptionAutoCloseOnEnd twice (false, then true), which left
        // OptionFixNestedTags unset for local files.
        doc = new HtmlDocument();
        doc.OptionAutoCloseOnEnd = false;
        doc.OptionFixNestedTags = true;
        doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
    }
    else
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    // Give the subscriber a chance to inspect or adjust the document before it is returned.
    if (PreHandleDocument != null)
    {
        PreHandleDocument(doc);
    }
    return doc;
}
// Tells whether the file at the given path is HTML, judging by the content
// type registered for its extension.
private bool IsCacheHtmlContent(string path)
{
    string extension = Path.GetExtension(path);
    return IsHtmlContent(GetContentTypeForExtension(extension, null));
}
// Tells whether a Content-Type value denotes HTML, i.e. starts with
// "text/html" (case-insensitive; parameters such as "; charset=..." may follow).
// A null content type is reported as not HTML instead of throwing.
private bool IsHtmlContent(string contentType)
{
    const string htmlPrefix = "text/html";

    if (contentType == null)
    {
        return false;
    }

    // Invariant lowering plus an ordinal comparison keeps the result
    // independent of the current thread culture.
    string lowered = contentType.ToLower(System.Globalization.CultureInfo.InvariantCulture);
    return string.CompareOrdinal(lowered, 0, htmlPrefix, 0, htmlPrefix.Length) == 0;
}
// Builds the path of the side file holding the cached response headers:
// the document's cache path with ".h.xml" appended.
private string GetCacheHeadersPath(Uri uri)
{
    string documentPath = GetCachePath(uri);
    return string.Concat(documentPath, ".h.xml");
}
/// <summary>
/// Gets the cache file path for a specified url.
/// </summary>
/// <param name="uri">The url for which to retrieve the cache path. May not be null.</param>
/// <returns>The cache file path.</returns>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
/// <exception cref="HtmlWebException">The cache is not enabled.</exception>
public string GetCachePath(Uri uri)
{
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }
    if (!UsingCache)
    {
        throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
    }

    if (uri.AbsolutePath == "/")
    {
        // NOTE(review): the site root maps to "<cache>/.htm" without the host
        // name, so root pages of different hosts share one entry. Left as is
        // to keep existing caches valid.
        return Path.Combine(_cachePath, ".htm");
    }

    // Mirror the URL hierarchy below the cache directory. Using the platform
    // separator (instead of a hard-coded backslash) yields the same paths on
    // Windows and valid nested paths elsewhere.
    string relativePath = (uri.Host + uri.AbsolutePath).Replace('/', Path.DirectorySeparatorChar);
    return Path.Combine(_cachePath, relativePath);
}
/// <summary>
/// Gets a value indicating whether the last document was served from the cache.
/// </summary>
public bool FromCache
{
    get { return _fromCache; }
}
/// <summary>
/// Gets the URI of the Internet resource that actually answered the last request.
/// </summary>
public Uri ResponseUri
{
    get { return _responseUri; }
}
/// <summary>
/// Gets or sets a value indicating whether documents are taken from the cache only.
/// When true and the document is not in the cache, nothing is loaded.
/// </summary>
/// <exception cref="HtmlWebException">Set to true while the cache is not enabled.</exception>
public bool CacheOnly
{
    get { return _cacheOnly; }
    set
    {
        // Turning the flag on only makes sense while the cache is active.
        if (value)
        {
            if (!UsingCache)
            {
                throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
            }
        }
        _cacheOnly = value;
    }
}
/// <summary>
/// Gets or sets a value indicating whether a cookie container is attached to each request.
/// </summary>
public bool UseCookies
{
    get { return _useCookies; }
    set { _useCookies = value; }
}
/// <summary>
/// Gets the duration of the last request, in milliseconds.
/// </summary>
public int RequestDuration
{
    get { return _requestDuration; }
}
/// <summary>
/// Gets or sets a value indicating whether the document encoding is detected automatically.
/// </summary>
public bool AutoDetectEncoding
{
    get { return _autoDetectEncoding; }
    set { _autoDetectEncoding = value; }
}
/// <summary>
/// Gets the status of the last request.
/// </summary>
public HttpStatusCode StatusCode
{
    get { return _statusCode; }
}
/// <summary>
/// Gets or sets the size, in bytes, of the buffer used when a response
/// stream is copied to the cache.
/// </summary>
/// <exception cref="ArgumentException">The assigned size is zero or negative.</exception>
public int StreamBufferSize
{
    get
    {
        return _streamBufferSize;
    }
    set
    {
        // Validate the incoming value. The previous code tested the current
        // field, which let invalid sizes through.
        if (value <= 0)
        {
            throw new ArgumentException("Size must be greater than zero.");
        }
        _streamBufferSize = value;
    }
}
// Requests the given http/https resource and returns it as a document.
// The request status is remembered in _statusCode; on a NotModified status
// the document is read from the cache file.
private HtmlDocument LoadUrl(Uri uri, string method)
{
    HtmlDocument document = new HtmlDocument();
    document.OptionAutoCloseOnEnd = false;
    document.OptionFixNestedTags = true;

    HttpStatusCode status = Get(uri, method, null, document);
    _statusCode = status;

    if (status == HttpStatusCode.NotModified)
    {
        // The server copy is unchanged: load the cached file instead.
        string cachedFile = GetCachePath(uri);
        document.DetectEncodingAndLoad(cachedFile);
    }
    return document;
}
// Executes the request and delivers the result to the cache, to 'path'
// (when not null) and/or to 'doc' (when not null).
//
// uri:    resource to request (http or https).
// method: HTTP method to use.
// path:   optional file that receives a copy of the cached document.
// doc:    optional document that receives HTML content.
//
// Returns the response status, NotModified when the cached copy was used, or
// ResetContent when a PreRequest handler cancelled the request.
// Throws HtmlWebException when CacheOnly is set and the cache has no entry,
// or when the server answers NotModified although the cache is disabled.
private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc)
{
    string cachePath = null;
    bool oldFile = false;

    HttpWebRequest req = WebRequest.Create(uri) as HttpWebRequest;
    req.Method = method;

    _fromCache = false;
    _requestDuration = 0;
    int tc = Environment.TickCount;
    if (UsingCache)
    {
        cachePath = GetCachePath(req.RequestUri);
        if (File.Exists(cachePath))
        {
            // The cached file's last WRITE time carries the server's
            // Last-Modified value (set when the file is saved below). The
            // last ACCESS time changes whenever the cache is read and would
            // make the server report "not modified" for updated resources.
            req.IfModifiedSince = File.GetLastWriteTime(cachePath);
            oldFile = true;
        }
    }

    if (_cacheOnly)
    {
        if (!File.Exists(cachePath))
        {
            throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
        }

        CopyFromCache(cachePath, path);
        _fromCache = true;
        return HttpStatusCode.NotModified;
    }

    if (_useCookies)
    {
        req.CookieContainer = new CookieContainer();
    }

    if (PreRequest != null)
    {
        // allow our user to change the request at will
        if (!PreRequest(req))
        {
            return HttpStatusCode.ResetContent;
        }
    }

    HttpWebResponse resp;
    try
    {
        resp = req.GetResponse() as HttpWebResponse;
    }
    catch (WebException we)
    {
        _requestDuration = Environment.TickCount - tc;
        resp = (HttpWebResponse)we.Response;
        if (resp == null)
        {
            // No response at all (e.g. network failure): fall back to the
            // cached copy when there is one.
            if (oldFile)
            {
                CopyFromCache(cachePath, path);
                _fromCache = true;
                return HttpStatusCode.NotModified;
            }
            throw;
        }
        // Otherwise the server answered with an error/redirect status that
        // is handled like any other response below.
    }
    catch (Exception)
    {
        _requestDuration = Environment.TickCount - tc;
        throw;
    }

    try
    {
        // allow our user to get some info from the response
        if (PostResponse != null)
        {
            PostResponse(req, resp);
        }

        _requestDuration = Environment.TickCount - tc;
        _responseUri = resp.ResponseUri;

        // Read everything needed from the response before it gets closed.
        HttpStatusCode status = resp.StatusCode;
        bool html = IsHtmlContent(resp.ContentType);

        // NOTE(review): ContentEncoding reflects the Content-Encoding header
        // (e.g. "gzip"), not the charset; CharacterSet may be what was meant.
        // Behavior kept as is - confirm before changing.
        System.Text.Encoding respenc = null;
        if ((resp.ContentEncoding != null) && (resp.ContentEncoding.Length > 0))
        {
            respenc = System.Text.Encoding.GetEncoding(resp.ContentEncoding);
        }

        if (status == HttpStatusCode.NotModified)
        {
            if (!UsingCache)
            {
                // this should *never* happen...
                throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
            }

            _fromCache = true;
            CopyFromCache(cachePath, path);
            return status;
        }

        Stream s = resp.GetResponseStream();
        if (s != null)
        {
            if (UsingCache)
            {
                // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);

                // save headers
                SaveCacheHeaders(req.RequestUri, resp);

                CopyFromCache(cachePath, path);

                // The stream has been consumed by the cache file, so read the
                // document back from there. Without this the caller got an
                // empty document on the first (uncached) load.
                if ((doc != null) && html)
                {
                    doc.DetectEncodingAndLoad(cachePath);
                }
            }
            else if ((doc != null) && html)
            {
                // try to work in-memory
                if (respenc != null)
                {
                    doc.Load(s, respenc);
                }
                else
                {
                    doc.Load(s, true);
                }
            }
        }
        return status;
    }
    finally
    {
        // Release the connection on every path, including NotModified
        // responses and failures while saving or parsing.
        resp.Close();
    }
}

// Copies the cached document to 'path' (when a path was supplied) and gives
// the copy the same last write time as the cached file.
private static void CopyFromCache(string cachePath, string path)
{
    if (path == null)
    {
        return;
    }
    IOLibrary.CopyAlways(cachePath, path);
    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
}
// Returns the value of a response header stored next to a cached document,
// or 'def' when the header was not recorded. Header names are matched
// case-insensitively.
// note: some headers are collection (ex: www-authenticate)
// we don't handle that here
private string GetCacheHeader(Uri requestUri, string name, string def)
{
    XmlDocument doc = new XmlDocument();
    doc.Load(GetCacheHeadersPath(requestUri));

    // Entries are written by SaveCacheHeaders as <h n="name" v="value"/>.
    // Matching in code, rather than via a concatenated XPath string, keeps
    // header names containing quotes from breaking the query.
    foreach (XmlNode entry in doc.SelectNodes("//h"))
    {
        XmlAttribute nameAttribute = entry.Attributes["n"];
        if (nameAttribute == null)
        {
            continue;
        }
        if (string.Compare(nameAttribute.Value, name, true, System.Globalization.CultureInfo.InvariantCulture) != 0)
        {
            continue;
        }

        // The value lives in the "v" attribute. The previous code looked up
        // an attribute named after the header, which never exists.
        XmlAttribute valueAttribute = entry.Attributes["v"];
        return (valueAttribute != null) ? valueAttribute.Value : def;
    }
    return def;
}
// Writes the response headers to the XML side file of the cached document,
// as one <h n="name" v="value"/> element per header under a <c> root.
private void SaveCacheHeaders(Uri requestUri, HttpWebResponse resp)
{
    string headersFile = GetCacheHeadersPath(requestUri);

    XmlDocument headersDoc = new XmlDocument();
    headersDoc.LoadXml("<c></c>");
    XmlNode root = headersDoc.FirstChild;

    foreach (string headerName in resp.Headers)
    {
        XmlElement item = headersDoc.CreateElement("h");
        item.SetAttribute("n", headerName);
        item.SetAttribute("v", resp.Headers[headerName]);
        root.AppendChild(item);
    }

    headersDoc.Save(headersFile);
}
// Copies 'stream' into the file at 'path' in chunks of 'streamBufferSize'
// bytes, closes both ends, stamps the file with 'touchDate' as its last
// write time and returns the number of bytes written.
private static long SaveStream(Stream stream, string path, DateTime touchDate, int streamBufferSize)
{
    FilePreparePath(path);
    FileStream output = new FileStream(path, FileMode.Create, FileAccess.Write);
    BinaryReader reader = null;
    BinaryWriter writer = null;
    long total = 0;
    try
    {
        reader = new BinaryReader(stream);
        writer = new BinaryWriter(output);

        // ReadBytes returns an empty array once the source is exhausted.
        for (byte[] chunk = reader.ReadBytes(streamBufferSize);
             chunk.Length > 0;
             chunk = reader.ReadBytes(streamBufferSize))
        {
            total += chunk.Length;
            writer.Write(chunk);
        }
    }
    finally
    {
        if (reader != null)
        {
            reader.Close();
        }
        if (writer != null)
        {
            writer.Flush();
            writer.Close();
        }
        if (output != null)
        {
            output.Close();
        }
    }

    File.SetLastWriteTime(path, touchDate);
    return total;
}
// Makes sure a file can be written at 'target': an existing file loses its
// read-only attribute, otherwise the parent directory is created if missing.
private static void FilePreparePath(string target)
{
    if (File.Exists(target))
    {
        FileAttributes atts = File.GetAttributes(target);
        File.SetAttributes(target, atts & ~FileAttributes.ReadOnly);
        return;
    }

    string dir = Path.GetDirectoryName(target);

    // GetDirectoryName returns null for a root path and "" for a bare file
    // name. There is nothing to create in either case, and passing those
    // values to CreateDirectory would throw.
    if ((dir == null) || (dir.Length == 0))
    {
        return;
    }

    if (!Directory.Exists(dir))
    {
        Directory.CreateDirectory(dir);
    }
}
// Returns 't' truncated to whole seconds. Subtracting the sub-second ticks
// (instead of rebuilding the value from its components) keeps the original
// DateTime.Kind, so a UTC or local timestamp is not turned into an
// unspecified one.
private static DateTime RemoveMilliseconds(DateTime t)
{
    return t.AddTicks(-(t.Ticks % TimeSpan.TicksPerSecond));
}
/// <summary>
/// Gets the path extension for a given MIME content type, as registered
/// under HKEY_CLASSES_ROOT\MIME\Database\Content Type.
/// </summary>
/// <param name="contentType">The input MIME content type.</param>
/// <param name="def">The default path extension to return if the type is unknown or any error occurs.</param>
/// <returns>The MIME content type's path extension.</returns>
public static string GetExtensionForContentType(string contentType, string def)
{
    if ((contentType == null) || (contentType.Length == 0))
    {
        return def;
    }

    RegistryKey key = null;
    try
    {
        key = Registry.ClassesRoot.OpenSubKey(@"MIME\Database\Content Type\" + contentType, false);
        if (key == null)
        {
            // Unknown content type: no need to provoke and swallow a
            // NullReferenceException.
            return def;
        }

        string ext = key.GetValue("Extension", def) as string;
        return (ext != null) ? ext : def;
    }
    catch (Exception)
    {
        // Deliberate best effort: registry access may be denied or not
        // available at all; the caller's default is the documented fallback.
        return def;
    }
    finally
    {
        if (key != null)
        {
            key.Close();
        }
    }
}
/// <summary>
/// Gets the MIME content type for a given path extension, as registered
/// under HKEY_CLASSES_ROOT\&lt;extension&gt;.
/// </summary>
/// <param name="extension">The input path extension.</param>
/// <param name="def">The default content type to return if the extension is unknown or any error occurs.</param>
/// <returns>The path extension's MIME content type.</returns>
public static string GetContentTypeForExtension(string extension, string def)
{
    if ((extension == null) || (extension.Length == 0))
    {
        return def;
    }

    RegistryKey key = null;
    try
    {
        key = Registry.ClassesRoot.OpenSubKey(extension, false);
        if (key == null)
        {
            return def;
        }

        // The MIME type is stored in the "Content Type" value. The key's
        // default value (read previously) holds the file class / ProgID,
        // e.g. "htmlfile", which is not a content type.
        string contentType = key.GetValue("Content Type", def) as string;
        return (contentType != null) ? contentType : def;
    }
    catch (Exception)
    {
        // Deliberate best effort: registry access may be denied or not
        // available at all; the caller's default is the documented fallback.
        return def;
    }
    finally
    {
        if (key != null)
        {
            key.Close();
        }
    }
}
/// <summary>
/// Loads an HTML document from an Internet resource and writes it, as XML,
/// to the specified XmlTextWriter.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="writer">The XmlTextWriter to which you want to save.</param>
public void LoadHtmlAsXml(string htmlUrl, XmlTextWriter writer)
{
    Load(htmlUrl).Save(writer);
}
/// <summary>
/// Loads an HTML document from an Internet resource, applies an XSLT
/// transformation to it and writes the result to the specified XmlTextWriter.
/// No intermediate XML file is kept.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="writer">The XmlTextWriter to which you want to save.</param>
public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer)
{
    string noDebugXmlPath = null;
    LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, noDebugXmlPath);
}
/// <summary>
/// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter, after an XSLT transformation.
/// The stylesheet receives the parameters "url", "requestDuration" and "fromCache".
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp". May not be null.</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform. May be null.</param>
/// <param name="writer">The XmlTextWriter to which you want to save.</param>
/// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes.</param>
/// <exception cref="ArgumentNullException"><paramref name="htmlUrl"/> is null.</exception>
public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer, string xmlPath)
{
    if (htmlUrl == null)
    {
        throw new ArgumentNullException("htmlUrl");
    }

    HtmlDocument doc = Load(htmlUrl);

    if (xmlPath != null)
    {
        XmlTextWriter w = new XmlTextWriter(xmlPath, doc.Encoding);
        try
        {
            doc.Save(w);
        }
        finally
        {
            // Release the file even when writing the document fails.
            w.Close();
        }
    }

    if (xsltArgs == null)
    {
        xsltArgs = new XsltArgumentList();
    }

    // add some useful variables to the xslt doc
    // AddParam throws when a parameter of the same name already exists, which
    // happened as soon as a caller reused its argument list for a second
    // call. Dropping any previous value first makes the call repeatable.
    xsltArgs.RemoveParam("url", "");
    xsltArgs.AddParam("url", "", htmlUrl);
    xsltArgs.RemoveParam("requestDuration", "");
    xsltArgs.AddParam("requestDuration", "", RequestDuration);
    xsltArgs.RemoveParam("fromCache", "");
    xsltArgs.AddParam("fromCache", "", FromCache);

    XslTransform xslt = new XslTransform();
    xslt.Load(xsltUrl);
    xslt.Transform(doc, xsltArgs, writer, null);
}
/// <summary>
/// Creates an instance of the given type by deserializing the document at
/// the specified Internet resource, without any XSLT transformation.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string url, Type type)
{
    string noStylesheet = null;
    XsltArgumentList noArguments = null;
    return CreateInstance(url, noStylesheet, noArguments, type);
}
/// <summary>
/// Creates an instance of the given type from the specified Internet
/// resource, optionally transformed by an XSLT stylesheet. No intermediate
/// XML file is kept.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type)
{
    string noDebugXmlPath = null;
    return CreateInstance(htmlUrl, xsltUrl, xsltArgs, type, noDebugXmlPath);
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource.
/// The document is converted to XML (through the stylesheet when one is
/// given) and then deserialized with an XmlSerializer.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load. When null, no transformation is applied.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="type">The requested type.</param>
/// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes. Only used together with a stylesheet.</param>
/// <returns>A newly created instance.</returns>
/// <exception cref="Exception">Deserialization failed; the message contains the XML, the InnerException the original error.</exception>
public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type, string xmlPath)
{
    StringWriter sw = new StringWriter();
    XmlTextWriter writer = new XmlTextWriter(sw);
    if (xsltUrl == null)
    {
        LoadHtmlAsXml(htmlUrl, writer);
    }
    else
    {
        // A null xmlPath simply disables the debug copy, so a single call
        // covers both cases.
        LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, xmlPath);
    }
    writer.Flush();

    string xml = sw.ToString();
    XmlSerializer serializer = new XmlSerializer(type);
    XmlTextReader reader = new XmlTextReader(new StringReader(xml));
    try
    {
        return serializer.Deserialize(reader);
    }
    catch (InvalidOperationException ex)
    {
        // Same message as before, but the original exception is now attached
        // so its type and stack trace stay available to the caller.
        throw new Exception(ex.ToString() + ", --- xml:" + xml, ex);
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Gets or sets the cache path. While it is null, no caching mechanism is used.
/// </summary>
public string CachePath
{
    get { return _cachePath; }
    set { _cachePath = value; }
}
/// <summary>
/// Gets or sets a value indicating whether the caching mechanisms are used.
/// Always reads as false while no cache path is defined.
/// </summary>
/// <exception cref="HtmlWebException">Set to true while no cache path is defined.</exception>
public bool UsingCache
{
    get
    {
        return (_cachePath != null) && _usingCache;
    }
    set
    {
        bool missingPath = (_cachePath == null);
        if (value && missingPath)
        {
            throw new HtmlWebException("You need to define a CachePath first.");
        }
        _usingCache = value;
    }
}
810 /// Represents an exception thrown by the HtmlWeb utility class.
812 public class HtmlWebException: Exception
815 /// Creates an instance of the HtmlWebException.
817 /// <param name="message">The exception's message.</param>
818 public HtmlWebException(string message)