Merge branch 'document'

mganss · Mar 21, 2016 · 4e3a647 · 4e3a647
2 parents af609b5 + 2f23ab4
commit 4e3a647
Show file tree

Hide file tree

Showing 6 changed files with 119 additions and 60 deletions.
diff --git a/HtmlSanitizer.Tests/Tests.cs b/HtmlSanitizer.Tests/Tests.cs
@@ -2202,8 +2202,7 @@ public void RussianTextTest()
 
             // Act
             var htmlFragment = "Тест";
-            //var outputFormatter = new CsQuery.Output.FormatDefault(DomRenderingOptions.RemoveComments | DomRenderingOptions.QuoteAllAttributes, HtmlEncoders.Minimum);
-            var actual = s.Sanitize(htmlFragment, ""/*, outputFormatter*/);
+            var actual = s.Sanitize(htmlFragment, "");
 
             // Assert
             var expected = htmlFragment;
@@ -2487,6 +2486,40 @@ public void RemoveEventForNotAllowedTag_ScriptTagAndSpan()
             s.Sanitize("<span>Hi</span><script>alert('Hello world!')</script>");
             Assert.That(actual, Is.EqualTo(RemoveReason.NotAllowedTag));
         }
+
+        [Test]
+        public void DocumentTest()
+        {
+            var s = new HtmlSanitizer();
+            s.AllowedTags.Add("title");
+            var html = "<html><head><title>Test</title></head><body><div>Test</div></body></html>";
+
+            var actual = s.SanitizeDocument(html);
+
+            Assert.That(actual, Is.EqualTo(html));
+        }
+
+        [Test]
+        public void DocumentFromFragmentTest()
+        {
+            var s = new HtmlSanitizer();
+            var html = "<div>Test</div>";
+
+            var actual = s.SanitizeDocument(html);
+
+            Assert.That(actual, Is.EqualTo("<html><head></head><body><div>Test</div></body></html>"));
+        }
+
+        [Test]
+        public void FragmentFromDocumentTest()
+        {
+            var s = new HtmlSanitizer();
+            var html = "<html><head><title>Test</title></head><body><div>Test</div></body></html>";
+
+            var actual = s.Sanitize(html);
+
+            Assert.That(actual, Is.EqualTo("<div>Test</div>"));
+        }
     }
 }
 

diff --git a/HtmlSanitizer/HtmlSanitizer.cs b/HtmlSanitizer/HtmlSanitizer.cs
@@ -15,19 +15,18 @@
 namespace Ganss.XSS
 {
     /// <summary>
-    /// Cleans HTML fragments from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
+    /// Cleans HTML documents and fragments from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
     /// </summary>
     /// <remarks>
-    /// XSS attacks can occur at several levels within an HTML fragment:
+    /// XSS attacks can occur at several levels within an HTML document or fragment:
     /// <list type="bullet">
     /// <item>HTML Tags (e.g. the &lt;script&gt; tag)</item>
     /// <item>HTML attributes (e.g. the "onload" attribute)</item>
     /// <item>CSS styles (url property values)</item>
     /// <item>malformed HTML or HTML that exploits parser bugs in specific browsers</item>
     /// </list>
     /// <para>
-    /// The HtmlSanitizer class addresses all of these possible attack vectors by using an HTML parser that is based on the one used
-    /// in the Gecko browser engine (see <a href="https://github.com/jamietre/CsQuery">CsQuery</a>).
+    /// The HtmlSanitizer class addresses all of these possible attack vectors by using a sophisticated HTML parser (<a href="https://github.com/AngleSharp/AngleSharp">AngleSharp</a>).
     /// </para>
     /// <para>
     /// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above:
@@ -115,7 +114,9 @@ public HtmlSanitizer(IEnumerable<string> allowedTags = null, IEnumerable<string>
             // Forms
             "datalist", "keygen", "output", "progress", "meter",
             // Interactive elements
-            "details", "summary", "menuitem"
+            "details", "summary", "menuitem",
+            // document elements
+            "html", "head", "body"
         };
 
         /// <summary>
@@ -297,6 +298,11 @@ protected virtual void OnRemovingStyle(RemovingStyleEventArgs e)
         /// </summary>
         public static readonly Regex DefaultDisallowedCssPropertyValue = new Regex(@"[<>]", RegexOptions.Compiled);
 
+        /// <summary>
+        /// Returns all nested subnodes of a node.
+        /// </summary>
+        /// <param name="dom">The root node.</param>
+        /// <returns>All nested subnodes.</returns>
         private static IEnumerable<INode> GetAllNodes(INode dom)
         {
             if (dom == null) yield break;
@@ -312,31 +318,78 @@ private static IEnumerable<INode> GetAllNodes(INode dom)
         }
 
         /// <summary>
-        /// Sanitizes the specified HTML.
+        /// Sanitizes the specified HTML body fragment. If a document is given, only the body part will be returned.
         /// </summary>
-        /// <param name="html">The HTML to sanitize.</param>
+        /// <param name="html">The HTML body fragment to sanitize.</param>
         /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
-        /// <param name="outputFormatter">The CsQuery output formatter used to render the DOM. Using the default formatter if null.</param>
-        /// <returns>The sanitized HTML.</returns>
+        /// <param name="outputFormatter">The formatter used to render the DOM. Using the default formatter if null.</param>
+        /// <returns>The sanitized HTML body fragment.</returns>
         public string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null)
         {
-            var parser = new HtmlParser(new Configuration().WithCss(e => e.Options = new CssParserOptions
+            var parser = CreateParser();
+            var dom = parser.Parse("<body>" + html + "</body>");
+
+            DoSanitize(dom, dom.Body, baseUrl, outputFormatter);
+
+            var output = dom.Body.ChildNodes.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance);
+
+            return output;
+        }
+
+        /// <summary>
+        /// Sanitizes the specified HTML document. Even if only a fragment is given, a whole document will be returned.
+        /// </summary>
+        /// <param name="html">The HTML document to sanitize.</param>
+        /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
+        /// <param name="outputFormatter">The formatter used to render the DOM. Using the default formatter if null.</param>
+        /// <returns>The sanitized HTML document.</returns>
+        public string SanitizeDocument(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null)
+        {
+            var parser = CreateParser();
+            var dom = parser.Parse(html);
+
+            DoSanitize(dom, dom.DocumentElement, baseUrl, outputFormatter);
+
+            var output = dom.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance);
+
+            return output;
+        }
+
+        /// <summary>
+        /// Creates an instance of <see cref="HtmlParser"/>.
+        /// </summary>
+        /// <returns>An instance of <see cref="HtmlParser"/>.</returns>
+        private static HtmlParser CreateParser()
+        {
+            return new HtmlParser(new Configuration().WithCss(e => e.Options = new CssParserOptions
             {
                 IsIncludingUnknownDeclarations = true,
                 IsIncludingUnknownRules = true,
                 IsToleratingInvalidConstraints = true,
                 IsToleratingInvalidValues = true
             }));
-            var dom = parser.Parse("<body>" + html + "</body>");
+        }
 
+        /// <summary>
+        /// Removes all comment nodes from a list of nodes.
+        /// </summary>
+        /// <param name="nodes">The list of nodes.</param>
+        private static void RemoveComments(List<INode> nodes)
+        {
+            foreach (var comment in nodes.OfType<IComment>())
+                comment.Remove();
+        }
+
+        private void DoSanitize(IHtmlDocument dom, IElement context, string baseUrl = "", IMarkupFormatter outputFormatter = null)
+        {
             // remove non-whitelisted tags
-            foreach (var tag in dom.Body.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList())
+            foreach (var tag in context.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList())
             {
                 RemoveTag(tag, RemoveReason.NotAllowedTag);
             }
 
             // cleanup attributes
-            foreach (var tag in dom.Body.QuerySelectorAll("*").OfType<IHtmlElement>().ToList())
+            foreach (var tag in context.QuerySelectorAll("*").OfType<IHtmlElement>().ToList())
             {
                 // remove non-whitelisted attributes
                 foreach (var attribute in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList())
@@ -373,11 +426,20 @@ public string Sanitize(string html, string baseUrl = "", IMarkupFormatter output
                 }
             }
 
-            var nodes = GetAllNodes(dom.Body).ToList();
+            var nodes = GetAllNodes(context).ToList();
 
-            foreach (var comment in nodes.OfType<IComment>())
-                comment.Remove();
+            RemoveComments(nodes);
+
+            DoPostProcess(dom, nodes);
+        }
 
+        /// <summary>
+        /// Performs post processing on all nodes in the document.
+        /// </summary>
+        /// <param name="dom">The HTML document.</param>
+        /// <param name="nodes">The list of nodes in the document.</param>
+        private void DoPostProcess(IHtmlDocument dom, List<INode> nodes)
+        {
             if (PostProcessNode != null)
             {
                 foreach (var node in nodes)
@@ -388,10 +450,6 @@ public string Sanitize(string html, string baseUrl = "", IMarkupFormatter output
                         ((IChildNode)node).Replace(e.ReplacementNodes.ToArray());
                 }
             }
-
-            var output = dom.Body.ChildNodes.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance);
-
-            return output;
         }
 
         /// <summary>

diff --git a/HtmlSanitizer/HtmlSanitizer.nuspec b/HtmlSanitizer/HtmlSanitizer.nuspec
@@ -2,7 +2,7 @@
 <package >
   <metadata>
     <id>$id$</id>
-    <version>$version$</version>
+    <version>$version$-beta</version>
     <title>$title$</title>
     <authors>$author$</authors>
     <owners>$author$</owners>

diff --git a/HtmlSanitizer/IHtmlSanitizer.cs b/HtmlSanitizer/IHtmlSanitizer.cs
@@ -6,41 +6,9 @@
 namespace Ganss.XSS
 {
     /// <summary>
-    /// Cleans HTML fragments from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
+    /// Enables an inheriting class to implement an HtmlSanitizer class, which cleans HTML documents and fragments
+    /// from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
     /// </summary>
-    /// <remarks>
-    /// XSS attacks can occur at several levels within an HTML fragment:
-    /// <list type="bullet">
-    /// <item>HTML Tags (e.g. the &lt;script&gt; tag)</item>
-    /// <item>HTML attributes (e.g. the "onload" attribute)</item>
-    /// <item>CSS styles (url property values)</item>
-    /// <item>malformed HTML or HTML that exploits parser bugs in specific browsers</item>
-    /// </list>
-    /// <para>
-    /// The HtmlSanitizer class addresses all of these possible attack vectors by using an HTML parser that is based on the one used
-    /// in the Gecko browser engine (see <a href="https://github.com/jamietre/CsQuery">CsQuery</a>).
-    /// </para>
-    /// <para>
-    /// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above:
-    /// <list type="bullet">
-    /// <item>You can specify the allowed HTML tags through the property <see cref="AllowedTags"/>. All other tags will be stripped.</item>
-    /// <item>You can specify the allowed HTML attributes through the property <see cref="AllowedAttributes"/>. All other attributes will be stripped.</item>
-    /// <item>You can specify the allowed CSS property names through the property <see cref="AllowedCssProperties"/>. All other styles will be stripped.</item>
-    /// <item>You can specify the allowed URI schemes through the property <see cref="AllowedCssProperties"/>. All other URIs will be stripped.</item>
-    /// <item>You can specify the HTML attributes that contain URIs (such as "src", "href" etc.) through the property <see cref="UriAttributes"/>.</item>
-    /// </list>
-    /// </para>
-    /// </remarks>
-    /// <example>
-    /// <code>
-    /// <![CDATA[
-    /// var sanitizer = new HtmlSanitizer();
-    /// var html = @"<script>alert('xss')</script><div onload=""alert('xss')"" style=""background-color: test"">Test<img src=""test.gif"" style=""background-image: url(javascript:alert('xss')); margin: 10px""></div>";
-    /// var sanitized = sanitizer.Sanitize(html, "http://www.example.com");
-    /// // -> "<div style="background-color: test">Test<img style="margin: 10px" src="http://www.example.com/test.gif"></div>"
-    /// ]]>
-    /// </code>
-    /// </example>
     public interface IHtmlSanitizer
     {
         /// <summary>
@@ -121,7 +89,7 @@ public interface IHtmlSanitizer
         /// </summary>
         /// <param name="html">The HTML to sanitize.</param>
         /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
-        /// <param name="outputFormatter">The CsQuery output formatter used to render the DOM. Using the default formatter if null.</param>
+        /// <param name="outputFormatter">The formatter used to render the DOM. Using the default formatter if null.</param>
         /// <returns>The sanitized HTML.</returns>
         string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null);
     }

diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@ HtmlSanitizer
 [![Build status](https://ci.appveyor.com/api/projects/status/418bmfx643iae00c/branch/master?svg=true)](https://ci.appveyor.com/project/mganss/htmlsanitizer/branch/master)
 [![codecov.io](https://codecov.io/github/mganss/HtmlSanitizer/coverage.svg?branch=master)](https://codecov.io/github/mganss/HtmlSanitizer?branch=master)
 
-HtmlSanitizer is a .NET library for cleaning HTML fragments from constructs that can lead to [XSS attacks](https://en.wikipedia.org/wiki/Cross-site_scripting).
+HtmlSanitizer is a .NET library for cleaning HTML fragments and documents from constructs that can lead to [XSS attacks](https://en.wikipedia.org/wiki/Cross-site_scripting).
 It uses [AngleSharp](https://github.com/AngleSharp/AngleSharp) to parse, manipulate, and render HTML and CSS.
 
 Because HtmlSanitizer is based on a robust HTML parser it can also shield you from deliberate or accidental

diff --git a/appveyor.yml b/appveyor.yml
@@ -1,4 +1,4 @@
-version: 3.1.{build}
+version: 3.2.{build}
 install:
   - nuget restore
 configuration: Release