Skip to content

Commit

Permalink
Merge branch 'document'
Browse files Browse the repository at this point in the history
  • Loading branch information
mganss committed Mar 21, 2016
2 parents af609b5 + 2f23ab4 commit 4e3a647
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 60 deletions.
37 changes: 35 additions & 2 deletions HtmlSanitizer.Tests/Tests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2202,8 +2202,7 @@ public void RussianTextTest()

// Act
var htmlFragment = "Тест";
//var outputFormatter = new CsQuery.Output.FormatDefault(DomRenderingOptions.RemoveComments | DomRenderingOptions.QuoteAllAttributes, HtmlEncoders.Minimum);
var actual = s.Sanitize(htmlFragment, ""/*, outputFormatter*/);
var actual = s.Sanitize(htmlFragment, "");

// Assert
var expected = htmlFragment;
Expand Down Expand Up @@ -2487,6 +2486,40 @@ public void RemoveEventForNotAllowedTag_ScriptTagAndSpan()
s.Sanitize("<span>Hi</span><script>alert('Hello world!')</script>");
Assert.That(actual, Is.EqualTo(RemoveReason.NotAllowedTag));
}

[Test]
public void DocumentTest()
{
    // Arrange
    // A complete document whose <title> element is explicitly allowed below.
    const string document = "<html><head><title>Test</title></head><body><div>Test</div></body></html>";
    var sanitizer = new HtmlSanitizer();
    sanitizer.AllowedTags.Add("title");

    // Act
    var sanitized = sanitizer.SanitizeDocument(document);

    // Assert
    // The document is expected to come back unchanged.
    Assert.That(sanitized, Is.EqualTo(document));
}

[Test]
public void DocumentFromFragmentTest()
{
    // Arrange
    const string fragment = "<div>Test</div>";
    const string expectedDocument = "<html><head></head><body><div>Test</div></body></html>";
    var sanitizer = new HtmlSanitizer();

    // Act
    var sanitized = sanitizer.SanitizeDocument(fragment);

    // Assert
    // A bare fragment passed to SanitizeDocument is expected to be wrapped in a full document skeleton.
    Assert.That(sanitized, Is.EqualTo(expectedDocument));
}

[Test]
public void FragmentFromDocumentTest()
{
    // Arrange
    const string document = "<html><head><title>Test</title></head><body><div>Test</div></body></html>";
    const string expectedFragment = "<div>Test</div>";
    var sanitizer = new HtmlSanitizer();

    // Act
    var sanitized = sanitizer.Sanitize(document);

    // Assert
    // Sanitize is expected to return only the body content, even for a full document.
    Assert.That(sanitized, Is.EqualTo(expectedFragment));
}
}
}

Expand Down
98 changes: 78 additions & 20 deletions HtmlSanitizer/HtmlSanitizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,18 @@
namespace Ganss.XSS
{
/// <summary>
/// Cleans HTML fragments from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
/// Cleans HTML documents and fragments from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
/// </summary>
/// <remarks>
/// XSS attacks can occur at several levels within an HTML fragment:
/// XSS attacks can occur at several levels within an HTML document or fragment:
/// <list type="bullet">
/// <item>HTML Tags (e.g. the &lt;script&gt; tag)</item>
/// <item>HTML attributes (e.g. the "onload" attribute)</item>
/// <item>CSS styles (url property values)</item>
/// <item>malformed HTML or HTML that exploits parser bugs in specific browsers</item>
/// </list>
/// <para>
/// The HtmlSanitizer class addresses all of these possible attack vectors by using an HTML parser that is based on the one used
/// in the Gecko browser engine (see <a href="https://github.com/jamietre/CsQuery">CsQuery</a>).
/// The HtmlSanitizer class addresses all of these possible attack vectors by using a sophisticated HTML parser (<a href="https://github.com/AngleSharp/AngleSharp">AngleSharp</a>).
/// </para>
/// <para>
/// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above:
Expand Down Expand Up @@ -115,7 +114,9 @@ public HtmlSanitizer(IEnumerable<string> allowedTags = null, IEnumerable<string>
// Forms
"datalist", "keygen", "output", "progress", "meter",
// Interactive elements
"details", "summary", "menuitem"
"details", "summary", "menuitem",
// document elements
"html", "head", "body"
};

/// <summary>
Expand Down Expand Up @@ -297,6 +298,11 @@ protected virtual void OnRemovingStyle(RemovingStyleEventArgs e)
/// </summary>
public static readonly Regex DefaultDisallowedCssPropertyValue = new Regex(@"[<>]", RegexOptions.Compiled);

/// <summary>
/// Return all nested subnodes of a node.
/// </summary>
/// <param name="dom">The root node.</param>
/// <returns>All nested subnodes.</returns>
private static IEnumerable<INode> GetAllNodes(INode dom)
{
if (dom == null) yield break;
Expand All @@ -312,31 +318,78 @@ private static IEnumerable<INode> GetAllNodes(INode dom)
}

/// <summary>
/// Sanitizes the specified HTML.
/// Sanitizes the specified HTML body fragment. If a document is given, only the body part will be returned.
/// </summary>
/// <param name="html">The HTML to sanitize.</param>
/// <param name="html">The HTML body fragment to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The CsQuery output formatter used to render the DOM. Using the default formatter if null.</param>
/// <returns>The sanitized HTML.</returns>
/// <param name="outputFormatter">The formatter used to render the DOM. Using the default formatter if null.</param>
/// <returns>The sanitized HTML body fragment.</returns>
public string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null)
{
var parser = new HtmlParser(new Configuration().WithCss(e => e.Options = new CssParserOptions
var parser = CreateParser();
var dom = parser.Parse("<body>" + html + "</body>");

DoSanitize(dom, dom.Body, baseUrl, outputFormatter);

var output = dom.Body.ChildNodes.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance);

return output;
}

/// <summary>
/// Sanitizes a complete HTML document. If only a fragment is supplied, the result is still a whole document.
/// </summary>
/// <param name="html">The HTML document to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The formatter used to render the DOM. <see cref="HtmlMarkupFormatter.Instance"/> is used if null.</param>
/// <returns>The sanitized HTML document.</returns>
public string SanitizeDocument(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null)
{
    // Parse the input as-is (no <body> wrapper), so the whole document structure is kept.
    var document = CreateParser().Parse(html);

    // NOTE(review): DoSanitize enumerates context.QuerySelectorAll("*"), which by DOM
    // semantics yields descendants only. The root <html> element's own attributes
    // therefore appear not to be checked here -- confirm and consider passing the
    // document itself as the context.
    DoSanitize(document, document.DocumentElement, baseUrl, outputFormatter);

    // Serialize the entire document rather than only the body's children.
    return document.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance);
}

/// <summary>
/// Creates an instance of <see cref="HtmlParser"/>.
/// </summary>
/// <returns>An instance of <see cref="HtmlParser"/>.</returns>
private static HtmlParser CreateParser()
{
return new HtmlParser(new Configuration().WithCss(e => e.Options = new CssParserOptions
{
IsIncludingUnknownDeclarations = true,
IsIncludingUnknownRules = true,
IsToleratingInvalidConstraints = true,
IsToleratingInvalidValues = true
}));
var dom = parser.Parse("<body>" + html + "</body>");
}

/// <summary>
/// Detaches every comment node found in the given list by calling its Remove method.
/// </summary>
/// <param name="nodes">The nodes to scan; only entries that are <see cref="IComment"/> are removed.</param>
private static void RemoveComments(List<INode> nodes)
{
    // Only the comment entries are touched; the list itself is not modified.
    var comments = nodes.OfType<IComment>();

    foreach (var commentNode in comments)
    {
        commentNode.Remove();
    }
}

private void DoSanitize(IHtmlDocument dom, IElement context, string baseUrl = "", IMarkupFormatter outputFormatter = null)
{
// remove non-whitelisted tags
foreach (var tag in dom.Body.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList())
foreach (var tag in context.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList())
{
RemoveTag(tag, RemoveReason.NotAllowedTag);
}

// cleanup attributes
foreach (var tag in dom.Body.QuerySelectorAll("*").OfType<IHtmlElement>().ToList())
foreach (var tag in context.QuerySelectorAll("*").OfType<IHtmlElement>().ToList())
{
// remove non-whitelisted attributes
foreach (var attribute in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList())
Expand Down Expand Up @@ -373,11 +426,20 @@ public string Sanitize(string html, string baseUrl = "", IMarkupFormatter output
}
}

var nodes = GetAllNodes(dom.Body).ToList();
var nodes = GetAllNodes(context).ToList();

foreach (var comment in nodes.OfType<IComment>())
comment.Remove();
RemoveComments(nodes);

DoPostProcess(dom, nodes);
}

/// <summary>
/// Performs post processing on all nodes in the document.
/// </summary>
/// <param name="dom">The HTML document.</param>
/// <param name="nodes">The list of nodes in the document.</param>
private void DoPostProcess(IHtmlDocument dom, List<INode> nodes)
{
if (PostProcessNode != null)
{
foreach (var node in nodes)
Expand All @@ -388,10 +450,6 @@ public string Sanitize(string html, string baseUrl = "", IMarkupFormatter output
((IChildNode)node).Replace(e.ReplacementNodes.ToArray());
}
}

var output = dom.Body.ChildNodes.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance);

return output;
}

/// <summary>
Expand Down
2 changes: 1 addition & 1 deletion HtmlSanitizer/HtmlSanitizer.nuspec
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<package >
<metadata>
<id>$id$</id>
<version>$version$</version>
<version>$version$-beta</version>
<title>$title$</title>
<authors>$author$</authors>
<owners>$author$</owners>
Expand Down
38 changes: 3 additions & 35 deletions HtmlSanitizer/IHtmlSanitizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,41 +6,9 @@
namespace Ganss.XSS
{
/// <summary>
/// Cleans HTML fragments from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
/// Enables an inheriting class to implement an HtmlSanitizer class, which cleans HTML documents and fragments
/// from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
/// </summary>
/// <remarks>
/// XSS attacks can occur at several levels within an HTML fragment:
/// <list type="bullet">
/// <item>HTML Tags (e.g. the &lt;script&gt; tag)</item>
/// <item>HTML attributes (e.g. the "onload" attribute)</item>
/// <item>CSS styles (url property values)</item>
/// <item>malformed HTML or HTML that exploits parser bugs in specific browsers</item>
/// </list>
/// <para>
/// The HtmlSanitizer class addresses all of these possible attack vectors by using an HTML parser that is based on the one used
/// in the Gecko browser engine (see <a href="https://github.com/jamietre/CsQuery">CsQuery</a>).
/// </para>
/// <para>
/// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above:
/// <list type="bullet">
/// <item>You can specify the allowed HTML tags through the property <see cref="AllowedTags"/>. All other tags will be stripped.</item>
/// <item>You can specify the allowed HTML attributes through the property <see cref="AllowedAttributes"/>. All other attributes will be stripped.</item>
/// <item>You can specify the allowed CSS property names through the property <see cref="AllowedCssProperties"/>. All other styles will be stripped.</item>
/// <item>You can specify the allowed URI schemes through the property <see cref="AllowedCssProperties"/>. All other URIs will be stripped.</item>
/// <item>You can specify the HTML attributes that contain URIs (such as "src", "href" etc.) through the property <see cref="UriAttributes"/>.</item>
/// </list>
/// </para>
/// </remarks>
/// <example>
/// <code>
/// <![CDATA[
/// var sanitizer = new HtmlSanitizer();
/// var html = @"<script>alert('xss')</script><div onload=""alert('xss')"" style=""background-color: test"">Test<img src=""test.gif"" style=""background-image: url(javascript:alert('xss')); margin: 10px""></div>";
/// var sanitized = sanitizer.Sanitize(html, "http://www.example.com");
/// // -> "<div style="background-color: test">Test<img style="margin: 10px" src="http://www.example.com/test.gif"></div>"
/// ]]>
/// </code>
/// </example>
public interface IHtmlSanitizer
{
/// <summary>
Expand Down Expand Up @@ -121,7 +89,7 @@ public interface IHtmlSanitizer
/// </summary>
/// <param name="html">The HTML to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The CsQuery output formatter used to render the DOM. Using the default formatter if null.</param>
/// <param name="outputFormatter">The formatter used to render the DOM. Using the default formatter if null.</param>
/// <returns>The sanitized HTML.</returns>
string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null);
}
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ HtmlSanitizer
[![Build status](https://ci.appveyor.com/api/projects/status/418bmfx643iae00c/branch/master?svg=true)](https://ci.appveyor.com/project/mganss/htmlsanitizer/branch/master)
[![codecov.io](https://codecov.io/github/mganss/HtmlSanitizer/coverage.svg?branch=master)](https://codecov.io/github/mganss/HtmlSanitizer?branch=master)

HtmlSanitizer is a .NET library for cleaning HTML fragments from constructs that can lead to [XSS attacks](https://en.wikipedia.org/wiki/Cross-site_scripting).
HtmlSanitizer is a .NET library for cleaning HTML fragments and documents from constructs that can lead to [XSS attacks](https://en.wikipedia.org/wiki/Cross-site_scripting).
It uses [AngleSharp](https://github.com/AngleSharp/AngleSharp) to parse, manipulate, and render HTML and CSS.

Because HtmlSanitizer is based on a robust HTML parser it can also shield you from deliberate or accidental
Expand Down
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: 3.1.{build}
version: 3.2.{build}
install:
- nuget restore
configuration: Release
Expand Down

0 comments on commit 4e3a647

Please sign in to comment.