Skip to content

Commit

Permalink
isso: html.py: Make <code class="language-$lang"> for syntax highlighting
Browse files Browse the repository at this point in the history

- Updated the HTML sanitization process to allow specific attributes for different tags.

- Modified the blockcode function to add 'language-' prefix to the class attribute for code tags, improving compatibility with Highlight.js.

Closes isso-comments#177
  • Loading branch information
pkvach committed Mar 12, 2024
1 parent 90aa041 commit 3ddf086
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 5 deletions.
4 changes: 3 additions & 1 deletion isso/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def test_sanitizer(self):
('ld.so', 'ld.so'),
('/usr/lib/x86_64-linux-gnu/libc/memcpy-preload.so', '/usr/lib/x86_64-linux-gnu/libc/memcpy-preload.so'),
('<p style="visibility: hidden;">Test</p>', '<p>Test</p>'),
('<code class="language-cpp">Test</code>', '<code class="language-cpp">Test</code>'),
('<code class="test language-cpp">Test</code>', '<code>Test</code>'),
('<script>alert("Onoe")</script>', 'alert("Onoe")')]

for (input, expected) in examples:
Expand Down Expand Up @@ -122,7 +124,7 @@ def test_code_blocks(self):
convert = html.Markdown(extensions=('fenced-code',))
examples = [
("```\nThis is a code-fence. <hello>\n```", "<p><pre><code>This is a code-fence. &lt;hello&gt;\n</code></pre></p>"),
("```c++\nThis is a code-fence. <hello>\n```", "<p><pre><code class=\"c++\">This is a code-fence. &lt;hello&gt;\n</code></pre></p>"),
("```cpp\nThis is a code-fence. <hello>\n```", "<p><pre><code class=\"language-cpp\">This is a code-fence. &lt;hello&gt;\n</code></pre></p>"),
(" This is a four-character indent. <hello>", "<p><pre><code>This is a four-character indent. &lt;hello&gt;\n</code></pre></p>")]

for (input, expected) in examples:
Expand Down
21 changes: 17 additions & 4 deletions isso/utils/html.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
# -*- encoding: utf-8 -*-

import html
import re

import bleach
import misaka


class Sanitizer(object):

# pattern to match a valid class attribute for code tags
code_language_pattern = re.compile(r"^language-[a-zA-Z0-9]{1,20}$")

@staticmethod
def allow_attribute_class(tag, name, value):
    """Decide whether an attribute on a ``<code>`` tag may be kept.

    Registered as the attribute filter for ``code`` in the attribute map
    that is handed to ``bleach.clean``.

    :param tag: name of the tag being inspected (not used by the check).
    :param name: attribute name.
    :param value: attribute value.
    :return: ``True`` only for a ``class`` attribute whose value is exactly
        one ``language-<name>`` token as described by
        ``Sanitizer.code_language_pattern``; ``False`` otherwise.
    """
    if name != "class":
        return False
    # fullmatch instead of match: with match(), the trailing ``$`` in the
    # pattern also accepts a value that ends in a newline
    # (e.g. "language-cpp\n"), which is not the exact single-class form.
    return Sanitizer.code_language_pattern.fullmatch(value) is not None

def __init__(self, elements, attributes):
# attributes found in Sundown's HTML serializer [1]
# - except for <img> tag, because images are not generated anyways.
Expand All @@ -20,8 +28,13 @@ def __init__(self, elements, attributes):
"h1", "h2", "h3", "h4", "h5", "h6", "sub", "sup",
"table", "thead", "tbody", "th", "td"] + elements

# href for <a> and align for <table>
self.attributes = ["align", "href"] + attributes
# allowed attributes for tags
self.attributes = {
"table": ["align"],
"a": ["href"],
"code": Sanitizer.allow_attribute_class,
"*": attributes
}

def sanitize(self, text):
clean_html = bleach.clean(text, tags=self.elements, attributes=self.attributes, strip=True)
Expand Down Expand Up @@ -73,11 +86,11 @@ class Unofficial(misaka.HtmlRenderer):
For instance, fenced code blocks (~~~ or ```) are just wrapped in <code>
which does not preserve line breaks. If a language is given, it is added
to <code class="$lang">, compatible with Highlight.js.
to <code class="language-$lang">, compatible with Highlight.js.
"""

def blockcode(self, text, lang):
    """Render a code block as ``<pre><code>...</code></pre>``.

    :param text: raw content of the code block; HTML-escaped here with
        quotes left untouched.
    :param lang: language given on the code fence, or an empty value when
        none was given.
    :return: the HTML snippet, terminated by a newline. When a language is
        present the ``<code>`` tag carries ``class="language-<lang>"``,
        the form Highlight.js expects.
    """
    # Build the class attribute exactly once. Formatting the language into
    # a plain class="<lang>" first and then into the prefixed form would
    # escape and wrap the already-built attribute a second time.
    css_class = ' class="language-{0}"'.format(html.escape(lang)) if lang else ''
    return "<pre><code{1}>{0}</code></pre>\n".format(html.escape(text, False), css_class)


Expand Down

0 comments on commit 3ddf086

Please sign in to comment.