Skip to content

Commit

Permalink
Merge pull request #19 from mozilla/issue18
Browse files Browse the repository at this point in the history
Fixed #18 (Some sentences contain HTML).
  • Loading branch information
kdavis-mozilla authored Dec 13, 2018
2 parents 4d4571e + 642441b commit 292dad6
Showing 1 changed file with 38 additions and 0 deletions.
38 changes: 38 additions & 0 deletions src/corporacreator/preprocessors/common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,39 @@
from html.parser import HTMLParser

class _HTMLStripper(HTMLParser):
    """Parser that discards HTML markup and keeps only the text content.

    Every run of text reported to ``handle_data`` is appended to ``self.fed``;
    ``get_data`` returns those fragments joined into a single string.

    Examples:
        >>> stripper = _HTMLStripper()
        >>> stripper.feed('<b>Call</b> AT&T')
        >>> stripper.get_data()
        'Call AT&T'
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset() is
        # needed here. convert_charrefs=True makes the parser decode character
        # references (e.g. "&amp;") before handing text to handle_data.
        super().__init__(convert_charrefs=True)
        # Text fragments in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found outside of tags.

        Args:
            d (str): Text passed in by the parser.
        """
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far with the markup removed.

        With ``convert_charrefs`` enabled, ``feed`` holds back trailing text
        that contains an ``&`` not yet followed by whitespace or ``;`` (it
        might be a character reference cut in half). ``close`` forces that
        buffered text through ``handle_data`` so it is not silently dropped.

        Returns:
            (str): Concatenation of every text fragment seen.
        """
        self.close()
        return ''.join(self.fed)

def _strip_tags(html):
    """Removes HTML tags from passed text.

    Character references such as ``&amp;`` are decoded by the parser, so the
    result contains the plain characters they stand for.

    Args:
        html (str): String containing HTML

    Returns:
        (str): String with HTML removed
    """
    s = _HTMLStripper()
    s.feed(html)
    # feed() may keep trailing text buffered (e.g. text ending in "AT&T",
    # where the "&" could be the start of a character reference); close()
    # flushes it so the end of the sentence is not lost.
    s.close()
    return s.get_data()

def common(sentence):
"""Cleans up the passed sentence in a language independent manner, removing or reformatting invalid data.
Expand All @@ -7,5 +43,7 @@ def common(sentence):
Returns:
(str): Cleaned up sentence.
"""
# Remove any HTML tags
sentence = _strip_tags(sentence)
# TODO: Clean up data in a language independent manner
return sentence

0 comments on commit 292dad6

Please sign in to comment.