Skip to content

Commit

Permalink
Merge pull request #90 from mozilla/issue89
Browse files Browse the repository at this point in the history
Fixed #89 (Mark as invalid sentences with digits)
  • Loading branch information
kdavis-mozilla authored Feb 25, 2019
2 parents 1d9be5e + e621aea commit 32da1f1
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 4 deletions.
4 changes: 2 additions & 2 deletions src/corporacreator/corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@


def common_wrapper(sentence, up_votes, down_votes):
    """Clean a sentence with `common` and override its votes when it is invalid.

    Args:
      sentence (str): Sentence to be cleaned up.
      up_votes (int): Up-vote count associated with the sentence.
      down_votes (int): Down-vote count associated with the sentence.

    Returns:
      (pd.Series): `[sentence, up_votes, down_votes]`, where `sentence` is the
        cleaned text returned by `common`. If `common` reports the sentence
        as invalid, the votes are replaced by 0 up-votes and 2 down-votes.
    """
    is_valid, sentence = common(sentence)
    if not is_valid:
        # Invalid sentences get a fixed 0/2 vote pattern
        # (presumably so later stages treat them as rejected -- confirm against callers).
        up_votes = 0
        down_votes = 2
    return pd.Series([sentence, up_votes, down_votes])
Expand Down
19 changes: 17 additions & 2 deletions src/corporacreator/preprocessors/common.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import re
import unicodedata

from urllib.parse import unquote
from html.parser import HTMLParser


RE_DIGITS = re.compile('\d')

def _has_digit(sentence):
return RE_DIGITS.search(sentence)


class _HTMLStripper(HTMLParser):
"""Class that strips HTML from strings.
Expand Down Expand Up @@ -66,9 +73,11 @@ def common(sentence):
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
(is_valid,str): A boolean indicating validity and cleaned up sentence.
"""

# Define a boolean indicating validity
is_valid = True
# Decode any URL encoded elements of sentence
sentence = unquote(sentence)
# Remove any HTML tags
Expand All @@ -78,4 +87,10 @@ def common(sentence):
# collapse all whitespace and replace with single space
sentence = (' ').join(sentence.split())
# TODO: Clean up data in a language independent manner
return sentence
# If the sentence contains digits reject it
if _has_digit(sentence):
is_valid = False
# If the sentence is blank reject it
if not sentence.strip():
is_valid = False
return (is_valid, sentence)

0 comments on commit 32da1f1

Please sign in to comment.