Skip to content

Commit

Permalink
Merge pull request #21 from mozilla/issue20
Browse files Browse the repository at this point in the history
Fixed #20 (Allow the common preprocessor to reject sentences)
  • Loading branch information
kdavis-mozilla authored Dec 14, 2018
2 parents 2b80f8c + 4d445cc commit 89f523f
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion src/corporacreator/corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
_logger = logging.getLogger(__name__)


def common_wrapper(sentence, up_votes, down_votes):
    """Apply the ``common`` preprocessor to a sentence and flag unusable results.

    The sentence is passed through ``common``. When the result is ``None`` or
    contains only whitespace, the vote counts are overwritten with 0 up-votes
    and 2 down-votes; otherwise the incoming vote counts are passed through
    untouched.

    Args:
        sentence: Sentence text handed to ``common``.
        up_votes: Up-vote count associated with the sentence.
        down_votes: Down-vote count associated with the sentence.

    Returns:
        pandas.Series: ``[sentence, up_votes, down_votes]`` where ``sentence``
        is the value returned by ``common``.
    """
    sentence = common(sentence)
    # Identity test for the None singleton; an empty or whitespace-only
    # result is treated the same way as None.
    if sentence is None or not sentence.strip():
        # NOTE(review): 0 up / 2 down presumably makes downstream vote-based
        # filtering treat the row as rejected -- confirm against the consumer.
        up_votes = 0
        down_votes = 2
    return pd.Series([sentence, up_votes, down_votes])


class Corpora:
"""Corpora representing all Common Voice datasets.
Expand All @@ -21,6 +29,7 @@ class Corpora:
args ([str]): command line parameters as list of strings
corpora ([:class:`corporacreator.Corpus`]): List of :class:`corporacreator.Corpus` instances
"""

def __init__(self, args):
self.args = args
self.corpora = []
Expand All @@ -30,7 +39,9 @@ def create(self):
"""
_logger.info("Creating corpora...")
corpora_data = self._parse_tsv()
corpora_data["sentence"] = corpora_data["sentence"].apply(func=common)
corpora_data[["sentence", "up_votes", "down_votes"]] = corpora_data[
["sentence", "up_votes", "down_votes"]
].apply(func=lambda arg: common_wrapper(*arg), axis=1)
for locale in corpora_data.locale.unique():
_logger.info("Selecting %s corpus data..." % locale)
corpus_data = corpora_data.loc[
Expand Down

0 comments on commit 89f523f

Please sign in to comment.