Skip to content

Commit

Permalink
Fixed #7 (There is no language independent preprocessor)
Browse files: browse the repository at this point in the history
  • Loading branch information
kdavis-mozilla committed Dec 11, 2018
1 parent c8e7ebd commit 4565088
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/corporacreator/corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd

from corporacreator import Corpus
from corporacreator.preprocessors import common


_logger = logging.getLogger(__name__)
Expand All @@ -29,6 +30,7 @@ def create(self):
"""
_logger.info("Creating corpora...")
corpora_data = self._parse_tsv()
corpora_data["sentence"] = corpora_data["sentence"].apply(func=common)
for locale in corpora_data.locale.unique():
_logger.info("Selecting %s corpus data..." % locale)
corpus_data = corpora_data.loc[
Expand Down
1 change: 1 addition & 0 deletions src/corporacreator/preprocessors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
from .common import common
from .br import br
from .ca import ca
from .cv import cv
Expand Down
11 changes: 11 additions & 0 deletions src/corporacreator/preprocessors/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
def common(sentence):
    """Clean up the passed sentence in a language independent manner.

    Intended to remove or reformat invalid data regardless of the sentence's
    language. No cleanup is implemented yet, so the sentence is currently
    returned unchanged.

    Args:
      sentence (str): Sentence to be cleaned up.

    Returns:
      (str): Cleaned up sentence.
    """
    # TODO: Clean up data in a language independent manner
    return sentence

0 comments on commit 4565088

Please sign in to comment.