Skip to content

Commit

Permalink
Merge pull request #54 from simnotes/de_quotecleanup
Browse files Browse the repository at this point in the history
Removed unnecessary quotation marks from de-lang sentences
  • Loading branch information
kdavis-mozilla authored Jan 28, 2019
2 parents 1da027b + db5f902 commit 63f9be1
Showing 1 changed file with 34 additions and 0 deletions.
34 changes: 34 additions & 0 deletions src/corporacreator/preprocessors/de.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,35 @@
import re
QUOTE_PATTERN = re.compile(r'^\"{3}(.*)\"{2}(.*)\"{1}$')
QUOTE_PATTERN_2 = re.compile(r'^\"{1}(.*)\"{2}(.*)\"{2}(.*)\"{1}$')
QUOTE_PATTERN_3 = re.compile(r'^\"{1}(.*)\"{1}$')

def _change_multi_quotes(sentence):
"""Changes all quotes from patterns like
[\"""content""content"] to ["content"content] or
["content""content""content"] to [content"content"content] or
["content" to content]
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence. Returns the sentence 'as-is', if matching
did not work as expected
"""
matches = QUOTE_PATTERN.match(sentence) # pattern: \"\"\"content\"\"content\"
matches2 = QUOTE_PATTERN_2.match(sentence) # pattern: \"content\"\"content\"\"content\"
matches3 = QUOTE_PATTERN_3.match(sentence) # patter: \"content\"

if matches != None and matches.lastindex == 2:
return "\"{}\"{}".format(matches.group(1), matches.group(2))
elif matches2 != None and matches2.lastindex == 3:
return "{}\"{}\"{}".format(matches2.group(1), matches2.group(2), matches2.group(3))
elif matches3 != None and matches3.lastindex == 1:
return "{}".format(matches3.group(1))

return sentence


def de(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Expand All @@ -8,5 +40,7 @@ def de(client_id, sentence):
Returns:
(str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
"""
sentence = _change_multi_quotes(sentence)

# TODO: Clean up de data
return sentence

0 comments on commit 63f9be1

Please sign in to comment.