From 7202ba0a5006ffbb0e7072d046374cd918ef3a2e Mon Sep 17 00:00:00 2001 From: Simon Streubel Date: Thu, 24 Jan 2019 15:05:00 +0100 Subject: [PATCH 1/3] removed unnecessary quotation marks from de-lang sentences --- src/corporacreator/preprocessors/de.py | 34 ++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/corporacreator/preprocessors/de.py b/src/corporacreator/preprocessors/de.py index 075be36..6f8449f 100644 --- a/src/corporacreator/preprocessors/de.py +++ b/src/corporacreator/preprocessors/de.py @@ -1,3 +1,35 @@ +import re +QUOTE_PATTERN = re.compile(r'^\"{3}(.*)\"{2}(.*)\"{1}$') +QUOTE_PATTERN_2 = re.compile(r'^\"{1}(.*)\"{2}(.*)\"{2}(.*)\"{1}$') +QUOTE_PATTERN_3 = re.compile(r'^\"{1}(.*)\"{1}') + +def _remove_multi_quotes(sentence): + """Removes all quotes from patterns like + \"""content""content" or + "content""content""content" or + "content" + + Args: + sentence (str): Sentence to be cleaned up. + + Returns: + (str): Cleaned up sentence. Returns the sentence 'as-is', if matching + did not work as expected + """ + matches = QUOTE_PATTERN.match(sentence) # pattern: \"\"\"content\"\"content\" + matches2 = QUOTE_PATTERN_2.match(sentence) # pattern: \"content\"\"content\"\"content\" + matches3 = QUOTE_PATTERN_3.match(sentence) # patter: \"content\" + + if matches != None and matches.lastindex == 2: + return "{}{}".format(matches.group(1), matches.group(2)) + elif matches2 != None and matches2.lastindex == 3: + return "{}{}{}".format(matches2.group(1), matches2.group(2), matches2.group(3)) + elif matches3 != None and matches3.lastindex == 1: + return "{}".format(matches3.group(1)) + + return sentence + + def de(client_id, sentence): """Cleans up the passed sentence, removing or reformatting invalid data. @@ -8,5 +40,7 @@ def de(client_id, sentence): Returns: (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid. 
""" + sentence = _remove_multi_quotes(sentence) + # TODO: Clean up de data return sentence From 82b6d79bf560be717a82ef7a6c4a162796fb66c6 Mon Sep 17 00:00:00 2001 From: Simon Streubel Date: Mon, 28 Jan 2019 10:08:54 +0100 Subject: [PATCH 2/3] changed quote-replacement behaviour of pattern "content""content""content" to content"content"content --- src/corporacreator/preprocessors/de.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/corporacreator/preprocessors/de.py b/src/corporacreator/preprocessors/de.py index 6f8449f..7b58792 100644 --- a/src/corporacreator/preprocessors/de.py +++ b/src/corporacreator/preprocessors/de.py @@ -3,11 +3,11 @@ QUOTE_PATTERN_2 = re.compile(r'^\"{1}(.*)\"{2}(.*)\"{2}(.*)\"{1}$') QUOTE_PATTERN_3 = re.compile(r'^\"{1}(.*)\"{1}') -def _remove_multi_quotes(sentence): - """Removes all quotes from patterns like - \"""content""content" or - "content""content""content" or - "content" +def _change_multi_quotes(sentence): + """Changes all quotes from patterns like + [\"""content""content"] to [content content] or + ["content""content""content"] to [content "content" content] or + ["content" to content] Args: sentence (str): Sentence to be cleaned up. @@ -23,7 +23,7 @@ def _remove_multi_quotes(sentence): if matches != None and matches.lastindex == 2: return "{}{}".format(matches.group(1), matches.group(2)) elif matches2 != None and matches2.lastindex == 3: - return "{}{}{}".format(matches2.group(1), matches2.group(2), matches2.group(3)) + return "{}\"{}\"{}".format(matches2.group(1), matches2.group(2), matches2.group(3)) elif matches3 != None and matches3.lastindex == 1: return "{}".format(matches3.group(1)) @@ -40,7 +40,7 @@ def de(client_id, sentence): Returns: (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid. 
""" - sentence = _remove_multi_quotes(sentence) + sentence = _change_multi_quotes(sentence) # TODO: Clean up de data return sentence From db5f9023edd48d798eb862cc98ce12ced71af7c6 Mon Sep 17 00:00:00 2001 From: Simon Streubel Date: Mon, 28 Jan 2019 15:03:44 +0100 Subject: [PATCH 3/3] change quote-replacement of pattern """content""content" --- src/corporacreator/preprocessors/de.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/corporacreator/preprocessors/de.py b/src/corporacreator/preprocessors/de.py index 7b58792..a599ab3 100644 --- a/src/corporacreator/preprocessors/de.py +++ b/src/corporacreator/preprocessors/de.py @@ -1,12 +1,12 @@ import re QUOTE_PATTERN = re.compile(r'^\"{3}(.*)\"{2}(.*)\"{1}$') QUOTE_PATTERN_2 = re.compile(r'^\"{1}(.*)\"{2}(.*)\"{2}(.*)\"{1}$') -QUOTE_PATTERN_3 = re.compile(r'^\"{1}(.*)\"{1}') +QUOTE_PATTERN_3 = re.compile(r'^\"{1}(.*)\"{1}$') def _change_multi_quotes(sentence): """Changes all quotes from patterns like - [\"""content""content"] to [content content] or - ["content""content""content"] to [content "content" content] or + [\"""content""content"] to ["content"content] or + ["content""content""content"] to [content"content"content] or ["content" to content] Args: @@ -21,7 +21,7 @@ def _change_multi_quotes(sentence): matches3 = QUOTE_PATTERN_3.match(sentence) # patter: \"content\" if matches != None and matches.lastindex == 2: - return "{}{}".format(matches.group(1), matches.group(2)) + return "\"{}\"{}".format(matches.group(1), matches.group(2)) elif matches2 != None and matches2.lastindex == 3: return "{}\"{}\"{}".format(matches2.group(1), matches2.group(2), matches2.group(3)) elif matches3 != None and matches3.lastindex == 1: