From a7294596a09f0d0db6fec2c31eeaa996ead72356 Mon Sep 17 00:00:00 2001 From: kdavis-mozilla Date: Thu, 13 Dec 2018 06:56:32 +0100 Subject: [PATCH] Fixed #14 (Some sentences contain URL encoded text) --- src/corporacreator/preprocessors/common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/corporacreator/preprocessors/common.py b/src/corporacreator/preprocessors/common.py index 110792a..9e77d07 100644 --- a/src/corporacreator/preprocessors/common.py +++ b/src/corporacreator/preprocessors/common.py @@ -1,3 +1,5 @@ +from urllib.parse import unquote + def common(sentence): """Cleans up the passed sentence in a language independent manner, removing or reformatting invalid data. @@ -7,5 +9,7 @@ def common(sentence): Returns: (str): Cleaned up sentence. """ + # Decode any URL encoded elements of sentence + sentence = unquote(sentence) # TODO: Clean up data in a language independent manner return sentence