From ba2b52ecacb31fe8fd7baa4f80330a91e71ce0a1 Mon Sep 17 00:00:00 2001
From: josh
Date: Wed, 30 Jan 2019 16:12:57 +0100
Subject: [PATCH] collapse whitespace for all langs

---
 src/corporacreator/preprocessors/common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/corporacreator/preprocessors/common.py b/src/corporacreator/preprocessors/common.py
index 1f29a6d..1ffb290 100644
--- a/src/corporacreator/preprocessors/common.py
+++ b/src/corporacreator/preprocessors/common.py
@@ -75,5 +75,7 @@ def common(sentence):
     sentence = _strip_tags(sentence)
     # Remove non-printable characters
     sentence = _strip_string(sentence)
+    # collapse all whitespace and replace with single space
+    sentence = (' ').join(sentence.split())
     # TODO: Clean up data in a language independent manner
     return sentence