From 0af9cfbaabaa85a66afc1e740854780ebb11afe9 Mon Sep 17 00:00:00 2001 From: Artie <54867745+artie-inc@users.noreply.github.com> Date: Fri, 4 Oct 2019 15:52:40 -0700 Subject: [PATCH 1/4] Change ticks --> apostrophes There are a lot of tick marks that occur in English Common Voice that should be apostrophes. This is part of a larger problem which involves quotation marks / double quotes. --- src/corporacreator/preprocessors/en.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/corporacreator/preprocessors/en.py b/src/corporacreator/preprocessors/en.py index 0d95ef6..98456db 100644 --- a/src/corporacreator/preprocessors/en.py +++ b/src/corporacreator/preprocessors/en.py @@ -8,5 +8,8 @@ def en(client_id, sentence): Returns: (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid. """ - # TODO: Clean up en data + ## collapse all apostrophe-like marks + ## e.g. common_voice_en_18441344.mp3 ‘I’m not a serpent!’ --> 'I'm not a serpent!' + sentence = sentence.replace("’","'") # right-ticks --> apostrophes + sentence = sentence.replace("‘","'") # left-ticks --> apostrophes return sentence From 755c08be0bee07fc5d9db7a2c077a9fee0fce441 Mon Sep 17 00:00:00 2001 From: Artie <54867745+artie-inc@users.noreply.github.com> Date: Fri, 4 Oct 2019 16:11:41 -0700 Subject: [PATCH 2/4] Replace em-dash with en-dash This is also an issue WRT hyphens... should hyphens and dashes be collapsed? --- src/corporacreator/preprocessors/en.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/corporacreator/preprocessors/en.py b/src/corporacreator/preprocessors/en.py index 98456db..6d9c945 100644 --- a/src/corporacreator/preprocessors/en.py +++ b/src/corporacreator/preprocessors/en.py @@ -12,4 +12,7 @@ def en(client_id, sentence): ## e.g. common_voice_en_18441344.mp3 ‘I’m not a serpent!’ --> 'I'm not a serpent!' 
sentence = sentence.replace("’","'") # right-ticks --> apostrophes sentence = sentence.replace("‘","'") # left-ticks --> apostrophes + ## Change em-dash to dash + ## e.g. common_voice_en_18607891.mp3 Nelly, come here — is it morning? --> Nelly, come here – is it morning? + sentence = sentence.replace("—","–") return sentence From de131e4da07b0fd1af417671e31188ca2c9cdc24 Mon Sep 17 00:00:00 2001 From: Artie <54867745+artie-inc@users.noreply.github.com> Date: Fri, 4 Oct 2019 16:41:59 -0700 Subject: [PATCH 3/4] C++ --> C plus plus There are 160 utterances with C++ spoken in Common Voice English, and this user is in the `test.csv` file after `test.tsv` is passed through `import_cv2.py`, and I verified that it is spoken this way --- src/corporacreator/preprocessors/en.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/corporacreator/preprocessors/en.py b/src/corporacreator/preprocessors/en.py index 6d9c945..2da5341 100644 --- a/src/corporacreator/preprocessors/en.py +++ b/src/corporacreator/preprocessors/en.py @@ -8,6 +8,8 @@ def en(client_id, sentence): Returns: (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid. """ + if client_id == "3c8f07827758e9ce8070ed287139d6d3e6457c1c16dcb972edac76c4d4333dc9e9c428711237e2b34fa29f4d249287fd238ac884534e33075958233643bbd0a1": + sentence = sentence.replace("C++", "C plus plus") ## collapse all apostrophe-like marks ## e.g. common_voice_en_18441344.mp3 ‘I’m not a serpent!’ --> 'I'm not a serpent!' 
sentence = sentence.replace("’","'") # right-ticks --> apostrophes From ce93881ef3d63c591e6d7b106689e5ea7d72d3b3 Mon Sep 17 00:00:00 2001 From: Artie <54867745+artie-inc@users.noreply.github.com> Date: Fri, 4 Oct 2019 18:14:55 -0700 Subject: [PATCH 4/4] Replace "=" with "equal to" for one utterance I listened to this myself --- src/corporacreator/preprocessors/en.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/corporacreator/preprocessors/en.py b/src/corporacreator/preprocessors/en.py index 2da5341..6fbed01 100644 --- a/src/corporacreator/preprocessors/en.py +++ b/src/corporacreator/preprocessors/en.py @@ -8,6 +8,8 @@ def en(client_id, sentence): Returns: (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid. """ + if client_id == "f2909ec9143e4ff4792b0ccb2e109da938c1c5bfc91641165b0fd32f6caf67ad234318233affcaa4117a36212440eae89dbf06c75e2cb9d4a19ddec3663044ac": + sentence = sentence.replace("=", "equal to") if client_id == "3c8f07827758e9ce8070ed287139d6d3e6457c1c16dcb972edac76c4d4333dc9e9c428711237e2b34fa29f4d249287fd238ac884534e33075958233643bbd0a1": sentence = sentence.replace("C++", "C plus plus") ## collapse all apostrophe-like marks