first_commit

common-voice · beam11221 · Oct 12, 2022 · Oct 12, 2022 · Oct 12, 2022 · Oct 12, 2022
commit 669fd1d26fdefc9009c9698cb8b4e826f1c6a678
diff --git a/cv11.py b/cv11.py
@@ -0,0 +1,185 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Common Voice Dataset"""
+
+
+import os
+
+import datasets
+from datasets.tasks import AutomaticSpeechRecognition
+
+# BibTeX entry passed to `datasets.DatasetInfo(citation=...)`.
+_CITATION = """\
+@inproceedings{commonvoice:2020,
+  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
+  title = {Common Voice: A Massively-Multilingual Speech Corpus},
+  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
+  pages = {4211--4215},
+  year = 2020
+}
+"""
+
+# Human-readable summary passed to `datasets.DatasetInfo(description=...)`.
+# The apostrophe in "we’re" is U+2019; it was previously stored as the
+# mis-decoded sequence "â€™" and is restored here.
+_DESCRIPTION = """\
+Common Voice is Mozilla's initiative to help teach machines how real people speak.
+The dataset currently consists of 7,335 validated hours of speech in 60 languages, but we’re always adding more voices and languages.
+"""
+
+_HOMEPAGE = "https://commonvoice.mozilla.org/en/datasets"
+
+_LICENSE = "https://github.com/common-voice/common-voice/blob/main/LICENSE"
+
+# Per-language snapshot metadata, keyed by language code. Each entry is turned
+# into one `CommonVoiceConfig` by `CommonVoice.BUILDER_CONFIGS`.
+# NOTE(review): the figures below describe a 2021-07-21 snapshot, while the
+# split generators read clips from a "commonvoice11" directory -- confirm that
+# this metadata matches the data that is actually loaded.
+_LANGUAGES = {
+    "th": {
+        "Language": "Thai",
+        "Date": "2021-07-21",
+        "Size": "5 GB",
+        "Version": "th_255h_2021-07-21",
+        "Validated_Hr_Total": 133,
+        "Overall_Hr_Total": 255,
+        "Number_Of_Voice": 7212,
+    },
+}
+
+# Preprocess table: merge the balanced training split with the additional
+# balanced sentences and write the result to `df_added.csv`, which is the file
+# the TRAIN split generator reads.
+# NOTE: this runs at import time, i.e. every time the loading script is
+# imported the two input CSVs are read and `df_added.csv` is rewritten.
+import pandas as pd
+
+df_train = pd.read_csv("/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_balanced_train.csv")
+df_train_add = pd.read_csv("/home/nattanaa/ASR_train/mozilla/normal/balanced_3/3_additional_balanced_same_sentence_train.csv")
+
+# Keep exactly the two columns the builder expects in BOTH frames. Narrowing
+# only one of them would let any extra column of the other leak into the
+# merged CSV and break the header check done when examples are generated.
+df_train = df_train[['path', 'sentence']]
+df_train_add = df_train_add[['path', 'sentence']]
+
+df_added = pd.concat([df_train, df_train_add], ignore_index=True)
+
+# Persist the merged table without the pandas index column.
+df_added.to_csv('/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_added.csv', index=False)
+
+
+class CommonVoiceConfig(datasets.BuilderConfig):
+    """BuilderConfig for CommonVoice."""
+
+    def __init__(self, name, sub_version, **kwargs):
+        """Store snapshot metadata and build the config description.
+
+        Args:
+          name: `string`, config name forwarded to the parent config.
+          sub_version: `string`, identifier of the data snapshot; stored as
+            `self.sub_version`.
+          **kwargs: the optional keys `language`, `date`, `size`, `val_hrs`,
+            `total_hrs` and `num_of_voice` are removed and stored as
+            attributes (each defaults to `None`); everything that remains is
+            forwarded to super.
+        """
+        self.sub_version = sub_version
+
+        # Pull the metadata keys out of kwargs so that only arguments the
+        # parent class receives are left in it.
+        self.num_of_voice = kwargs.pop("num_of_voice", None)
+        self.total_hr_total = kwargs.pop("total_hrs", None)
+        self.validated_hr_total = kwargs.pop("val_hrs", None)
+        self.size = kwargs.pop("size", None)
+        self.date_of_snapshot = kwargs.pop("date", None)
+        self.language = kwargs.pop("language", None)
+
+        description = (
+            f"Common Voice speech to text dataset in {self.language} version {self.sub_version} of {self.date_of_snapshot}. "
+            f"The dataset comprises {self.validated_hr_total} of validated transcribed speech data from {self.num_of_voice} speakers. "
+            f"The dataset has a size of {self.size}"
+        )
+
+        # The builder version is fixed at 7.0.0 regardless of `sub_version`.
+        builder_version = datasets.Version("7.0.0", "")
+        super().__init__(name=name, version=builder_version, description=description, **kwargs)
+
+
+class CommonVoice(datasets.GeneratorBasedBuilder):
+    """Dataset builder that serves locally prepared Common Voice CSV splits.
+
+    Every example has two string features: `path` (location of the clip's
+    `.wav` file inside the clips directory) and `sentence` (its transcription).
+    """
+
+    # One config per entry of the module-level `_LANGUAGES` table.
+    BUILDER_CONFIGS = [
+        CommonVoiceConfig(
+            name=lang_id,
+            language=meta["Language"],
+            sub_version=meta["Version"],
+            date=meta["Date"],
+            size=meta["Size"],
+            val_hrs=meta["Validated_Hr_Total"],
+            total_hrs=meta["Overall_Hr_Total"],
+            num_of_voice=meta["Number_Of_Voice"],
+        )
+        for lang_id, meta in _LANGUAGES.items()
+    ]
+
+    def _info(self):
+        """Returns the DatasetInfo: two string features plus an ASR task template."""
+        features = datasets.Features(
+            {
+                "path": datasets.Value("string"),
+                "sentence": datasets.Value("string"),
+            }
+        )
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            supervised_keys=None,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+            task_templates=[
+                AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="sentence")
+            ],
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators.
+
+        Nothing is downloaded: `dl_manager` is unused and every split points at
+        a fixed CSV on the local filesystem, with clips taken from a fixed
+        `clips_wav` directory.
+        """
+        abs_path_to_data = "/home/shared/commonvoice11/data"
+        abs_path_to_clips = os.path.join(abs_path_to_data, "clips_wav")
+
+        # (split name, CSV listing the examples of that split)
+        split_files = [
+            (datasets.Split.TRAIN, "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_added.csv"),
+            (datasets.Split.TEST, "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_balanced_test.csv"),
+            (datasets.Split.VALIDATION, "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_balanced_dev.csv"),
+        ]
+
+        return [
+            datasets.SplitGenerator(
+                name=split_name,
+                gen_kwargs={
+                    "filepath": csv_path,
+                    "path_to_clips": abs_path_to_clips,
+                },
+            )
+            for split_name, csv_path in split_files
+        ]
+
+    @staticmethod
+    def _trim_row_ends(row):
+        """Strips whitespace from the outer ends of a CSV row, in place.
+
+        Mirrors a `strip()` applied to the whole text line: only the start of
+        the first field and the end of the last field are touched.
+        """
+        if row:
+            row[0] = row[0].lstrip()
+            row[-1] = row[-1].rstrip()
+        return row
+
+    def _generate_examples(self, filepath, path_to_clips):
+        """Yields examples.
+
+        Args:
+          filepath: path of a UTF-8 CSV file whose header row must equal the
+            feature names declared in `_info()`.
+          path_to_clips: directory that holds the audio clips.
+
+        Yields:
+          `(id_, example)` pairs, where `id_` is the zero-based index of the
+          data record and `example` maps each feature name to a string.
+
+        Raises:
+          AssertionError: if the header row differs from the feature names
+            (this includes an empty file).
+        """
+        # Local import keeps the new dependency inside this block.
+        import csv
+
+        data_fields = list(self._info().features.keys())
+        path_idx = data_fields.index("path")
+
+        # The CSVs are produced with pandas, which quotes any field containing
+        # a comma, a quote or a newline. A real CSV parser is therefore needed;
+        # splitting lines on "," would cut such sentences apart.
+        # newline="" is what the csv module requires to handle line endings
+        # inside quoted fields correctly.
+        with open(filepath, encoding="utf-8", newline="") as f:
+            reader = csv.reader(f)
+
+            column_names = self._trim_row_ends(next(reader, []))
+            assert (
+                column_names == data_fields
+            ), f"The file should have {data_fields} as column names, but has {column_names}"
+
+            for id_, row in enumerate(reader):
+                # Completely blank lines carry no example.
+                if not row:
+                    continue
+                field_values = self._trim_row_ends(row)
+
+                # point at the .wav clip of the same base name inside path_to_clips
+                field_values[path_idx] = os.path.join(path_to_clips, field_values[path_idx].split("/")[-1].replace(".mp3", ".wav"))
+
+                # if data is incomplete, fill with empty values
+                if len(field_values) < len(data_fields):
+                    field_values += (len(data_fields) - len(field_values)) * ["''"]
+
+                yield id_, dict(zip(data_fields, field_values))