Skip to content
This repository has been archived by the owner on Nov 28, 2022. It is now read-only.

Submission for WEDO Team #9

Open
wants to merge 102 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
102 commits
Select commit Hold shift + click to select a range
2edf5c8
Add pretrained model
BeamNC Oct 12, 2022
89c27b5
add inference code
BeamNC Oct 12, 2022
97eef82
Update script
BeamNC Oct 12, 2022
669fd1d
first_commit
nattanaa Oct 12, 2022
4d5cd37
commit2
nattanaa Oct 12, 2022
8e6bf57
commit3
nattanaa Oct 12, 2022
917eddc
Add setup.sh for install
BeamNC Oct 12, 2022
cfa2d43
Delete submit/Gender_Category/gender_classification/pretrained_model …
beam11221 Oct 12, 2022
2f68b8b
Delete hparams_inference.yaml
beam11221 Oct 12, 2022
c772afc
Update setup.sh
BeamNC Oct 14, 2022
129e3d9
Add readme
BeamNC Oct 14, 2022
44e555f
From local. Fixed conflict
BeamNC Oct 14, 2022
6128900
Update README.md
beam11221 Oct 14, 2022
75a3089
Merge pull request #1 from KongpolC/gender_clf
KongpolC Oct 14, 2022
d12ba78
commit3
nattanaa Oct 14, 2022
5239ad1
commit4
nattanaa Oct 14, 2022
f8706aa
setup update
nattanaa Oct 14, 2022
75ba2a1
update setup.sh to download validated.tsv for cv11
BeamNC Oct 14, 2022
fe2da24
Add cv11 gender inference. Update setup.sh
BeamNC Oct 14, 2022
13b49e7
Update README.md
beam11221 Oct 14, 2022
8fceeb3
Add Thai-ser download script, add data preprocessing notebook; mp3 ->…
BeamNC Oct 14, 2022
281de7e
fixed local conflict
BeamNC Oct 14, 2022
82d6689
Merge pull request #2 from KongpolC/gender_clf_2
beam11221 Oct 14, 2022
67750a8
2 update
nattanaa Oct 14, 2022
73725db
Merge branch 'main' of https://github.com/KongpolC/our-voices-model-c…
nattanaa Oct 14, 2022
799d40c
Update readme
nattanaa Oct 14, 2022
31e0126
Update all
nattanaa Oct 16, 2022
a934cd3
Update readme
nattanaa Oct 16, 2022
7e9b9ee
Add training script
BeamNC Oct 17, 2022
7794fe5
Merge pull request #3 from KongpolC/gender_clf_3
beam11221 Oct 17, 2022
e1e985c
first commit
KongpolC Oct 17, 2022
b0552d6
Merge branch 'main' of github.com:KongpolC/our-voices-model-competition
KongpolC Oct 17, 2022
0175562
Change download directory to ./models
BeamNC Oct 17, 2022
5b6f4f4
Fix setup.sh
BeamNC Oct 17, 2022
eb1132e
Merge pull request #4 from KongpolC/edit_pretrain_path
beam11221 Oct 17, 2022
0eed352
Update dataset internal path
BeamNC Oct 17, 2022
ee12e67
change paths to data
KongpolC Oct 17, 2022
4a9fd68
Merge branch 'main' of github.com:KongpolC/our-voices-model-competition
KongpolC Oct 17, 2022
b475c29
Add requirements
BeamNC Oct 17, 2022
d7cbf71
Update main.ipynb
nattanaa Oct 17, 2022
346b645
Update main.ipynb
nattanaa Oct 17, 2022
83f9276
Add gitignore, add audio preprocessing script
BeamNC Oct 17, 2022
18171af
Update training config
BeamNC Oct 17, 2022
af94756
Add load_dataset.sh
BeamNC Oct 17, 2022
2f12875
Update load_dataset.sh; Add copy tsv file from cv11 to commonvoice11/…
BeamNC Oct 17, 2022
5058376
change paths and add more explanation
KongpolC Oct 17, 2022
03f550b
Update readme
BeamNC Oct 17, 2022
0865f00
Update comment in model_inference notebook. UPdate readme
BeamNC Oct 17, 2022
5606b7a
update_all
nattanaa Oct 17, 2022
7aa9265
Merge branch 'main' of https://github.com/KongpolC/our-voices-model-c…
nattanaa Oct 17, 2022
5f97e9f
modify analysis 5.3
KongpolC Oct 18, 2022
a165796
Merge branch 'main' of github.com:KongpolC/our-voices-model-competition
KongpolC Oct 18, 2022
b5b2002
modify analysis 5.3
KongpolC Oct 18, 2022
fdca75c
Merge remote-tracking branch 'origin/migrate'
KongpolC Oct 18, 2022
e9c0959
rearange data
KongpolC Oct 18, 2022
15a2e4f
sample clips
KongpolC Oct 18, 2022
6744a25
ignores .wav files except one
KongpolC Oct 18, 2022
6e22c0d
remove README
KongpolC Oct 18, 2022
c1b3a8a
rename
KongpolC Oct 18, 2022
4d8b3f7
Move data file to scripts
BeamNC Oct 19, 2022
ff8f973
Fix path to compat with new directory
BeamNC Oct 19, 2022
d16231c
Merge pull request #5 from KongpolC/migrate_script
beam11221 Oct 19, 2022
ad13bac
Add training script
BeamNC Oct 19, 2022
36ade8e
Update training config
BeamNC Oct 19, 2022
c7cf4f2
update path
nattanaa Oct 19, 2022
4377c4b
add_floder_data_prep
nattanaa Oct 19, 2022
e2296e0
Update readme
BeamNC Oct 19, 2022
1294cfb
Merge branch 'main' of https://github.com/KongpolC/our-voices-model-c…
BeamNC Oct 19, 2022
7f93408
update sh
nattanaa Oct 19, 2022
aa6cac3
Merge branch 'main' of https://github.com/KongpolC/our-voices-model-c…
nattanaa Oct 19, 2022
d0df94a
update setup to train
nattanaa Oct 19, 2022
7583333
Update README.md
nattanaa Oct 19, 2022
b277dbe
Split load_dataset.sh into 2 files for commonvoice11 and Thai-SER
BeamNC Oct 19, 2022
32d49b0
Update load_commonvoice11.sh
beam11221 Oct 19, 2022
d74af66
Update commonvoice11 loading script
BeamNC Oct 19, 2022
9cd17c1
Merge pull request #7 from KongpolC/split_load_dataset
beam11221 Oct 19, 2022
e7ec1a3
Update dataset path
BeamNC Oct 19, 2022
8d61e07
Update setup.sh & requirement
BeamNC Oct 19, 2022
12e1846
Update parameter for inference
BeamNC Oct 19, 2022
b572dc8
Add Commonvoice11 annotation genereator & annotation
BeamNC Oct 19, 2022
44b8110
Add Thai-SER annotation and annotation generate scripts
BeamNC Oct 19, 2022
2815225
Update ds_path
BeamNC Oct 19, 2022
3fe660e
Add manifest generator for gender classification training
BeamNC Oct 19, 2022
fad601a
Update readme.md
BeamNC Oct 19, 2022
01bc75d
Merge pull request #8 from KongpolC/add_create_anno
beam11221 Oct 19, 2022
82b77b3
Update README.md
nattanaa Oct 20, 2022
0f01361
Update README.md
nutchascg Oct 20, 2022
da49f73
Update README.md
nutchascg Oct 20, 2022
8793047
Update README.md
nutchascg Oct 20, 2022
eabc1e6
Update README.md
nutchascg Oct 20, 2022
c581687
Update README.md
nutchascg Oct 20, 2022
a42a529
Update README.md
nutchascg Oct 20, 2022
24c09b9
Update README.md
nutchascg Oct 20, 2022
f06ea43
Add README
BeamNC Oct 20, 2022
0e32565
Update README
BeamNC Oct 20, 2022
0187234
Update README.md
nutchascg Oct 20, 2022
21a19a5
Remove files
BeamNC Oct 20, 2022
eeb86c7
Update README.md
nutchascg Oct 20, 2022
db4345c
Update README.md
nutchascg Oct 20, 2022
23dc3ad
Update main notebook
BeamNC Oct 20, 2022
5a61c44
Merge branch 'doc_string' of https://github.com/KongpolC/our-voices-m…
BeamNC Oct 20, 2022
ffadf04
Merge pull request #9 from KongpolC/doc_string
beam11221 Oct 20, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
first_commit
  • Loading branch information
nattanaa committed Oct 12, 2022

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 669fd1d26fdefc9009c9698cb8b4e826f1c6a678
185 changes: 185 additions & 0 deletions cv11.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Common Voice Dataset"""


import csv
import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition

# BibTeX entry for the Common Voice paper; handed to DatasetInfo as `citation`.
_CITATION = """\
@inproceedings{commonvoice:2020,
author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
title = {Common Voice: A Massively-Multilingual Speech Corpus},
booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
pages = {4211--4215},
year = 2020
}
"""

# Human-readable summary of the corpus; handed to DatasetInfo as `description`.
_DESCRIPTION = """\
Common Voice is Mozilla's initiative to help teach machines how real people speak.
The dataset currently consists of 7,335 validated hours of speech in 60 languages, but we’re always adding more voices and languages.
"""

# Project landing page; handed to DatasetInfo as `homepage`.
_HOMEPAGE = "https://commonvoice.mozilla.org/en/datasets"

# URL of the license text; handed to DatasetInfo as `license`.
_LICENSE = "https://github.com/common-voice/common-voice/blob/main/LICENSE"

# Per-language snapshot metadata, keyed by language id. Each entry is turned
# into one CommonVoiceConfig when the builder's BUILDER_CONFIGS list is built.
# NOTE(review): the values describe a 2021-07-21 snapshot while the data paths
# used elsewhere in this script reference "commonvoice11" - confirm they are
# still accurate for the data actually loaded.
_LANGUAGES = {
    "th": {
        "Language": "Thai",
        "Date": "2021-07-21",
        "Size": "5 GB",
        "Version": "th_255h_2021-07-21",
        "Validated_Hr_Total": 133,
        "Overall_Hr_Total": 255,
        "Number_Of_Voice": 7212,
    },
}

# Build, at import time, the CSV that backs the TRAIN split: the balanced
# training table with the additional same-sentence samples appended.
# (An older variant that exported the commonvoice11 all_split dev/test/train
# TSV files to CSV was disabled in the original script and is omitted here.)
import pandas as pd

df_train = pd.read_csv(
    "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_balanced_train.csv"
)

# Only the `path` and `sentence` columns of the additional table are kept.
df_train_add = pd.read_csv(
    "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/3_additional_balanced_same_sentence_train.csv"
)[["path", "sentence"]]

# ignore_index=True gives the combined table a fresh 0..n-1 index.
df_added = pd.concat([df_train, df_train_add], ignore_index=True)

# index=False keeps the pandas row index out of the written file.
df_added.to_csv(
    "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_added.csv",
    index=False,
)
class CommonVoiceConfig(datasets.BuilderConfig):
    """Builder configuration describing one Common Voice language snapshot."""

    def __init__(self, name, sub_version, **kwargs):
        """Record the snapshot metadata and initialise the base config.

        Args:
            name: `string`, name of this configuration.
            sub_version: `string`, identifier of the Common Voice snapshot.
            **kwargs: the optional keys ``language``, ``date``, ``size``,
                ``val_hrs``, ``total_hrs`` and ``num_of_voice`` are removed
                and stored on the instance (``None`` when absent); whatever
                remains is forwarded to ``datasets.BuilderConfig``.
        """
        self.sub_version = sub_version

        # (instance attribute, kwargs key) pairs. The keys are popped so the
        # base-class constructor only receives arguments it understands.
        metadata_keys = (
            ("language", "language"),
            ("date_of_snapshot", "date"),
            ("size", "size"),
            ("validated_hr_total", "val_hrs"),
            ("total_hr_total", "total_hrs"),
            ("num_of_voice", "num_of_voice"),
        )
        for attribute, kwarg in metadata_keys:
            setattr(self, attribute, kwargs.pop(kwarg, None))

        # Adjacent f-strings concatenate into a single description sentence run.
        description = (
            f"Common Voice speech to text dataset in {self.language} version {self.sub_version} of {self.date_of_snapshot}. "
            f"The dataset comprises {self.validated_hr_total} of validated transcribed speech data from {self.num_of_voice} speakers. "
            f"The dataset has a size of {self.size}"
        )

        super().__init__(
            name=name,
            version=datasets.Version("7.0.0", ""),
            description=description,
            **kwargs,
        )


class CommonVoice(datasets.GeneratorBasedBuilder):
    """Dataset builder that serves locally prepared Common Voice CSV splits.

    Every example has two string fields: ``path`` (location of the clip inside
    the local ``clips_wav`` directory) and ``sentence`` (its transcription).
    """

    # One configuration per entry of the module-level _LANGUAGES table.
    BUILDER_CONFIGS = [
        CommonVoiceConfig(
            name=lang_id,
            language=meta["Language"],
            sub_version=meta["Version"],
            date=meta["Date"],
            size=meta["Size"],
            val_hrs=meta["Validated_Hr_Total"],
            total_hrs=meta["Overall_Hr_Total"],
            num_of_voice=meta["Number_Of_Voice"],
        )
        for lang_id, meta in _LANGUAGES.items()
    ]

    def _info(self):
        """Return the ``DatasetInfo`` describing the features and ASR task."""
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "sentence": datasets.Value("string"),
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
            task_templates=[
                AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="sentence")
            ],
        )

    def _split_generators(self, dl_manager):
        """Return the train, test and validation ``SplitGenerator`` objects.

        ``dl_manager`` is not used: nothing is downloaded, every split reads a
        CSV file from a fixed absolute path on the local machine.
        """
        abs_path_to_data = "/home/shared/commonvoice11/data"
        abs_path_to_clips = os.path.join(abs_path_to_data, "clips_wav")

        # (split name, CSV holding that split), in the order they are returned.
        split_files = (
            (datasets.Split.TRAIN, "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_added.csv"),
            (datasets.Split.TEST, "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_balanced_test.csv"),
            (datasets.Split.VALIDATION, "/home/nattanaa/ASR_train/mozilla/normal/balanced_3/df_balanced_dev.csv"),
        )

        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": csv_path,
                    "path_to_clips": abs_path_to_clips,
                },
            )
            for split_name, csv_path in split_files
        ]

    def _generate_examples(self, filepath, path_to_clips):
        """Yield ``(id, example)`` pairs read from one split's CSV file.

        Args:
            filepath: path of a comma-separated file whose header row must be
                exactly the feature names declared in ``_info``.
            path_to_clips: directory that holds the ``.wav`` clips.

        Yields:
            ``(row_number, {"path": ..., "sentence": ...})`` where ``path`` is
            the clip's base name, with ``.mp3`` replaced by ``.wav``, joined
            onto ``path_to_clips``.

        Raises:
            AssertionError: if the header does not match the feature names
                (this includes an empty file).
        """
        data_fields = list(self._info().features.keys())
        path_idx = data_fields.index("path")

        # newline="" is what the csv module requires to handle quoted fields
        # containing line breaks correctly.
        with open(filepath, encoding="utf-8", newline="") as f:
            # A real CSV parser is needed here: the split files are written by
            # pandas, which quotes any sentence containing a comma or a quote.
            # Splitting on "," would cut such a sentence into extra fields and
            # silently drop everything after the first comma.
            reader = csv.reader(f)

            # An empty file yields an empty header and fails the check below
            # with a readable message instead of an IndexError.
            column_names = [name.strip() for name in next(reader, [])]
            assert (
                column_names == data_fields
            ), f"The file should have {data_fields} as column names, but has {column_names}"

            for id_, row in enumerate(reader):
                if not row:
                    # Blank line: nothing to emit. Ids of later rows are unchanged.
                    continue

                # Surrounding whitespace carries no meaning for paths or transcripts.
                field_values = [value.strip() for value in row]

                # Point at the converted .wav clip in the local clips directory.
                clip_name = field_values[path_idx].split("/")[-1].replace(".mp3", ".wav")
                field_values[path_idx] = os.path.join(path_to_clips, clip_name)

                # if data is incomplete, fill with empty values
                missing = len(data_fields) - len(field_values)
                if missing > 0:
                    field_values += missing * ["''"]

                yield id_, dict(zip(data_fields, field_values))
Loading