From 99159a746651bd848a8309da7f676045913f3d25 Mon Sep 17 00:00:00 2001 From: Dewi Bryn Jones Date: Thu, 11 Mar 2021 12:27:50 +0000 Subject: [PATCH] sgriptiau mireinio / finetuning scripts --- .gitignore | 1 + Dockerfile | 2 +- Dockerfile.train.tmpl | 68 ----- Makefile | 26 +- local/README.md | 34 ++- local/README_EN.md | 40 ++- local/analyze_audio.py | 50 ---- local/analyze_cv.py | 88 +++++++ local/build_lm_scorer.sh | 159 +++++++---- local/conf/default_lm_alpha.macsen.txt | 1 + local/conf/default_lm_alpha.transcribe.txt | 1 + local/conf/default_lm_alpha.txt | 1 + local/conf/default_lm_beta.macsen.txt | 1 + local/conf/default_lm_beta.transcribe.txt | 1 + local/conf/default_lm_beta.txt | 1 + local/evaluate.sh | 117 +++++++++ local/evalutate.sh | 47 ---- local/fine_tune.sh | 166 ++++++++++++ local/import_bangor_resources.py | 291 +++------------------ local/import_custom_macsen_text_corpus.py | 60 +++++ local/import_cv_archive.py | 39 --- local/import_fine_tuning_resources.py | 89 +++++++ local/optimize_lm_scorer.sh | 121 ++++++--- local/run_tl_cv_cy.sh | 88 +++++-- local/run_tl_cv_moz_sets_cy.sh | 157 +++++++++++ local/train_tl.sh | 121 +++++++++ local/utils/corpus.py | 91 +++++++ local/utils/evaluate_lm.py | 37 +++ local/utils/imports.py | 158 +++++++++++ local/utils/kfold.py | 4 + 30 files changed, 1464 insertions(+), 596 deletions(-) delete mode 100644 Dockerfile.train.tmpl delete mode 100755 local/analyze_audio.py create mode 100755 local/analyze_cv.py create mode 100644 local/conf/default_lm_alpha.macsen.txt create mode 100644 local/conf/default_lm_alpha.transcribe.txt create mode 100644 local/conf/default_lm_alpha.txt create mode 100644 local/conf/default_lm_beta.macsen.txt create mode 100644 local/conf/default_lm_beta.transcribe.txt create mode 100644 local/conf/default_lm_beta.txt create mode 100755 local/evaluate.sh delete mode 100755 local/evalutate.sh create mode 100755 local/fine_tune.sh create mode 100755 local/import_custom_macsen_text_corpus.py create 
mode 100755 local/import_fine_tuning_resources.py create mode 100755 local/run_tl_cv_moz_sets_cy.sh create mode 100755 local/train_tl.sh create mode 100755 local/utils/corpus.py create mode 100755 local/utils/evaluate_lm.py create mode 100755 local/utils/imports.py diff --git a/.gitignore b/.gitignore index def55e4..701248f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ models/* checkpoints/* tmp/* local/bin/commonvoice_url.py +local/Makefile* keep local/__pycache__ local/utils/__pycache__ diff --git a/Dockerfile b/Dockerfile index 7458152..943d15c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ ARG BRANCH -FROM mozilla/deepspeech:$BRANCH +FROM mozilla/deepspeech-train:$BRANCH RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash \ && apt-get update && apt-get install -y git-lfs lame sox libsox-fmt-mp3 vim zip file locales-all \ diff --git a/Dockerfile.train.tmpl b/Dockerfile.train.tmpl deleted file mode 100644 index 3e534b6..0000000 --- a/Dockerfile.train.tmpl +++ /dev/null @@ -1,68 +0,0 @@ -# Please refer to the TRAINING documentation, "Basic Dockerfile for training" - -FROM tensorflow/tensorflow:1.15.2-gpu-py3 -ENV DEBIAN_FRONTEND=noninteractive - -ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# -ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA# - -RUN apt-get update && apt-get install -y --no-install-recommends \ - apt-utils \ - bash-completion \ - build-essential \ - cmake \ - curl \ - git \ - libboost-all-dev \ - libbz2-dev \ - locales \ - python3-venv \ - unzip \ - wget - -# We need to remove it because it's breaking deepspeech install later with -# weird errors about setuptools -RUN apt-get purge -y python3-xdg - -# Install dependencies for audio augmentation -RUN apt-get install -y --no-install-recommends libopus0 libsndfile1 - -# Try and free some space -RUN rm -rf /var/lib/apt/lists/* - -WORKDIR / -RUN git clone $DEEPSPEECH_REPO DeepSpeech - -WORKDIR /DeepSpeech -RUN git checkout $DEEPSPEECH_SHA - -# Build CTC decoder 
first, to avoid clashes on incompatible versions upgrades -RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings -RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl - -# Prepare deps -RUN pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0 - -# Install DeepSpeech -# - No need for the decoder since we did it earlier -# - There is already correct TensorFlow GPU installed on the base image, -# we don't want to break that -RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . - -# Tool to convert output graph for inference -RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \ - --artifact convert_graphdef_memmapped_format --target . - -# Build KenLM to generate new scorers -WORKDIR /DeepSpeech/native_client -RUN rm -rf kenlm && \ - git clone https://github.com/kpu/kenlm && \ - cd kenlm && \ - git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \ - mkdir -p build && \ - cd build && \ - cmake .. && \ - make -j $(nproc) -WORKDIR /DeepSpeech - -RUN ./bin/run-ldc93s1.sh diff --git a/Makefile b/Makefile index 3c4f671..89d0c36 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,11 @@ default: build -DEEPSPEECH_RELEASE := 0.9.1 -TECHIAITH_RELEASE := 21.01 +DEEPSPEECH_RELEASE := 0.9.3 +TECHIAITH_RELEASE := 21.03 + run: - docker run --gpus all --name techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER} -it \ + docker run --gpus all --name techiaith-deepspeech-train-v${DEEPSPEECH_RELEASE}-${USER} -it \ -v ${PWD}/data/:/data \ -v ${PWD}/checkpoints/:/checkpoints \ -v ${PWD}/models/:/models \ @@ -13,15 +14,11 @@ run: -v ${PWD}/local/:/DeepSpeech/bin/bangor_welsh \ --env DEEPSPEECH_RELEASE=${DEEPSPEECH_RELEASE} \ --env TECHIAITH_RELEASE=${TECHIAITH_RELEASE} \ - techiaith/deepspeech:v${DEEPSPEECH_RELEASE} bash + techiaith/deepspeech-train:v${DEEPSPEECH_RELEASE} bash build: - if [ ! 
-d "DeepSpeech" ]; then \ - git clone https://github.com/mozilla/DeepSpeech.git; \ - fi - cd DeepSpeech && make Dockerfile.train DEEPSPEECH_SHA=tags/v${DEEPSPEECH_RELEASE} && docker build --rm -t mozilla/deepspeech:v${DEEPSPEECH_RELEASE} -f Dockerfile.train . if [ ! -d "checkpoints/mozilla" ]; then \ mkdir -p checkpoints/mozilla; \ cd checkpoints/mozilla && \ @@ -48,20 +45,17 @@ build: wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_macsen_$(TECHIAITH_RELEASE).scorer && \ wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_transcribe_$(TECHIAITH_RELEASE).scorer;\ fi - docker build --build-arg BRANCH=v${DEEPSPEECH_RELEASE} --rm -t techiaith/deepspeech:v${DEEPSPEECH_RELEASE} . + docker build --build-arg BRANCH=v${DEEPSPEECH_RELEASE} --rm -t techiaith/deepspeech-train:v${DEEPSPEECH_RELEASE} . clean: - -docker rmi techiaith/deepspeech:v${DEEPSPEECH_RELEASE} - -docker rmi mozilla/deepspeech:v${DEEPSPEECH_RELEASE} - -docker rmi nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 - -docker rmi tensorflow/tensorflow:1.15.2-gpu-py3 - sudo rm -rf DeepSpeech + -docker rmi techiaith/deepspeech-train:v${DEEPSPEECH_RELEASE} + -docker rmi mozilla/deepspeech-train:v${DEEPSPEECH_RELEASE} sudo rm -rf homedir sudo rm -rf checkpoints sudo rm -rf models stop: - -docker stop techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER} - -docker rm techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER} + -docker stop techiaith-deepspeech-train-v${DEEPSPEECH_RELEASE}-${USER} + -docker rm techiaith-deepspeech-train-v${DEEPSPEECH_RELEASE}-${USER} diff --git a/local/README.md b/local/README.md index 1794003..ce727d9 100644 --- a/local/README.md +++ b/local/README.md @@ -6,6 +6,8 @@ Mae dogfennaeth gan Mozilla ar DeepSpeech ar gael fan hyn: https://deepspeech.re Mae'r sgriptiau canlynol yn cysylltu ag yn hwyluso'r holl gamau a ddilynir er mwyn hyfforddi, gynhyrchu a gwerthuso modelau adnabod 
lleferydd Cymraeg gyda DeepSpeech Mozilla. Defnyddir setiau Cymraeg o wefan CommonVoice Mozilla fel prif ffynhonnell data hyfforddi. Gydag adnoddau bellach gan Uned Technolegau Iaith, Prifysgol Bangor, mae'r modelau'n addas ar gyfer rhaglenni cynorthwyydd digidol (e.e. Macsen) a thrawsgrifiwr gyffredinol. +Mae modelau sydd wedi'i hyfforddi'n barod ar gael o'r dudalen cyhoeddi: https://github.com/techiaith/docker-deepspeech-cy/releases + ## Rhagofynion @@ -16,16 +18,16 @@ Llwythwch i lawr hefyd Corpws OSCAR o https://oscar-public.huma-num.fr/shuff-ori ## Paratoi Data -### `import_audio_archive.py` +### `import_cv_archive.py` ```shell -root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_audio_archive.py --archive /data/commonvoice/cy.tar.gz --target_dir /data/commonvoice/ +root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_cv_archive.py --archive /data/commonvoice/cy.tar.gz --target_dir /data/commonvoice/ ``` -### `analyze_audio.py` +### `analyze_cv.py` ```shell -root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --csv_dir /data/commonvoice/clips/ +root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_cv.py --cv_dir /data/commonvoice/ /data/commonvoice-cy-v5-20200622/clips/dev.csv 0.91 hours (3269.93 seconds) /data/commonvoice-cy-v5-20200622/clips/test.csv 0.98 hours (3514.49 seconds) /data/commonvoice-cy-v5-20200622/clips/train.csv 1.09 hours (3941.04 seconds) @@ -36,14 +38,14 @@ root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --c ## Model Acwstig -Defnyddiwch y sgript ganlynol i hyfforddi model acwstig. Dyle paramedr `-a` nodi ble mae'r ffeiliau CSV o ganlyniad i fewnforio CommonVoice. Yn yr enghraifft hon, maent wedi'u lleoli yn is-gyfeiriadur `/clips` y `target_dir` gwreiddiol. +Defnyddiwch y sgript ganlynol i hyfforddi model acwstig gyda data gan gwefan CommonVoice. 
### `run_tl_cv_cy.sh` Mae'r sgript hon yn defnyddio nodwedd dysgu trosglwyddol (*transfer learning*) DeepSpeech er mwyn cael fudd o ddefnyddio modelau acwstig Saesneg Mozilla, sydd wedi'u hyfforddi ar gasgliadau data llawer mwy o sain, fel man cychwyn ar gyfer hyfforddi adnabod lleferydd Cymraeg. ```shell -root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/run_tl_cv_cy.sh -a /data/commonvoice/clips +root@c67722092f2e:/DeepSpeech# ./bin/bangor_welsh/run_tl_cv_cy.sh --cv_dir /data/commonvoice ``` @@ -58,7 +60,7 @@ Mae angen rhagor o adnoddau gan Brifysgol Bangor er mwyn hyfforddi DeepSpeech ar Mae'r sgript isod yn llwytho i lawr rhagor o recordiadau a corpora testun sydd yn galluogi adnabod lleferydd Cymraeg o fewn cynorthwyydd digidol a trawsgrifiwr. Rhaid i chi llwytho i lawr ffeil archif corpws testun OSCAR o flaen llaw er mwyn ei ddefnyddio gyda'r orchymyn isod: ```shell -root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/import_bangor_resources.py -o /data/oscar/cy.txt.gzip -c /data/commonvoice/validated.tsv +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/import_bangor_resources.py --target_dir /data/bangor --oscar_archive /data/oscar/cy.txt.gzip --cv_dir /data/commonvoice/ ``` Mae'r sgript mewnforio hefyd yn hidlo unrhyw testunau sy'n anaddas i'r proses hyfforddi modelau iaith adnabod lleferydd ac yn creu copi 'glan' (`.clean`) o'r corpws. 
@@ -70,12 +72,12 @@ Dyma'r brif sgript ar gyfer hyfforddi model iaith ac yna ei werthuso gyda model ##### Ar gyfer defnyddio adnabod lleferydd o fewn Macsen: ```shell -root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh -s /data/bangor/lm-data/macsen/corpus.clean.txt -t /data/bangor/testsets/data/macsen/deepspeech.csv -o /data/bangor/lm/macsen +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh --text_file /data/bangor/lm-data/macsen/corpus.clean.txt --domain macsen ``` ##### Ar gyfer defnyddio adnabod lleferydd i drawsgrifio: ```shell -root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh -s /data/bangor/lm-data/oscar/corpus.clean.txt -t /data/bangor/testsets/data/trawsgrifio/deepspeech.csv -o /data/bangor/lm/trawsgrifio +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh --text_file /data/bangor/lm-data/oscar/corpus.clean.txt --domain macsen --output_dir /export/macsen --scorer kenlm.scorer ``` @@ -84,8 +86,18 @@ root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh -s /data/ba Bydd y sgript yma yn arbrofi gyda gwahanol baramedrau modelau iaith nes iddo ddod o hyd i'r gwerthoedd gorau posibl sy'n rhoi'r cyfraddau gwallau adnabod lleferydd isaf posibl. -Gall y broses gymryd amser hir - oriau neu ddiwrnod neu ddau - gan y bydd yn arbrofi miloedd o weithiau. Yn y diwedd, bydd y sgript yn adrodd ar ddau werth gorau posibl ac yn gofyn ichi eu mewnbynnu i'w cynnwys ym mhecyn terfynol y model iaith. (gweler y ffeil `kenlm.scorer` yn y cyfeiriadur a bennir gan y ddadl sgript` -l`) +Gall y broses gymryd amser hir - oriau neu ddiwrnod neu ddau - gan y bydd yn arbrofi miloedd o weithiau. Yn y diwedd, bydd y sgript yn adrodd ar ddau werth gorau posibl (gelwir yn 'alpha' a 'beta') ac yn gofyn ichi eu mewnbynnu er mwyn eu cynnwys ym mhecyn terfynol y model iaith. 
+ +```shell +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/optimize_lm_scorer.sh --csv_test_file /data/bangor/testsets/data/macsen/deepspeech.csv --domain macsen [--checkpoint_dir /checkpoints/cy] +``` + + +## Profi'r modelau + +Er mwyn gwybod pa mor dda neu ddim mae'r modelau, mae modd profi erbyn set profi sydd wedi ei fanylu o fewn ffeil CSV. Er enghraifft, er mwyn profi'r modelau trawsgrifio: ```shell -root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/optimize_lm_scorer.sh -l /data/bangor/lm/mascen -t /data/bangor/testsets/data/macsen/deepspeech.csv +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/evaluate.sh --csv_test_file /data/bangor/testsets/data/trawsgrifio/arddweud_200617/deepspeech.csv --scorer /export/transcribe/kenlm.transcribe.scorer ``` + diff --git a/local/README_EN.md b/local/README_EN.md index 5928b44..d586411 100644 --- a/local/README_EN.md +++ b/local/README_EN.md @@ -6,24 +6,27 @@ Documentation by Mozilla on DeepSpeech can be found here: https://deepspeech.rea The following scripts join up all the steps that are needed to train, generate and evaluate models for Welsh language speech recognition with Mozilla's DeepSpeech. The Welsh datasets from Mozilla's CommonVoice website are the primary resource for training. With some further resources from Bangor University's Language Technologies Unit, the models are viable for voice assistant (e.g. Macsen) and a transcriber applications. +Pre-trained models however are available from the release page: https://github.com/techiaith/docker-deepspeech-cy/releases ## Prerequisites Download the Welsh speech data from the Mozilla CommonVoice website: https://voice.mozilla.org/cy/datasets which is provided as a single large compressed file (`.tar.gz`). Save the file into the `data` ffolder. +Download also the OSCAR text corpus from https://oscar-public.huma-num.fr/shuff-orig/cy which contains Welsh language texts collected from the world wide web. 
You will need to register for the website to permit downloading. Save the file in the `data/oscar` folder. + ## Prepare Data ### `import_audio_archive.py` ```shell -root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_audio_archive.py --archive /data/cy-v4.tar.gz --target_dir /data/commonvoice-cy-v5-20200622/ +root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_audio_archive.py --archive /data/commonvoice/cy.tar.gz --target_dir /data/commonvoice/ ``` ### `analyze_audio.py` ```shell -root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --csv_dir /data/commonvoice-cy-v5-20200622/clips/ +root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --csv_dir /data/commonvoice/clips/ /data/commonvoice-cy-v5-20200622/clips/dev.csv 0.91 hours (3269.93 seconds) /data/commonvoice-cy-v5-20200622/clips/test.csv 0.98 hours (3514.49 seconds) /data/commonvoice-cy-v5-20200622/clips/train.csv 1.09 hours (3941.04 seconds) @@ -35,15 +38,14 @@ root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --c ## Acoustic Model -Use the following script to train an acoustic model. The `-a` argument needs to point to where to the CSV files from your CommonVoice import. In this example, they are located in the `/clips` subdirectory of the original `target_dir`. - +Use the following script to train an acoustic model with data from the CommonVoice website. ### `run_tl_cv_cy.sh` This script uses DeepSpeech's transfer learning feature in order to benefit from Mozilla's English acoustic models, trained on much larger speech data collections, as a starting point for training Welsh speech recognition. 
```shell -root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/run_tl_cv_cy.sh -a /data/commonvoice-cy-v5-20200622/clips +root@c67722092f2e:/DeepSpeech# ./bin/bangor_welsh/run_tl_cv_cy.sh --cv_dir /data/commonvoice ``` @@ -57,30 +59,46 @@ An acoustic model on its own, despite having used transfer learning techniques, You will need further resources from Bangor University in order to train DeepSpeech with language models for various Welsh language applications. -The following script will download further recordings and/or text corpora that facilitate Welsh speech recognition for a simple voice assistant ('macsen') or a transcriber ('transcribe') (as requested in the `-d` argument). +The following script will download further recordings and/or text corpora that facilitate Welsh speech recognition for a simple voice assistant or a transcriber. You should have downloaded the OSCAR corpus beforehand in order to use the following command: ```shell -root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/import_bangor_resources.py -t /data/macsen -d macsen +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/import_bangor_resources.py --target_dir /data/bangor --oscar_archive /data/oscar/cy.txt.gzip --cv_dir /data/commonvoice/ ``` +The script filters unsuitable texts for training process and creates a 'clean' (`.clean`) version of the corpus. + ### `build_lm_scorer.sh` This is the main script for training a language model and evaluation with the acoustic model from the previous steps in training DeepSpeech. 
- +##### Training voice assistant Macsen's language model: ```shell -root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/build_lm_scorer.sh -s /data/texts/macsen/corpus.clean.txt -o /data/texts/macsen/ -t /data/macsen/deepspeech.csv +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh --text_file /data/bangor/lm-data/macsen/corpus.clean.txt --domain macsen ``` +##### Training transcriber language model: +```shell +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh --text_file /data/bangor/lm-data/oscar/corpus.clean.txt --domain macsen --output_dir /export/macsen --scorer kenlm.scorer +``` + + ### `optimize_lm_scorer.sh` This script will experiment with various language model parameters until it finds optimal values that give the lowest possible recognition error rates. -The process can take a long time - hours or possibly day or two - since it will experiment many thousands of times. In the end, the script will report on two optimal values and ask you to enter them for final inclusion in the finally packaged language model. (`kenlm.scorer` in the directory specified by the `-o` script argument) +The process can take a long time - hours or possibly day or two - since it will experiment many thousands of times. In the end, the script will report on two optimal values and ask you to enter them for final inclusion in the finally packaged language model. ```shell -root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/optimize_lm_scorer.sh -l /data/texts/macsen -t /data/macsen/deepspeech.csv +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/optimize_lm_scorer.sh --csv_test_file /data/bangor/testsets/data/macsen/deepspeech.csv --domain macsen [--checkpoint_dir /checkpoints/cy] ``` + +## Model Evaluation + +You can test how well your model will perform if you have a CSV that provides a test set. 
For example, to test the transcription models: + +```shell +root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/evaluate.sh --csv_test_file /data/bangor/testsets/data/trawsgrifio/arddweud_200617/deepspeech.csv --scorer /export/transcribe/kenlm.transcribe.scorer +``` \ No newline at end of file diff --git a/local/analyze_audio.py b/local/analyze_audio.py deleted file mode 100755 index 4169351..0000000 --- a/local/analyze_audio.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import os -import sys -import pathlib -import librosa -import pandas - -from argparse import ArgumentParser, RawTextHelpFormatter - -DESCRIPTION = """ - -""" - -def main(csv_root_dir, **args): - csv_files = pathlib.Path(csv_root_dir).glob("*.csv") - - # client_id path sentence up_votes down_votes age gender accent locale segment - for csv_file_path in csv_files: - - df = pandas.read_csv(csv_file_path, encoding='utf-8') - # - df_grouped = df.groupby("transcript").size().to_frame('count').reset_index() - df_grouped = df_grouped.sort_values("count", ascending=False) - - df_grouped.to_csv(str(csv_file_path).replace(".csv",".dups.txt")) - - # - total_duration = 0.0 - count = 0 - for index, row in df.iterrows(): - count += 1 - wav_file_path = os.path.join(csv_root_dir, row["wav_filename"]) - total_duration = total_duration + librosa.get_duration(filename=wav_file_path) - - print ("%s\t%s recordings\t\t%.2f hours\t(%.2f seconds)" % (csv_file_path, count, total_duration/60.0/60.0, total_duration)) - print (df_grouped.nlargest(n=5, columns='count')) - print ('\n') - - - -if __name__ == "__main__": - - parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) - - parser.add_argument("--csv_dir", dest="csv_root_dir", required=True, help="path to audio corpus CSV files") - - parser.set_defaults(func=main) - args = parser.parse_args() - args.func(**vars(args)) diff --git a/local/analyze_cv.py b/local/analyze_cv.py new file mode 100755 index 
0000000..ad01e90 --- /dev/null +++ b/local/analyze_cv.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import sys +import pathlib +import librosa +import pandas + +from argparse import ArgumentParser, RawTextHelpFormatter + +DESCRIPTION = """ + +""" + +def panda_group(df, column, destination_file_path): + + df_grp_client = df.groupby(column).size().to_frame('count').reset_index() + df_grp_client = df_grp_client.sort_values("count", ascending=False) + df_grp_client.to_csv(destination_file_path, index=False) + + + +def analyze_tsvs(cv_root_dir): + #client_id path sentence up_votes down_votes age gender accent locale segment + tsv_files = pathlib.Path(cv_root_dir).glob("*.tsv") + for tsv_file_path in tsv_files: + + print ("Analyzing %s " % tsv_file_path) + + if 'reported.tsv' in str(tsv_file_path): + continue + + df = pandas.read_csv(tsv_file_path, encoding='utf-8', sep='\t', header=0, dtype={'gender':str}) + + panda_group(df, 'client_id', str(tsv_file_path).replace(".tsv",".counts.client_id.txt")) + panda_group(df, 'sentence', str(tsv_file_path).replace(".tsv",".counts.sentence.txt")) + + panda_group(df, 'age', str(tsv_file_path).replace(".tsv",".counts.age.txt")) + panda_group(df, 'gender', str(tsv_file_path).replace(".tsv",".counts.gender.txt")) + + # analyze clients by age and gender.... 
+ + +def analyze_csvs(cv_root_dir): + + clips_dir = os.path.join(cv_root_dir, "clips") + csv_files = pathlib.Path(clips_dir).glob("*.csv") + + # client_id path sentence up_votes down_votes age gender accent locale segment + for csv_file_path in csv_files: + + df = pandas.read_csv(csv_file_path, encoding='utf-8') + # + df_grouped = df.groupby("transcript").size().to_frame('count').reset_index() + df_grouped = df_grouped.sort_values("count", ascending=False) + + df_grouped.to_csv(str(csv_file_path).replace(".csv",".dups.txt"), index=False) + + # + total_duration = 0.0 + count = 0 + for index, row in df.iterrows(): + count += 1 + wav_file_path = os.path.join(cv_root_dir, "clips", row["wav_filename"]) + total_duration = total_duration + librosa.get_duration(filename=wav_file_path) + + print ("%s\t%s recordings\t\t%.2f hours\t(%.2f seconds)" % (csv_file_path, count, total_duration/60.0/60.0, total_duration)) + print (df_grouped.nlargest(n=5, columns='count')) + print ('\n') + + +def main(cv_root_dir, **args): + + analyze_tsvs(cv_root_dir) + analyze_csvs(cv_root_dir) + + + + +if __name__ == "__main__": + + parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) + + parser.add_argument("--cv_dir", dest="cv_root_dir", required=True, help="path to commonvoice files") + + parser.set_defaults(func=main) + args = parser.parse_args() + args.func(**vars(args)) diff --git a/local/build_lm_scorer.sh b/local/build_lm_scorer.sh index 0869dcf..ffe27ae 100755 --- a/local/build_lm_scorer.sh +++ b/local/build_lm_scorer.sh @@ -3,48 +3,88 @@ set -e set -u set -o pipefail +help() +{ + echo + echo "Rhedeg sgriptiau hyfforddi modelau iaith KenLM i'w defnyddio gyda DeepSpeech" + echo "Run scripts for training KenLM language models for use with DeepSpeech" + echo + echo "Syntax: $ `basename $0` [OPTIONS]" + echo + echo "Options:" + echo + echo " -t, --text_file Path to text file containing all corpus text " + echo " -d, --domain Name for language model domain 
(e.g. 'macsen' or 'transcribe' " + echo " -o, --output_dir (optional) Default: /export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE}" + echo + exit 0 +} + +lm_domain='' source_text_file='' -output_dir='' -test_files='' +output_dir=/export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE} -VOCAB_SIZE=50000 +SHORT=ht:d:o: +LONG=text_file:,domain:,output_dir: -alphabet_file_path=/DeepSpeech/bin/bangor_welsh/alphabet.txt -checkpoint_cy_dir=/checkpoints/cy - -while getopts ":s:t:o:" opt; do - case $opt in - s) - source_text_file=$OPTARG - ;; - t) - test_files=$OPTARG - ;; - o) - output_dir=$OPTARG - ;; - \?) echo "Invalid option -$OPTARG" >&2 - ;; +# read options +OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@") + + +if [ $? != 0 ] ; then + echo "Failed to parse options...exiting." >&2 ; + exit 1 ; +fi + +eval set -- "$OPTS" + +while true ; do + case "$1" in + -t | --text_file ) + source_text_file="$2" + shift 2 + ;; + -d | --domain ) + lm_domain="$2" + shift 2 + ;; + -o | --output_dir ) + output_dir="$2" + shift 2 + ;; + -h | --help ) + help + shift + ;; + -- ) + shift + break + ;; + *) + help + exit 1 + ;; esac done -shift "$(($OPTIND -1))" if [ -z "${source_text_file}" ]; then - echo "-s source_text_file not set" - exit 2 + echo "--text_file missing. Use `basename $0` -h for more info." + exit 2 fi -if [ -z "$test_files" ]; then - echo "-t test_files not set (csv file containing speech test set)" - exit 2 -fi -if [ -z "$output_dir" ]; then - echo "-o output_dir not set" - exit 2 + +if [ -z "${lm_domain}" ]; then + echo "--domain missing. Use `basename $0` -h for more info." 
+ exit 2 fi mkdir -p ${output_dir} cd ${output_dir} +VOCAB_SIZE=50000 +alphabet_file_path=/DeepSpeech/bin/bangor_welsh/alphabet.txt + + + set +x echo "####################################################################################" echo "#### Generating binary language model ####" @@ -63,42 +103,63 @@ python /DeepSpeech/data/lm/generate_lm.py \ --binary_type 'trie' \ --discount_fallback +# +set +x + +default_alpha=1.7242448485503816 +default_beta=4.9065413926676165 + +default_alpha_file="${output_dir}/optimal_alpha.${lm_domain}.txt" +default_beta_file="${output_dir}/optimal_beta.${lm_domain}.txt" + +bangor_default_alpha_file=/DeepSpeech/bin/bangor_welsh/conf/default_lm_alpha.${lm_domain}.txt +bangor_default_beta_file=/DeepSpeech/bin/bangor_welsh/conf/default_lm_beta.${lm_domain}.txt + +if [ -f ${bangor_default_alpha_file} ] ; then + if [ ! -f ${default_alpha_file} ] ; then + cp ${bangor_default_alpha_file} ${default_alpha_file} + fi +fi + +if [ -f ${bangor_default_beta_file} ] ; then + if [ ! -f ${default_beta_file} ] ; then + cp ${bangor_default_beta_file} ${default_beta_file} + fi +fi + +if [ -f ${default_alpha_file} ] ; then + default_alpha=$(<${default_alpha_file}) +fi + +if [ -f ${default_beta_file} ] ; then + default_beta=$(<${default_beta_file}) +fi set +x echo "####################################################################################" -echo "#### Generating package for un-optimized language model package ####" -echo "#### ####" -echo "#### Default alpha and beta values used. 
Previous optimal values were: ####" +echo "#### Generating language model package ####" echo "#### ####" -echo "#### Voice Assistant Language Model ####" -echo "#### alpha: 1.7242448485503816 ####" -echo "#### beta: 4.9065413926676165 ####" +echo "#### Default alpha and beta values are ####" echo "#### ####" -echo "#### Transcription Language Model ####" -echo "#### alpha: 1.1417685444561605 ####" -echo "#### beta: 0.5798010479098541 ####" +echo "#### alpha : ${default_alpha} ####" +echo "#### beta : ${default_beta} ####" echo "#### ####" echo "####################################################################################" set -x + /DeepSpeech/native_client/generate_scorer_package \ --alphabet "${alphabet_file_path}" \ --lm lm.binary \ --vocab vocab-${VOCAB_SIZE}.txt \ - --package kenlm.scorer \ - --default_alpha 0.75 \ - --default_beta 1.85 + --package kenlm.${lm_domain}.scorer \ + --default_alpha ${default_alpha} \ + --default_beta ${default_beta} +cd - set +x echo "####################################################################################" -echo "#### Evaluate Scorer with current Welsh checkpoint ###" +echo "#### Successfully built lm package : ${output_dir}/kenlm.${lm_domain}.scorer " echo "####################################################################################" set -x -python -u /DeepSpeech/evaluate.py \ - --test_files "${test_files}" --test_batch_size 1 \ - --alphabet_config_path "${alphabet_file_path}" \ - --load_checkpoint_dir "${checkpoint_cy_dir}" \ - --scorer_path kenlm.scorer - -cd - diff --git a/local/conf/default_lm_alpha.macsen.txt b/local/conf/default_lm_alpha.macsen.txt new file mode 100644 index 0000000..897afde --- /dev/null +++ b/local/conf/default_lm_alpha.macsen.txt @@ -0,0 +1 @@ +1.7242448485503816 diff --git a/local/conf/default_lm_alpha.transcribe.txt b/local/conf/default_lm_alpha.transcribe.txt new file mode 100644 index 0000000..6841ab6 --- /dev/null +++ b/local/conf/default_lm_alpha.transcribe.txt @@ -0,0 +1 
@@ +1.1417685444561605 diff --git a/local/conf/default_lm_alpha.txt b/local/conf/default_lm_alpha.txt new file mode 100644 index 0000000..6841ab6 --- /dev/null +++ b/local/conf/default_lm_alpha.txt @@ -0,0 +1 @@ +1.1417685444561605 diff --git a/local/conf/default_lm_beta.macsen.txt b/local/conf/default_lm_beta.macsen.txt new file mode 100644 index 0000000..c9b206f --- /dev/null +++ b/local/conf/default_lm_beta.macsen.txt @@ -0,0 +1 @@ +4.9065413926676165 diff --git a/local/conf/default_lm_beta.transcribe.txt b/local/conf/default_lm_beta.transcribe.txt new file mode 100644 index 0000000..f03c709 --- /dev/null +++ b/local/conf/default_lm_beta.transcribe.txt @@ -0,0 +1 @@ +0.5798010479098541 diff --git a/local/conf/default_lm_beta.txt b/local/conf/default_lm_beta.txt new file mode 100644 index 0000000..f03c709 --- /dev/null +++ b/local/conf/default_lm_beta.txt @@ -0,0 +1 @@ +0.5798010479098541 diff --git a/local/evaluate.sh b/local/evaluate.sh new file mode 100755 index 0000000..9a18dc7 --- /dev/null +++ b/local/evaluate.sh @@ -0,0 +1,117 @@ +#!/bin/bash +set -e +set -u +set -o pipefail + +help() +{ + echo + echo "Rhedeg sgriptiau profi modelau DeepSpeech erbyn set profi benodol" + echo "Run scripts for testing DeepSpeech models against a specific test set" + echo + echo "Usage: $ `basename $0` [OPTIONS]" + echo + echo "Options:" + echo + echo " -c, --csv_test_file Path to test set csv file containing paths to clips and reference transcriptions" + echo " -s, --scorer Path to language model scorer" + echo " -p, --checkpoint_dir Path to checkpoint directory (optional)" + echo " -r, --results_file Path to results file (optional)" + echo + exit 0 +} + +scorer='' +test_file='' +results_file='' +checkpoint_cy_dir='' + +SHORT=hs:c:p:r: +LONG=scorer:,csv_test_file:,checkpoint_dir:,results_file: + +# read options +OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@") + + +if [ $? != 0 ] ; then + echo "Failed to parse options...exiting." 
>&2 ; + exit 1 ; +fi + +eval set -- "$OPTS" + +while true ; do + case "$1" in + -s | --scorer ) + scorer="$2" + shift 2 + ;; + -c | --csv_test_file ) + test_file="$2" + shift 2 + ;; + -p | --checkpoint_dir ) + checkpoint_cy_dir="$2" + shift 2 + ;; + -r | --results_file ) + results_file="$2" + shift 2 + ;; + -h) + help + shift + ;; + -- ) + shift + break + ;; + *) + help + exit 1 + ;; + esac +done + + +if [ -z "${test_file}" ]; then + echo "--csv_test_file missing. Use `basename $0` -h for more info." + exit 2 +fi + +if [ -z "${scorer}" ]; then + echo "--scorer missing. Use `basename $0` -h for more info." + exit 2 +fi + +if [ -z "$checkpoint_cy_dir" ]; then + checkpoint_cy_dir=/checkpoints/cy + echo "-p|--checkpoint_dir not set. Setting to ${checkpoint_cy_dir} " +fi + +if [ -z "$results_file" ]; then + results_file=${test_file}.results.json +fi + +alphabet_file_path=/DeepSpeech/bin/bangor_welsh/alphabet.txt + +set +x +echo "####################################################################################" +echo "#### evaluating with transcriber testset ###" +echo "####################################################################################" +set -x + +python -u /DeepSpeech/evaluate.py \ + --test_files "${test_file}" \ + --test_batch_size 1 \ + --alphabet_config_path "${alphabet_file_path}" \ + --load_checkpoint_dir "${checkpoint_cy_dir}" \ + --scorer_path ${scorer} \ + --test_output_file ${results_file} + + +set +x +echo "####################################################################################" +echo "#### Results in ${results_file} " +echo "####################################################################################" +set -x \ No newline at end of file diff --git a/local/evalutate.sh b/local/evalutate.sh deleted file mode 100755 index cf0dee4..0000000 --- a/local/evalutate.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -e -set -u -set -o pipefail - -testset_dir='' -scorer_path='' - 
-alphabet_file_path=/DeepSpeech/bin/bangor_welsh/alphabet.txt -checkpoint_cy_dir=/checkpoints/cy - -while getopts ":t:s:" opt; do - case $opt in - t) - testset_dir=$OPTARG - ;; - s) - scorer_path=$OPTARG - ;; - \?) echo "Invalid option -$OPTARG" >&2 - ;; - esac -done -shift "$(($OPTIND -1))" - -if [ -z "$testset_dir" ]; then - echo "-t testset_dir not set (csv file containing speech test set)" - exit 2 -fi -if [ -z "$scorer_path" ]; then - echo "-s scorer_path not set" - exit 2 -fi - - -set +x -echo "####################################################################################" -echo "#### evaluating with transcriber testset ###" -echo "####################################################################################" -set -x - -python -u /DeepSpeech/evaluate.py \ - --test_files "${testset_dir}/data/trawsgrifio/OpiwHxPPqRI/deepspeech.csv" \ - --test_batch_size 1 \ - --alphabet_config_path "${alphabet_file_path}" \ - --load_checkpoint_dir "${checkpoint_cy_dir}" \ - --scorer_path "${scorer_path}" diff --git a/local/fine_tune.sh b/local/fine_tune.sh new file mode 100755 index 0000000..a130daa --- /dev/null +++ b/local/fine_tune.sh @@ -0,0 +1,166 @@ +#!/bin/bash +set -e +set -u +set -o pipefail + +help() +{ + echo + echo "Rhedeg sgriptiau creu modelau wedi eu gwella gyda data ychwanegol" + echo "Run scripts for fine tuning models with new additional data" + echo + echo "Usage: $ `basename $0` [OPTIONS]" + echo + echo "Options:" + echo + echo " -c, --csv_train_file Path to csv file containing extra transcriptions with paths to audio clips" + echo " -p, --checkpoint_dir Path to previously trained checkpoint directory (optional - default /checkpoints/cy)" + echo " -t, --text_file Path to text file containing union of all corpora (e.g. corpus.union.clean.txt" + echo " -n, --name Name for fine tuning" + echo " -d, --domain language model domain (e.g. 
'macsen' or 'transcribe')" + echo + exit 0 +} + +csv_file='' +lm_domain='' +fine_tune_name='' +source_text_file='' +pretrained_checkpoint_dir='' + +SHORT=hc:t:n:p:d: +LONG=csv_train_file:,text_file:,name:,checkpoint_dir:,domain: + +# read options +OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@") + +if [ $? != 0 ] ; then + echo "Failed to parse options...exiting." >&2 ; + exit 1 ; +fi + +eval set -- "$OPTS" + +while true ; do + case "$1" in + -c | --csv_train_file ) + csv_file="$2" + shift 2 + ;; + -t | --text_file ) + source_text_file="$2" + shift 2 + ;; + -n | --name ) + fine_tune_name="$2" + shift 2 + ;; + -d | --domain ) + lm_domain="$2" + shift 2 + ;; + -p | --checkpoint_dir ) + pretrained_checkpoint_dir="$2" + shift 2 + ;; + -h | --help ) + help + shift + ;; + -- ) + shift + break + ;; + *) + help + exit 1 + ;; + esac +done + + +if [ -z "${csv_file}" ]; then + echo "--csv_train_file missing. Use `basename $0` -h for more info." + exit 2 +fi + +if [ -z "${source_text_file}" ]; then + echo "--text_file missing. Use `basename $0` -h for more info." + exit 2 +fi + +if [ -z "${lm_domain}" ]; then + echo "--domain missing. Use `basename $0` -h for more info." + exit 2 +fi + +if [ -z "${fine_tune_name}" ]; then + echo "--name missing. Use `basename $0` -h for more info." 
+ exit 2 +fi + + +if [ -z "${pretrained_checkpoint_dir}" ]; then + checkpoint_cy_dir=/checkpoints/cy + pretrained_checkpoint_dir=/checkpoints/techiaith + if [ "$(ls -A ${checkpoint_cy_dir})" ]; then + pretrained_checkpoint_dir=${checkpoint_cy_dir} + fi +fi + +# +export_dir=/export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE}/${fine_tune_name} +checkpoint_finetuned_dir=/checkpoints/${fine_tune_name} + +rm -rf ${export_dir} +rm -rf ${checkpoint_finetuned_dir} + +mkdir -p ${export_dir} +mkdir -p ${checkpoint_finetuned_dir} + + +### +train_files=${csv_file} +alphabet_cy_file=/DeepSpeech/bin/bangor_welsh/alphabet.txt + +### Force UTF-8 output +export PYTHONIOENCODING=utf-8 + +set +x +echo +echo "####################################################################################" +echo "#### Continue acoustic model training from best previous checkpoint ####" +echo "####################################################################################" +set -x +python -u DeepSpeech.py \ + --train_files "${train_files}" \ + --train_batch_size 2 \ + --epochs 10 \ + --alphabet_config_path "${alphabet_cy_file}" \ + --load_checkpoint_dir "${pretrained_checkpoint_dir}" \ + --save_checkpoint_dir "${checkpoint_finetuned_dir}" \ + --export_dir "${export_dir}" + +### +/DeepSpeech/native_client/convert_graphdef_memmapped_format \ + --in_graph=${export_dir}/output_graph.pb \ + --out_graph=${export_dir}/output_graph.pbmm + + +set +x +echo +echo "####################################################################################" +echo "#### Fine tuned acoustic models (.pb/.pbmm files) can be found at ${export_dir} " +echo "####################################################################################" +set -x + +set +x +echo "####################################################################################" +echo "#### Generating finetuned binary language model ####" +echo "####################################################################################" +set -x + 
+/DeepSpeech/bin/bangor_welsh/build_lm_scorer.sh \ + --text_file ${source_text_file} \ + --domain ${lm_domain} \ + --output_dir ${export_dir} diff --git a/local/import_bangor_resources.py b/local/import_bangor_resources.py index f0b2c10..61f8672 100755 --- a/local/import_bangor_resources.py +++ b/local/import_bangor_resources.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- import os -import sys import csv -import hashlib import gzip import pandas import requests @@ -11,26 +9,23 @@ from enum import Enum -import tarfile import gzip import shutil import functools from urllib.parse import urlparse -from praatio import tgio - from pathlib import Path from git import Repo -from pydub import AudioSegment import utils.kfold as kfold import utils.audio as audio -from tqdm import tqdm - from utils.clean_transcript import clean_transcript +from utils.imports import import_textgrid, import_clips_dir, get_directory_structure +from utils.corpus import clean_text_corpus, import_csv_textcorpus, join_corpus_files, get_macsen_textcorpus + from argparse import ArgumentParser, RawTextHelpFormatter DESCRIPTION = """ @@ -44,33 +39,18 @@ ALPHABET_FILE_PATH = "/DeepSpeech/bin/bangor_welsh/alphabet.txt" -TESTSET_URL = "https://git.techiaith.bangor.ac.uk/data-corpws-mewnol/corpws-profi-deepspeech" +TESTSET_URL = "https://git.techiaith.bangor.ac.uk/data-corpws-mewnol/corpws-profi-deepspeech" +MACSEN_TEXT_CORPUS_URL = "https://api.techiaith.org/assistant/get_all_sentences" -def clone_testset(target_testset_dir): +def clone_bangor_testset(target_testset_dir): Repo.clone_from(TESTSET_URL, target_testset_dir) def get_commonvoice_textcorpus(commonvoice_validated_csv_file_path, lm_data_root_dir): - - print ("Extracting texts from CommonVoice: %s " % commonvoice_validated_csv_file_path) - if not os.path.isfile(commonvoice_validated_csv_file_path): - print ("Proceeding with missing file %s " % commonvoice_validated_csv_file_path) - target_dir = os.path.join(lm_data_root_dir, 
'commonvoice') - Path(target_dir).mkdir(parents=True, exist_ok=True) - - corpus_file_path = os.path.join(target_dir, "corpus.txt") - - df = pandas.read_csv(commonvoice_validated_csv_file_path, encoding='utf-8', sep='\t', header=0, dtype={'sentence':str}) - sentences = df['sentence'] - - with open(corpus_file_path, 'w', encoding='utf-8') as corpus_file: - for t in sentences: - corpus_file.write(t + "\n") - - return clean_text_corpus(target_dir) + import_csv_textcorpus(commonvoice_validated_csv_file_path, target_dir) @@ -94,203 +74,6 @@ def get_oscar_textcorpus(oscar_archive_file_path, lm_data_root_dir): - -def get_macsen_textcorpus(lm_data_root_dir): - - target_dir = os.path.join(lm_data_root_dir, 'macsen') - Path(target_dir).mkdir(parents=True, exist_ok=True) - - json_data = json.loads(requests.get("https://api.techiaith.org/assistant/get_all_sentences").text) - with open(os.path.join(target_dir, "corpus.txt"), 'w', encoding='utf-8') as macsen_file_out: - for s in json_data["result"]: - macsen_file_out.write(s[0] + "\n") - - return clean_text_corpus(target_dir) - - - -def clean_text_corpus(lm_data_root_dir): - - print ("Cleaning corpus files in %s " % lm_data_root_dir) - - source_text_file_path = os.path.join(lm_data_root_dir, "corpus.txt") - output_text_file_path = os.path.join(lm_data_root_dir, "corpus.clean.txt") - - ooa_text_file_path = source_text_file_path.replace(".txt", ".ooa.txt") - clean = clean_transcript(ALPHABET_FILE_PATH, ooa_text_file_path) - - with open(output_text_file_path, 'w', encoding='utf-8') as out_file: - with open(source_text_file_path, 'r', encoding='utf-8') as in_file: - for i, transcript in enumerate(tqdm(in_file)): - cleaned, transcript = clean.clean(transcript) - if cleaned: - out_file.write(transcript.lower() + "\n") - - return output_text_file_path - - -def get_directory_structure(rootdir): - dir = {} - rootdir = rootdir.rstrip(os.sep) - start = rootdir.rfind(os.sep) + 1 - for path, dirs, files in os.walk(rootdir, followlinks=True): 
- folders = path[start:].split(os.sep) - subdir = dict.fromkeys(files) - parent = functools.reduce(dict.get, folders[:-1], dir) - parent[folders[-1]] = subdir - - return dir - - -def join_corpus_files(corpus_files, target_languagemodel_data_root_dir): - - print ("Join corpus text files %s " % corpus_files) - - corpus_file_path = os.path.join(target_languagemodel_data_root_dir, "corpus.txt") - - with open(corpus_file_path, 'w', encoding='utf-8') as corpus_outfile: - for fname in corpus_files: - with open(fname, 'r', encoding='utf-8') as corpus_infile: - for line in corpus_infile: - corpus_outfile.write(line) - - return clean_text_corpus(target_languagemodel_data_root_dir) - - -def import_textgrid_test(target_data_root_dir): - - print ("Importing transcripts from files in %s " % target_data_root_dir) - - target_clips_dir = os.path.join(target_data_root_dir, "clips") - Path(target_clips_dir).mkdir(parents=True, exist_ok=True) - - csv_file_path = os.path.join(target_data_root_dir, 'deepspeech.csv') - textgrid_file_path = os.path.join(target_data_root_dir, 'sain.TextGrid') - audio_file = AudioSegment.from_wav(os.path.join(target_data_root_dir, 'sain.wav')) - - moz_fieldnames = ['wav_filename', 'wav_filesize', 'transcript'] - with open(csv_file_path, 'w', encoding='utf-8') as csv_file: - - csv_file_out = csv.DictWriter(csv_file, fieldnames=moz_fieldnames) - csv_file_out.writeheader() - - ooa_text_file_path = os.path.join(target_data_root_dir, 'deepspeech.ooa.txt') - clean = clean_transcript(ALPHABET_FILE_PATH, ooa_text_file_path) - - tg = tgio.openTextgrid(textgrid_file_path) - entryList = tg.tierDict["utterance"].entryList - - for interval in entryList: - text = interval.label - cleaned, transcript = clean.clean(text) - - if cleaned and len(transcript)>0: - transcript = transcript.lower() - - start = float(interval.start) * 1000 - end = float(interval.end) * 1000 - - #print (start, end, transcript) - - split_audio = audio_file[start:end] - hashId = 
hashlib.md5(transcript.encode('utf-8')).hexdigest() - wav_segment_filepath = os.path.join(target_clips_dir, hashId + ".wav") - split_audio.export(wav_segment_filepath, format="wav") - - csv_file_out.writerow({ - 'wav_filename':wav_segment_filepath, - 'wav_filesize':os.path.getsize(wav_segment_filepath), - 'transcript':transcript - }) - - return pandas.read_csv(csv_file_path, delimiter=',', encoding='utf-8') - - -def import_textgrid_test(target_data_root_dir): - - print ("Importing transcripts from files in %s " % target_data_root_dir) - - target_clips_dir = os.path.join(target_data_root_dir, "clips") - Path(target_clips_dir).mkdir(parents=True, exist_ok=True) - - csv_file_path = os.path.join(target_data_root_dir, 'deepspeech.csv') - textgrid_file_path = os.path.join(target_data_root_dir, 'sain.TextGrid') - audio_file = AudioSegment.from_wav(os.path.join(target_data_root_dir, 'sain.wav')) - - moz_fieldnames = ['wav_filename', 'wav_filesize', 'transcript'] - with open(csv_file_path, 'w', encoding='utf-8') as csv_file: - - csv_file_out = csv.DictWriter(csv_file, fieldnames=moz_fieldnames) - csv_file_out.writeheader() - - ooa_text_file_path = os.path.join(target_data_root_dir, 'deepspeech.ooa.txt') - clean = clean_transcript(ALPHABET_FILE_PATH, ooa_text_file_path) - - tg = tgio.openTextgrid(textgrid_file_path) - entryList = tg.tierDict["utterance"].entryList - - for interval in entryList: - text = interval.label - cleaned, transcript = clean.clean(text) - - if cleaned and len(transcript)>0: - transcript = transcript.lower() - - start = float(interval.start) * 1000 - end = float(interval.end) * 1000 - - #print (start, end, transcript) - - split_audio = audio_file[start:end] - hashId = hashlib.md5(transcript.encode('utf-8')).hexdigest() - wav_segment_filepath = os.path.join(target_clips_dir, hashId + ".wav") - split_audio.export(wav_segment_filepath, format="wav") - - csv_file_out.writerow({ - 'wav_filename':wav_segment_filepath, - 
'wav_filesize':os.path.getsize(wav_segment_filepath), - 'transcript':transcript - }) - - return pandas.read_csv(csv_file_path, delimiter=',', encoding='utf-8') - - -def import_clips_dir(target_testset_dir, **args): - - print ("Importing clips dir in %s " % target_testset_dir) - - arddweud_root_dir = get_directory_structure(os.path.join(target_testset_dir, "clips")) - - csv_file_path = os.path.join(target_testset_dir, 'deepspeech.csv') - print (csv_file_path) - - moz_fieldnames = ['wav_filename', 'wav_filesize', 'transcript'] - csv_file_out = csv.DictWriter(open(csv_file_path, 'w', encoding='utf-8'), fieldnames=moz_fieldnames) - csv_file_out.writeheader() - - ooa_text_file_path = os.path.join(target_testset_dir, 'deepspeech.ooa.txt') - clean = clean_transcript(ALPHABET_FILE_PATH, ooa_text_file_path) - - for filename in arddweud_root_dir["clips"]: - if filename.endswith(".wav"): - wavfilepath = os.path.join(target_testset_dir, "clips", filename) - txtfilepath = wavfilepath.replace(".wav", ".txt") - with open(txtfilepath, "r", encoding='utf-8') as txtfile: - transcript = txtfile.read() - cleaned, transcript = clean.clean(transcript) - if cleaned: - transcript = transcript.lower() - if audio.downsample_wavfile(wavfilepath): - # print (wavfilepath) - csv_file_out.writerow({ - 'wav_filename':wavfilepath, - 'wav_filesize':os.path.getsize(wavfilepath), - 'transcript':transcript - }) - - return pandas.read_csv(csv_file_path, delimiter=',', encoding='utf-8') - - def import_macsen_testset(target_testset_dir, **args): print ("Importing Macsen test sets") @@ -328,53 +111,63 @@ def import_macsen_testset(target_testset_dir, **args): - -def main(target_bangor_root_dir, oscar_archive_file_path, commonvoice_validated_csv_file_path, **args): +def main(bangor_target_root_dir, oscar_archive_file_path, commonvoice_root_dir, **args): # - target_testset_root_dir = os.path.join(target_bangor_root_dir, "testsets") - target_languagemodel_data_root_dir = os.path.join(target_bangor_root_dir, 
"lm-data") + target_testset_root_dir = os.path.join(bangor_target_root_dir, "testsets") + + target_languagemodel_data_root_dir = os.path.join(bangor_target_root_dir, "lm-data") + Path(target_languagemodel_data_root_dir).mkdir(parents=True, exist_ok=True) - clone_testset(target_testset_root_dir) + # Bangor testset contains tests for Macsen (digital assistant) and more general purpose transcription + clone_bangor_testset(target_testset_root_dir) + + # import Macsen testset into our environment + import_macsen_testset(os.path.join(target_testset_root_dir, "data", "macsen")) - # import audio and transcripts + # import transcription resources from bangor testset df_csvs=[] - df_csvs.append(import_textgrid_test(os.path.join(target_testset_root_dir, "data", "trawsgrifio", "OpiwHxPPqRI"))) - df_csvs.append(import_clips_dir(os.path.join(target_testset_root_dir, "data", "trawsgrifio", "arddweud_200617"))) - - ## + csv_OpiwHxPPqRI_file_path = import_textgrid(os.path.join(target_testset_root_dir, "data", "trawsgrifio", "OpiwHxPPqRI"), "sain.wav", "sain.TextGrid") + df_csvs.append(pandas.read_csv(csv_OpiwHxPPqRI_file_path, delimiter=',', encoding='utf-8')) + + csv_arddweud_200617_file_path=import_clips_dir(os.path.join(target_testset_root_dir, "data", "trawsgrifio", "arddweud_200617")) + df_csvs.append(pandas.read_csv(csv_arddweud_200617_file_path, delimiter=',', encoding='utf-8')) + + + ## merge sub-tests into one bigger test df_all_transcript_csvs = pandas.concat(df_csvs) df_all_transcript_csvs.to_csv(os.path.join(target_testset_root_dir, "data", "trawsgrifio", "deepspeech.csv"), encoding='utf-8', index=False) + print ("Testsets ready at %s " % target_testset_root_dir) - # - import_macsen_testset(os.path.join(target_testset_root_dir, "data", "macsen")) + # Resources for building language models - # - get_macsen_textcorpus(target_languagemodel_data_root_dir) + # language model for Macsen downloaded from API.. 
+ get_macsen_textcorpus(MACSEN_TEXT_CORPUS_URL, target_languagemodel_data_root_dir) + print ("Macen text corpus ready at %s " % target_languagemodel_data_root_dir ) - # + # language model for transcription made up of multiple text sources.. corpus_files = [] - corpus_files.append(get_oscar_textcorpus(oscar_archive_file_path, target_languagemodel_data_root_dir)) + corpus_files.append(get_oscar_textcorpus(oscar_archive_file_path, target_languagemodel_data_root_dir)) + + commonvoice_validated_csv_file_path = os.path.join(commonvoice_root_dir, "validated.tsv") corpus_files.append(get_commonvoice_textcorpus(commonvoice_validated_csv_file_path, target_languagemodel_data_root_dir)) - # corpus_file_path = join_corpus_files(corpus_files, target_languagemodel_data_root_dir) - print ("Text corpus ready at %s " % corpus_file_path ) - + print ("Transcription text corpus ready at %s " % corpus_file_path ) - # - print ("Import Bangor data to %s finished." % (target_testset_root_dir)) if __name__ == "__main__": - parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) - parser.add_argument("-b", dest="target_bangor_root_dir", default="/data/bangor") - parser.add_argument("-o", dest="oscar_archive_file_path", required=True) - parser.add_argument("-c", dest="commonvoice_validated_csv_file_path", required=True) + parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) + + parser.add_argument("--bangor_dir", dest="bangor_target_root_dir", default="/data/bangor") + parser.add_argument("--oscar_archive", dest="oscar_archive_file_path", required=True) + parser.add_argument("--cv_dir", dest="commonvoice_root_dir", required=True) + parser.set_defaults(func=main) args = parser.parse_args() args.func(**vars(args)) diff --git a/local/import_custom_macsen_text_corpus.py b/local/import_custom_macsen_text_corpus.py new file mode 100755 index 0000000..3691cb8 --- /dev/null +++ b/local/import_custom_macsen_text_corpus.py @@ -0,0 +1,60 
@@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import csv +import gzip +import pandas +import requests +import json + +from enum import Enum + +import gzip +import shutil +import functools + +from urllib.parse import urlparse + +from pathlib import Path +from git import Repo + +import utils.kfold as kfold +import utils.audio as audio + +from utils.clean_transcript import clean_transcript + +from utils.imports import import_textgrid, import_clips_dir, get_directory_structure +from utils.corpus import clean_text_corpus, import_csv_textcorpus, join_corpus_files, get_macsen_textcorpus + +from argparse import ArgumentParser, RawTextHelpFormatter + + +DESCRIPTION = """ +Llwytho i lawr set data amcan Macsen + +Mae angen rhoid lleoliad i ffeil alphabet.txt + +© Prifysgol Bangor University + +""" + + +def main(target_destination_root_dir, url_to_custom_macsen_skills_server, **args): + + # + target_languagemodel_data_root_dir = os.path.join(target_destination_root_dir, "lm-data") + Path(target_languagemodel_data_root_dir).mkdir(parents=True, exist_ok=True) + + # language model for Macsen downloaded from API.. + get_macsen_textcorpus(url_to_custom_macsen_skills_server, target_languagemodel_data_root_dir) + print ("Custom Macsen text corpus ready at %s " % target_languagemodel_data_root_dir ) + + +if __name__ == "__main__": + + parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) + parser.add_argument("--output_dir", dest="target_destination_root_dir", default="/data/custom") + parser.add_argument("--url", dest="url_to_custom_macsen_skills_server", help="custom macsen skills server e.g. 
https://mywebsite.com/assistant/get_all_sentences", required=True) +    parser.set_defaults(func=main) +    args = parser.parse_args() +    args.func(**vars(args)) diff --git a/local/import_cv_archive.py b/local/import_cv_archive.py index 224f1d8..4d4ab46 100755 --- a/local/import_cv_archive.py +++ b/local/import_cv_archive.py @@ -3,23 +3,18 @@ import os import sys import tarfile -import pandas -import csv import shlex import shutil import subprocess import glob from pathlib import Path -from utils.clean_transcript import clean_transcript from argparse import ArgumentParser, RawTextHelpFormatter DESCRIPTION = """ """ -ALPHABET_FILE_PATH = "/DeepSpeech/bin/bangor_welsh/alphabet.txt" - def extract(source_tar_gz, target_dir): @@ -39,44 +34,10 @@ def extract(source_tar_gz, target_dir): shutil.move(file_path, target_dir) - -def panda_group(df, column, destination_file_path): - - df_grp_client = df.groupby(column).size().to_frame('count').reset_index() - df_grp_client = df_grp_client.sort_values("count", ascending=False) - df_grp_client.to_csv(destination_file_path) - - - -def analyze_tsvs(cv_root_dir): - #client_id path sentence up_votes down_votes age gender accent locale segment - tsv_files = Path(cv_root_dir).glob("*.tsv") - for tsv_file_path in tsv_files: - - print ("Analyzing %s " % tsv_file_path) - - if 'reported.tsv' in str(tsv_file_path): - continue - - df = pandas.read_csv(tsv_file_path, encoding='utf-8', sep='\t', header=0, dtype={'gender':str}) - - panda_group(df, 'client_id', str(tsv_file_path).replace(".tsv",".counts.client_id.txt")) - panda_group(df, 'sentence', str(tsv_file_path).replace(".tsv",".counts.sentence.txt")) - - panda_group(df, 'age', str(tsv_file_path).replace(".tsv",".counts.age.txt")) - panda_group(df, 'gender', str(tsv_file_path).replace(".tsv",".counts.gender.txt")) - - # analyze clients by age and gender.... 
- - - def main(cv_archive_file_path, cv_root_dir, **args): extract(cv_archive_file_path, cv_root_dir) - # - analyze_tsvs(cv_root_dir) - # print ("Preparing for DeepSpeech with import_cv2.py") cmd = "python3 /DeepSpeech/bin/import_cv2.py %s --validate_label_locale /DeepSpeech/bin/bangor_welsh/utils/validate_label_locale.py" % (cv_root_dir) diff --git a/local/import_fine_tuning_resources.py b/local/import_fine_tuning_resources.py new file mode 100755 index 0000000..c8e2b66 --- /dev/null +++ b/local/import_fine_tuning_resources.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import pandas +import functools + +from pathlib import Path + +from utils.imports import import_textgrid, import_srt +from utils.corpus import import_csv_textcorpus, join_corpus_files +from utils.kfold import create_kfolds + +from argparse import ArgumentParser, RawTextHelpFormatter + +DESCRIPTION = """ +Paratoi adnoddau ychwanegol ar gyfer hyfforddi rhagor. + +© Prifysgol Bangor University + +""" +def create_kfolds_and_lm_data(bangor_data_root_dir, csv_file_path, target_kfolds_dir): + create_kfolds(csv_file_path, target_kfolds_dir, 10) + + target_languagemodel_data_root_dir = os.path.join(target_kfolds_dir, "lm-data") + Path(target_languagemodel_data_root_dir).mkdir(parents=True, exist_ok=True) + + base_text_corpus_file_path = os.path.join(bangor_data_root_dir, "lm-data", "corpus.clean.txt") + for kfold_csv in os.listdir(target_kfolds_dir): + if kfold_csv.startswith("train_"): + corpus_files = [] + corpus_files.append(import_csv_textcorpus(os.path.join(target_kfolds_dir, kfold_csv), target_languagemodel_data_root_dir)) + corpus_files.append(base_text_corpus_file_path) + corpus_file_path = join_corpus_files(corpus_files, target_languagemodel_data_root_dir, "corpus.%s.union.clean.txt" % kfold_csv.replace(".csv","")) + print ("lm-data for kfold %s created in %s" % (kfold_csv, corpus_file_path)) + + +def main(bangor_data_root_dir, finetune_data_file, 
target_finetuning_root_dir, **args): + + target_languagemodel_data_root_dir = os.path.join(target_finetuning_root_dir, "lm-data") + Path(target_languagemodel_data_root_dir).mkdir(parents=True, exist_ok=True) + + target_csv_file_path = os.path.join(target_finetuning_root_dir, "deepspeech.csv") + + # + transcribed_clips = [] + with open(finetune_data_file, 'r', encoding='utf-8') as finetune_files: + for finetune_file_path in finetune_files: + if finetune_file_path.startswith("#"): + continue + + finetune_file_path = finetune_file_path.rstrip() + if not os.path.isfile(finetune_file_path): + continue + + if finetune_file_path.endswith(".TextGrid"): + transcribed_clips.append(import_textgrid(target_csv_file_path, finetune_file_path)) + elif finetune_file_path.endswith(".srt"): + transcribed_clips.append(import_srt(target_csv_file_path, finetune_file_path)) + + df_transcribed_clips = pandas.concat(transcribed_clips) + df_transcribed_clips.to_csv(target_csv_file_path, index=False) + + # collect transcriptions into additions for fine tune training a language model + base_text_corpus_file_path = os.path.join(bangor_data_root_dir, "lm-data", "corpus.clean.txt") + corpus_files = [] + corpus_files.append(import_csv_textcorpus(target_csv_file_path, target_languagemodel_data_root_dir)) + corpus_files.append(base_text_corpus_file_path) + corpus_file_path = join_corpus_files(corpus_files, target_languagemodel_data_root_dir, "corpus.union.clean.txt") + + # create k-folds for determining new WER from fine tuned data. + create_kfolds_and_lm_data(bangor_data_root_dir, target_csv_file_path, os.path.join(target_finetuning_root_dir, "kfolds")) + + # + print ("Import fine tuning data to %s finished." 
% (target_finetuning_root_dir)) +    print ("Corpus for fine tuning language model is at %s" % corpus_file_path) + + +if __name__ == "__main__": + +    parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) + +    parser.add_argument("--bangor_dir", dest="bangor_data_root_dir", default="/data/bangor") +    parser.add_argument("--target_dir", dest="target_finetuning_root_dir", help="target folder for all finetuning resources. should have an accompanying wav file (of the same name but with .wav extension)", default="/data/finetuning") +    parser.add_argument("--finetuning_data_file", dest="finetune_data_file", help="File containing paths (one per line) to srt and/or TextGrid files", required=True) +    # parser.add_argument("-c", dest="base_text_corpus_file_path", help=" file path to a text corpus that will be fined tuned e.g. OSCAR corpus", required=True) + +    parser.set_defaults(func=main) +    args = parser.parse_args() +    args.func(**vars(args)) diff --git a/local/optimize_lm_scorer.sh b/local/optimize_lm_scorer.sh index b0ff30d..42e511e 100755 --- a/local/optimize_lm_scorer.sh +++ b/local/optimize_lm_scorer.sh @@ -3,39 +3,99 @@ set -e set -u set -o pipefail -lm_dir='' -test_files='' +help() +{ + echo + echo "Rhedeg sgriptiau optimeiddio modelau iaith KenLM" + echo "Run scripts for optimizing KenLM language models" + echo + echo "Usage: $ `basename $0` [OPTIONS]" + echo + echo "Options:" + echo + echo " -c, --csv_test_file Path csv file containing transcriptions with paths to clips" + echo " -d, --domain Name for language model domain" + echo " -p, --checkpoint_dir Path to checkpoint directory (optional)" + echo + exit 0 +} -VOCAB_SIZE=50000 +lm_domain='' +test_file='' +checkpoint_cy_dir='' +SHORT=hd:c:p: +LONG=help,domain:,csv_test_file:,checkpoint_dir: + +# read options +OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@") + +if [ $? 
!= 0 ] ; then + echo "Failed to parse options...exiting." >&2 ; + exit 1 ; +fi -while getopts ":l:t:" opt; do - case $opt in - l) - lm_dir=$OPTARG - ;; - t) - test_files=$OPTARG - ;; - \?) echo "Invalid option -$OPTARG" >&2 - ;; +eval set -- "$OPTS" + +while true ; do + case "$1" in + -d | --domain ) + lm_domain="$2" + shift 2 + ;; + -c | --csv_test_file ) + test_file="$2" + shift 2 + ;; + -p | --checkpoint_dir ) + checkpoint_cy_dir="$2" + shift 2 + ;; + -h | --help ) + help + shift + ;; + -- ) + shift + break + ;; + *) + help + exit 1 + ;; esac done -shift "$(($OPTIND -1))" -if [ -z "${lm_dir}" ]; then - echo "-l lm_dir not set" +if [ -z "${test_file}" ]; then + echo "--csv_test_file missing. Use `basename $0` -h for more info." exit 2 fi -if [ -z "$test_files" ]; then - echo "-t test_files not set" - exit 2 + +if [ -z "${lm_domain}" ]; then + echo "--domain missing. Use `basename $0` -h for more info." + exit 2 +fi + +if [ -z "$checkpoint_cy_dir" ]; then + checkpoint_cy_dir=/checkpoints/cy + echo "-p|--checkpoint_dir not set. Setting to ${checkpoint_cy_dir} " fi -checkpoint_cy_dir=/checkpoints/cy -cd ${lm_dir} +# If checkpoint dir is empty, copy pretrained from techiaith.. +pretrained_checkpoint_dir=/checkpoints/techiaith +if [ ! 
"$(ls -A ${checkpoint_cy_dir})" ]; then + cp -r ${pretrained_checkpoint_dir} ${checkpoint_cy_dir} +fi + + +VOCAB_SIZE=50000 +alphabet_file_path=/DeepSpeech/bin/bangor_welsh/alphabet.txt +output_dir=/export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE} + + +cd ${output_dir} + # Force UTF-8 output export PYTHONIOENCODING=utf-8 @@ -44,10 +104,10 @@ echo "########################################################################## echo "#### Determine optimal alpha and beta parameters ####" echo "####################################################################################" python /DeepSpeech/lm_optimizer.py \ - --test_files ${test_files} \ + --test_files ${test_file} \ --checkpoint_dir ${checkpoint_cy_dir} \ --alphabet_config_path ${alphabet_file_path} \ - --scorer kenlm.scorer + --scorer kenlm.${lm_domain}.scorer echo "####################################################################################" @@ -58,14 +118,11 @@ read -p "Enter best default beta: " beta echo "####################################################################################" -echo "#### Generating package with optimal alpha and beta ####" +echo "#### saving optimal alpha and beta values. 
run build_lm_scorer.sh once more ####" echo "####################################################################################" -/DeepSpeech/native_client/generate_scorer_package \ - --alphabet "${alphabet_file_path}" \ - --lm lm.binary \ - --vocab vocab-${VOCAB_SIZE}.txt \ - --package kenlm.scorer \ - --default_alpha ${alpha} \ - --default_beta ${beta} +default_alpha_file="${output_dir}/optimal_alpha.${lm_domain}.txt" +default_beta_file="${output_dir}/optimal_beta.${lm_domain}.txt" +echo ${alpha} > ${default_alpha_file} +echo ${beta} > ${default_beta_file} cd - diff --git a/local/run_tl_cv_cy.sh b/local/run_tl_cv_cy.sh index 2ef07b2..a528824 100755 --- a/local/run_tl_cv_cy.sh +++ b/local/run_tl_cv_cy.sh @@ -1,27 +1,68 @@ #!/bin/bash set -e -### -csv_dir='' -while getopts ":a:" opt; do - case $opt in - a) - csv_dir=$OPTARG - ;; - \?) echo "Invalid option -$OPTARG" >&2 - ;; +help() +{ + echo + echo "Rhedeg sgriptiau hyfforddi modelau acwstig DeepSpeech gyda data o CommonVoice" + echo "Run scripts for training DeepSpeech acoustic models with data from CommonVoice" + echo + echo "Syntax: $ `basename $0` [OPTIONS]" + echo + echo "Options:" + echo + echo " -c, --cv_dir Path to CommonVoice data directory " + echo " (imported with import_cv_archive.py) " + echo + exit 0 +} + + +SHORT=hc: +LONG=help,cv_dir: + +cv_dir='' + + +# read options +OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@") + +if [ $? != 0 ] ; then + echo "Failed to parse options...exiting." >&2 ; + exit 1 ; +fi + +eval set -- "$OPTS" + +while true ; do + case "$1" in + -c | --cv_dir ) + cv_dir="$2" + shift 2 + ;; + -h | --help) + help + shift + ;; + -- ) + shift + break + ;; + *) + help + exit 1 + ;; esac done -shift "$(($OPTIND -1))" - -if [ -z "${csv_dir}" ]; then - echo "-a csv_dir not set" +if [ -z "${cv_dir}" ]; then + echo "--cv_dir missing. Use `basename $0` -h for more info." 
exit 2 fi + ### -model_name='bangor-welsh' +model_name='bangor' model_language='cy-Latn-GB' model_license='MPL' model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice December 2020 release.' @@ -32,7 +73,7 @@ model_contact_info='techiaith@bangor.ac.uk' echo echo "####################################################################################" echo " model_name : ${model_name}" -echo " model_language : ${cy-Latn-GB}" +echo " model_language : ${model_language}" echo " model_license : ${model_license}" echo " model_description : ${model_description}" echo " model_author : ${model_author}" @@ -42,8 +83,9 @@ echo " DeepSpeech Version : ${DEEPSPEECH_RELEASE} " echo "####################################################################################" echo + ### -train_files=${csv_dir}/validated.csv,${csv_dir}/other.csv +train_files=${cv_dir}/clips/validated.csv,${cv_dir}/clips/other.csv alphabet_cy_file=/DeepSpeech/bin/bangor_welsh/alphabet.txt checkpoint_dir=/checkpoints @@ -73,27 +115,27 @@ echo "#### Transfer to WELSH model with --save_checkpoint_dir --load_checkpoint_ echo "####################################################################################" set -x python -u DeepSpeech.py \ - --train_files "${train_files}" --train_batch_size 64 \ + --train_files "${train_files}" \ + --train_batch_size 64 \ --drop_source_layers 2 \ --epochs 10 \ --alphabet_config_path "${alphabet_cy_file}" \ - --save_checkpoint_dir "${checkpoint_cy_dir}" \ - --load_checkpoint_dir "${checkpoint_en_dir}" - + --load_checkpoint_dir "${checkpoint_en_dir}" \ + --save_checkpoint_dir "${checkpoint_cy_dir}" + set +x echo echo "####################################################################################" -echo "#### Export new Welsh checkpoint to frozen model ####" +echo "#### Export new Welsh checkpoint to frozen model ####" echo 
"####################################################################################" set -x python -u DeepSpeech.py \ --train_files "${train_files}" --train_batch_size 64 \ --epochs 1 \ --alphabet_config_path "${alphabet_cy_file}" \ - --save_checkpoint_dir "${checkpoint_cy_dir}" \ --load_checkpoint_dir "${checkpoint_cy_dir}" \ - --remove_export \ + --save_checkpoint_dir "${checkpoint_cy_dir}" \ --export_dir "${export_dir}" \ --export_author_id "${model_author}" \ --export_model_name "${model_name}" \ diff --git a/local/run_tl_cv_moz_sets_cy.sh b/local/run_tl_cv_moz_sets_cy.sh new file mode 100755 index 0000000..3ed843e --- /dev/null +++ b/local/run_tl_cv_moz_sets_cy.sh @@ -0,0 +1,157 @@ +#!/bin/bash +set -e + +help() +{ + echo + echo "Rhedeg sgriptiau hyfforddi modelau acwstig DeepSpeech gyda setiau data Mozilla o CommonVoice" + echo "Run scripts for training DeepSpeech acoustic models with Mozilla prescribed datasets from CommonVoice" + echo + echo "Syntax: $ `basename $0` [OPTIONS]" + echo + echo "Options:" + echo + echo " -c, --cv_dir Path to CommonVoice data directory " + echo " (imported with import_cv_archive.py) " + echo + exit 0 +} + + +SHORT=hc: +LONG=cv_dir: + +cv_dir='' + + +# read options +OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@") + +if [ $? != 0 ] ; then + echo "Failed to parse options...exiting." >&2 ; + exit 1 ; +fi + +eval set "$OPTS" + +while true ; do + case "$1" in + -c | --cv_dir ) + cv_dir="$2" + shift 2 + ;; + -h) + help + shift + ;; + *) + help + exit 1 + ;; + esac +done + + +### +model_name='bangor-mozilla-welsh' +model_language='cy-Latn-GB' +model_license='MPL' +model_description='Welsh language acoustic model trained using transfer learning and Mozilla''s prescribed CommonVoice datasets for training, validation and testing.' 
+model_author='techiaith'
+model_contact_info='techiaith@bangor.ac.uk'
+
+echo
+echo "####################################################################################"
+echo " model_name         : ${model_name}"
+echo " model_language     : ${model_language}"
+echo " model_license      : ${model_license}"
+echo " model_description  : ${model_description}"
+echo " model_author       : ${model_author}"
+echo " model_contact_info : ${model_contact_info}"
+echo " model_version      : ${TECHIAITH_RELEASE} "
+echo " DeepSpeech Version : ${DEEPSPEECH_RELEASE} "
+echo "####################################################################################"
+echo
+
+###
+train_files=${cv_dir}/train.csv
+devset_files=${cv_dir}/dev.csv
+test_files=${cv_dir}/test.csv
+
+alphabet_cy_file=/DeepSpeech/bin/bangor_welsh/alphabet.txt
+
+checkpoint_dir=/checkpoints
+export_dir=/export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE}
+
+
+### Force UTF-8 output
+export PYTHONIOENCODING=utf-8
+
+checkpoint_en_dir="${checkpoint_dir}/en"
+checkpoint_cy_dir="${checkpoint_dir}/cy-moz"
+
+rm -rf ${checkpoint_en_dir}
+rm -rf ${checkpoint_cy_dir}
+rm -rf ${export_dir}
+
+mkdir -p ${checkpoint_en_dir}
+mkdir -p ${checkpoint_cy_dir}
+mkdir -p ${export_dir}
+
+cp -r /checkpoints/mozilla/deepspeech-en-checkpoint/ $checkpoint_en_dir
+
+###
+echo
+echo "####################################################################################"
+echo "#### Transfer to WELSH model with --save_checkpoint_dir --load_checkpoint_dir  ####"
+echo "####################################################################################"
+set -x
+python -u DeepSpeech.py \
+    --train_files "${train_files}" \
+    --dev_files "${devset_files}" \
+    --train_batch_size 24 \
+    --drop_source_layers 2 \
+    --epochs 10 \
+    --alphabet_config_path "${alphabet_cy_file}" \
+    --load_checkpoint_dir "${checkpoint_en_dir}" \
+    --save_checkpoint_dir "${checkpoint_cy_dir}"
+
+
+set +x
+echo
+echo 
"####################################################################################" +echo "#### Export new Welsh checkpoint to frozen model ####" +echo "####################################################################################" +set -x +python -u DeepSpeech.py \ + --train_files "${train_files}" \ + --train_batch_size 64 \ + --test_files "${test_files}" \ + --epochs 1 \ + --alphabet_config_path "${alphabet_cy_file}" \ + --save_checkpoint_dir "${checkpoint_cy_dir}" \ + --load_checkpoint_dir "${checkpoint_cy_dir}" \ + --remove_export \ + --export_dir "${export_dir}" \ + --export_author_id "${model_author}" \ + --export_model_name "${model_name}" \ + --export_model_version "${TECHIAITH_RELEASE}" \ + --export_contact_info "${model_contact_info}" \ + --export_license "${model_license}" \ + --export_language "${model_language}" \ + --export_min_ds_version "${DEEPSPEECH_RELEASE}" \ + --export_max_ds_version "${DEEPSPEECH_RELEASE}" \ + --export_description "${model_description}" + +### +/DeepSpeech/native_client/convert_graphdef_memmapped_format \ + --in_graph=${export_dir}/output_graph.pb \ + --out_graph=${export_dir}/output_graph.pbmm + + +set +x +echo +echo "####################################################################################" +echo "#### Exported acoustic models (.pb/.pbmm files) can be found in ${export_dir} " +echo "####################################################################################" +set -x diff --git a/local/train_tl.sh b/local/train_tl.sh new file mode 100755 index 0000000..d6d5f8e --- /dev/null +++ b/local/train_tl.sh @@ -0,0 +1,121 @@ +#!/bin/bash +set -e + +help() +{ + echo + echo "Rhedeg sgriptiau hyfforddi modelau acwstig DeepSpeech gyda data o CommonVoice" + echo "Run scripts for training DeepSpeech acoustic models with data from CommonVoice" + echo + echo "Syntax: $ `basename $0` [OPTIONS]" + echo + echo "Options:" + echo + echo " -c, --csv_files One or more CSV files to be used for training" + echo " -s, 
--save_checkpoint_dir Path to directory for saving checkpoints (Optional. Default /checkoints/cy)" + echo + exit 0 +} + + +SHORT=hc:s: +LONG=help,csv_files:,save_checkpoint_dir: + +csv_files='' +save_checkpoint_dir='' + +# read options +OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@") + + +if [ $? != 0 ] ; then + echo "Failed to parse options...exiting." >&2 ; + exit 1 ; +fi + +eval set -- "$OPTS" + +while true ; do + case "$1" in + -c | --csv_files ) + csv_files="$2" + shift 2 + ;; + -s | --save_checkpoint_dir ) + save_checkpoint_dir="$2" + shift 2 + ;; + -h | --help) + help + shift + ;; + -- ) + shift + break + ;; + *) + help + exit 1 + ;; + esac +done + +# +if [ -z "${csv_files}" ]; then + echo "--csv_files missing. Use `basename $0` -h for more info." + exit 2 +fi + + +### +checkpoint_dir=/checkpoints +checkpoint_en_dir="${checkpoint_dir}/en" +checkpoint_cy_dir=${save_checkpoint_dir} +if [ -z "${save_checkpoint_dir}" ]; then + checkpoint_cy_dir="${checkpoint_dir}/cy" +fi + + +### +alphabet_cy_file=/DeepSpeech/bin/bangor_welsh/alphabet.txt + + +### Force UTF-8 output +export PYTHONIOENCODING=utf-8 + +rm -rf ${checkpoint_en_dir} +rm -rf ${checkpoint_cy_dir} + +mkdir -p ${checkpoint_en_dir} +mkdir -p ${checkpoint_cy_dir} + +cp -r /checkpoints/mozilla/deepspeech-en-checkpoint/ $checkpoint_en_dir + +### +echo +echo "####################################################################################" +echo "#### Transfer to WELSH model with --save_checkpoint_dir --load_checkpoint_dir ####" +echo "####################################################################################" +set -x +python -u DeepSpeech.py \ + --train_files "${csv_files}" \ + --train_batch_size 64 \ + --drop_source_layers 2 \ + --epochs 10 \ + --alphabet_config_path "${alphabet_cy_file}" \ + --load_checkpoint_dir "${checkpoint_en_dir}" \ + --save_checkpoint_dir "${checkpoint_cy_dir}" + + +set +x +echo +echo 
"####################################################################################" +echo "#### Export new Welsh checkpoint to frozen model ####" +echo "####################################################################################" +set -x +python -u DeepSpeech.py \ + --train_files "${train_files}" --train_batch_size 64 \ + --epochs 1 \ + --alphabet_config_path "${alphabet_cy_file}" \ + --load_checkpoint_dir "${checkpoint_cy_dir}" \ + --save_checkpoint_dir "${checkpoint_cy_dir}" diff --git a/local/utils/corpus.py b/local/utils/corpus.py new file mode 100755 index 0000000..ff37213 --- /dev/null +++ b/local/utils/corpus.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import csv +import hashlib +from typing import ContextManager +import srt +import pandas +import requests + +import functools + +from tqdm import tqdm +from pydub import AudioSegment + +from datetime import datetime, timedelta +from pathlib import Path +from praatio import tgio + +from .clean_transcript import clean_transcript + + +ALPHABET_FILE_PATH = "/DeepSpeech/bin/bangor_welsh/alphabet.txt" + + + +def import_csv_textcorpus(csv_file_path, lm_data_root_dir): + + print ("Extracting texts from csv file: %s " % csv_file_path) + if not os.path.isfile(csv_file_path): + print ("Proceeding with missing file %s " % csv_file_path) + + Path(lm_data_root_dir).mkdir(parents=True, exist_ok=True) + corpus_file_path = os.path.join(lm_data_root_dir, "corpus.txt") + + df = pandas.read_csv(csv_file_path, encoding='utf-8', sep=',', header=0, dtype={'transcript':str}) + sentences = df['transcript'] + + with open(corpus_file_path, 'w', encoding='utf-8') as corpus_file: + for t in sentences: + corpus_file.write(t + "\n") + + return clean_text_corpus(lm_data_root_dir) + + +def clean_text_corpus(lm_data_root_dir): + + print ("Cleaning corpus files in %s " % lm_data_root_dir) + + source_text_file_path = os.path.join(lm_data_root_dir, "corpus.txt") + output_text_file_path = 
os.path.join(lm_data_root_dir, "corpus.clean.txt") + + ooa_text_file_path = source_text_file_path.replace(".txt", ".ooa.txt") + clean = clean_transcript(ALPHABET_FILE_PATH, ooa_text_file_path) + + with open(output_text_file_path, 'w', encoding='utf-8') as out_file: + with open(source_text_file_path, 'r', encoding='utf-8') as in_file: + for i, transcript in enumerate(tqdm(in_file)): + cleaned, transcript = clean.clean(transcript) + if cleaned: + out_file.write(transcript.lower() + "\n") + + return output_text_file_path + + +def get_macsen_textcorpus(url, lm_data_root_dir): + + target_dir = os.path.join(lm_data_root_dir, 'macsen') + Path(target_dir).mkdir(parents=True, exist_ok=True) + + json_data = json.loads(requests.get(url).text) + with open(os.path.join(target_dir, "corpus.txt"), 'w', encoding='utf-8') as macsen_file_out: + for s in json_data["result"]: + macsen_file_out.write(s[0] + "\n") + + return clean_text_corpus(target_dir) + + +def join_corpus_files(corpus_files, target_languagemodel_data_root_dir, joined_file_name): + + corpus_file_path = os.path.join(target_languagemodel_data_root_dir, joined_file_name) + + print ("Join corpus text files %s into %s" % (corpus_files, corpus_file_path) ) + + with open(corpus_file_path, 'w', encoding='utf-8') as corpus_outfile: + for fname in corpus_files: + with open(fname, 'r', encoding='utf-8') as corpus_infile: + for line in corpus_infile: + corpus_outfile.write(line) + + return clean_text_corpus(target_languagemodel_data_root_dir) \ No newline at end of file diff --git a/local/utils/evaluate_lm.py b/local/utils/evaluate_lm.py new file mode 100755 index 0000000..aebdb69 --- /dev/null +++ b/local/utils/evaluate_lm.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import sys +import kenlm + +from argparse import ArgumentParser, RawTextHelpFormatter + +DESCRIPTION = """ + +""" + +def main(text_file_path, language_model_file_path, **args): + model = 
kenlm.LanguageModel(language_model_file_path) + print('{0}-gram model'.format(model.order)) + + with open(text_file_path, 'r', encoding='utf-8') as in_text: + for text in in_text: + print ("\n\n{0} : {1}".format(text.rstrip(), model.score(text))) + words = [''] + text.split() + [''] + for i, (prob, length, oov) in enumerate(model.full_scores(text)): + print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2]))) + if oov: + print('\t"{0}" is an OOV'.format(words[i+1])) + for w in words: + if not w in model: + print('"{0}" is an OOV'.format(w)) + + +if __name__ == "__main__": + parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) + parser.add_argument("-t", dest="text_file_path", required=True) + parser.add_argument("-l", dest="language_model_file_path", required=True) + + parser.set_defaults(func=main) + args = parser.parse_args() + args.func(**vars(args)) diff --git a/local/utils/imports.py b/local/utils/imports.py new file mode 100755 index 0000000..a9cb22f --- /dev/null +++ b/local/utils/imports.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import csv +import hashlib +from typing import ContextManager +import srt +import pandas +import functools + +from pydub import AudioSegment + +from datetime import datetime, timedelta +from pathlib import Path + +from praatio import tgio + +from .clean_transcript import clean_transcript + + +ALPHABET_FILE_PATH = "/DeepSpeech/bin/bangor_welsh/alphabet.txt" + +def get_directory_structure(rootdir): + dir = {} + rootdir = rootdir.rstrip(os.sep) + start = rootdir.rfind(os.sep) + 1 + for path, dirs, files in os.walk(rootdir, followlinks=True): + folders = path[start:].split(os.sep) + subdir = dict.fromkeys(files) + parent = functools.reduce(dict.get, folders[:-1], dir) + parent[folders[-1]] = subdir + + return dir + + +def import_textgrid(target_csv_file, textfile): + + print ("Importing clips and transcripts from %s " % textfile) + 
target_data_root_dir = Path(target_csv_file).parent + + target_clips_dir = os.path.join(target_data_root_dir, "clips") + Path(target_clips_dir).mkdir(parents=True, exist_ok=True) + + df = pandas.DataFrame(columns=['wav_filename', 'wav_filesize', 'transcript']) + + textgrid_file_path = os.path.join(target_data_root_dir, textfile) + soundfile = textgrid_file_path.replace(".TextGrid",".wav") + + audio_file = AudioSegment.from_wav(os.path.join(target_data_root_dir, soundfile)) + + ooa_text_file_path = os.path.join(target_data_root_dir, 'deepspeech.ooa.txt') + clean = clean_transcript(ALPHABET_FILE_PATH, ooa_text_file_path) + + tg = tgio.openTextgrid(textgrid_file_path) + entryList = tg.tierDict["utterance"].entryList + i=0 + for interval in entryList: + text = interval.label + cleaned, transcript = clean.clean(text) + + if cleaned and len(transcript)>0: + transcript = transcript.lower() + + start = float(interval.start) * 1000 + end = float(interval.end) * 1000 + + #print (start, end, transcript) + + split_audio = audio_file[start:end] + hashId = hashlib.md5(transcript.encode('utf-8')).hexdigest() + wav_segment_filepath = os.path.join(target_clips_dir, hashId + ".wav") + split_audio.export(wav_segment_filepath, format="wav") + + df.loc[i] = [wav_segment_filepath, os.path.getsize(wav_segment_filepath), transcript] + i += 1 + + return df + + +def import_srt(target_csv_file, srtfile): + + print ("Importing transcripts from srt file in %s " % srtfile) + target_data_root_dir = Path(target_csv_file).parent + + target_clips_dir = os.path.join(target_data_root_dir, "clips") + Path(target_clips_dir).mkdir(parents=True, exist_ok=True) + + df = pandas.DataFrame(columns=['wav_filename', 'wav_filesize', 'transcript']) + + srt_file_path = os.path.join(target_data_root_dir, srtfile) + soundfile = srt_file_path.replace(".srt",".wav") + + audio_file = AudioSegment.from_wav(os.path.join(target_data_root_dir, soundfile)) + + ooa_text_file_path = os.path.join(target_data_root_dir, 
'deepspeech.ooa.txt') + clean = clean_transcript(ALPHABET_FILE_PATH, ooa_text_file_path) + + subs = list(srt.parse(open(srt_file_path, 'r', encoding='utf-8').read())) + i = 0 + for s in subs: + text = s.content + cleaned, transcript = clean.clean(text) + + if cleaned and len(transcript)>0: + transcript = transcript.lower() + + start = float(s.start.total_seconds()) * 1000 + end = float(s.end.total_seconds()) * 1000 + + #print (start, end, transcript) + + split_audio = audio_file[start:end] + hashId = hashlib.md5(transcript.encode('utf-8')).hexdigest() + wav_segment_filepath = os.path.join(target_clips_dir, hashId + ".wav") + split_audio.export(wav_segment_filepath, format="wav") + + df.loc[i] = [wav_segment_filepath, os.path.getsize(wav_segment_filepath), transcript] + i += 1 + + return df + + + +def import_clips_dir(target_testset_dir, **args): + + print ("Importing clips dir in %s " % target_testset_dir) + + arddweud_root_dir = get_directory_structure(os.path.join(target_testset_dir, "clips")) + + csv_file_path = os.path.join(target_testset_dir, 'deepspeech.csv') + print (csv_file_path) + + moz_fieldnames = ['wav_filename', 'wav_filesize', 'transcript'] + csv_file_out = csv.DictWriter(open(csv_file_path, 'w', encoding='utf-8'), fieldnames=moz_fieldnames) + csv_file_out.writeheader() + + ooa_text_file_path = os.path.join(target_testset_dir, 'deepspeech.ooa.txt') + clean = clean_transcript(ALPHABET_FILE_PATH, ooa_text_file_path) + + for filename in arddweud_root_dir["clips"]: + if filename.endswith(".wav"): + wavfilepath = os.path.join(target_testset_dir, "clips", filename) + txtfilepath = wavfilepath.replace(".wav", ".txt") + with open(txtfilepath, "r", encoding='utf-8') as txtfile: + transcript = txtfile.read() + cleaned, transcript = clean.clean(transcript) + if cleaned: + transcript = transcript.lower() + if audio.downsample_wavfile(wavfilepath): + # print (wavfilepath) + csv_file_out.writerow({ + 'wav_filename':wavfilepath, + 
'wav_filesize':os.path.getsize(wavfilepath), + 'transcript':transcript + }) + + #return pandas.read_csv(csv_file_path, delimiter=',', encoding='utf-8') + return csv_file_path diff --git a/local/utils/kfold.py b/local/utils/kfold.py index ab1ae14..dae5040 100755 --- a/local/utils/kfold.py +++ b/local/utils/kfold.py @@ -6,6 +6,8 @@ import sys import pandas as pd +from pathlib import Path + from sklearn.model_selection import KFold from sklearn import datasets, linear_model from sklearn.model_selection import train_test_split @@ -16,6 +18,8 @@ def create_kfolds(csvfile, dest_dir, k): print ("Splitting %s into kfolds" % csvfile) + Path(dest_dir).mkdir(parents=True, exist_ok=True) + kf = KFold(n_splits=k, shuffle=True, random_state=2) try: