sgriptiau mireinio / finetuning scripts

techiaith · Mar 11, 2021 · 99159a7 · 99159a7
1 parent 5ee2465
commit 99159a7
Show file tree

Hide file tree

Showing 30 changed files with 1,464 additions and 596 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,7 @@ models/*
 checkpoints/*
 tmp/*
 local/bin/commonvoice_url.py
+local/Makefile*
 keep
 local/__pycache__
 local/utils/__pycache__
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,5 @@
 ARG BRANCH
-FROM mozilla/deepspeech:$BRANCH
+FROM mozilla/deepspeech-train:$BRANCH
 
 RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash \
 	&& apt-get update && apt-get install -y git-lfs lame sox libsox-fmt-mp3 vim zip file locales-all \

diff --git a/Dockerfile.train.tmpl b/Dockerfile.train.tmpl
diff --git a/Makefile b/Makefile
@@ -1,10 +1,11 @@
 default: build
 
-DEEPSPEECH_RELEASE := 0.9.1
-TECHIAITH_RELEASE := 21.01
+DEEPSPEECH_RELEASE := 0.9.3
+TECHIAITH_RELEASE := 21.03
+
 
 run: 
-	docker run --gpus all --name techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER} -it \
+	docker run --gpus all --name techiaith-deepspeech-train-v${DEEPSPEECH_RELEASE}-${USER} -it \
 		-v ${PWD}/data/:/data \
 		-v ${PWD}/checkpoints/:/checkpoints \
 		-v ${PWD}/models/:/models \
@@ -13,15 +14,11 @@ run:
 		-v ${PWD}/local/:/DeepSpeech/bin/bangor_welsh \
 		--env DEEPSPEECH_RELEASE=${DEEPSPEECH_RELEASE} \
 		--env TECHIAITH_RELEASE=${TECHIAITH_RELEASE} \
-		techiaith/deepspeech:v${DEEPSPEECH_RELEASE} bash
+		techiaith/deepspeech-train:v${DEEPSPEECH_RELEASE} bash
 
 
 build:
 
-	if [ ! -d "DeepSpeech" ]; then \
-	    git clone https://github.com/mozilla/DeepSpeech.git; \
-	fi
-	cd DeepSpeech && make Dockerfile.train DEEPSPEECH_SHA=tags/v${DEEPSPEECH_RELEASE} && docker build --rm -t mozilla/deepspeech:v${DEEPSPEECH_RELEASE} -f Dockerfile.train .
 	if [ ! -d "checkpoints/mozilla" ]; then \
 	    mkdir -p checkpoints/mozilla; \
 	    cd checkpoints/mozilla && \
@@ -48,20 +45,17 @@ build:
 		wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_macsen_$(TECHIAITH_RELEASE).scorer && \
 		wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_transcribe_$(TECHIAITH_RELEASE).scorer;\
 	fi
-	docker build --build-arg BRANCH=v${DEEPSPEECH_RELEASE} --rm -t techiaith/deepspeech:v${DEEPSPEECH_RELEASE} .
+	docker build --build-arg BRANCH=v${DEEPSPEECH_RELEASE} --rm -t techiaith/deepspeech-train:v${DEEPSPEECH_RELEASE} .
 
 
 clean:
-	-docker rmi techiaith/deepspeech:v${DEEPSPEECH_RELEASE}
-	-docker rmi mozilla/deepspeech:v${DEEPSPEECH_RELEASE}
-	-docker rmi nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
-	-docker rmi tensorflow/tensorflow:1.15.2-gpu-py3
-	sudo rm -rf DeepSpeech
+	-docker rmi techiaith/deepspeech-train:v${DEEPSPEECH_RELEASE}
+	-docker rmi mozilla/deepspeech-train:v${DEEPSPEECH_RELEASE}	
 	sudo rm -rf homedir
 	sudo rm -rf checkpoints
 	sudo rm -rf models
 
 
 stop:
-	-docker stop techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER}
-	-docker rm techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER}
+	-docker stop techiaith-deepspeech-train-v${DEEPSPEECH_RELEASE}-${USER}
+	-docker rm techiaith-deepspeech-train-v${DEEPSPEECH_RELEASE}-${USER}
diff --git a/local/README.md b/local/README.md
@@ -6,6 +6,8 @@ Mae dogfennaeth gan Mozilla ar DeepSpeech ar gael fan hyn: https://deepspeech.re
 
 Mae'r sgriptiau canlynol yn cysylltu ag yn hwyluso'r holl gamau a ddilynir er mwyn hyfforddi, gynhyrchu a gwerthuso modelau adnabod lleferydd Cymraeg gyda DeepSpeech Mozilla. Defnyddir setiau Cymraeg o wefan CommonVoice Mozilla fel prif ffynhonnell data hyfforddi. Gydag adnoddau bellach gan Uned Technolegau Iaith, Prifysgol Bangor, mae'r modelau'n addas ar gyfer rhaglenni cynorthwyydd digidol (e.e. Macsen) a thrawsgrifiwr gyffredinol. 
 
+Mae modelau sydd wedi'u hyfforddi'n barod ar gael o'r dudalen cyhoeddi: https://github.com/techiaith/docker-deepspeech-cy/releases
+
 
 ## Rhagofynion
 
@@ -16,16 +18,16 @@ Llwythwch i lawr hefyd Corpws OSCAR o https://oscar-public.huma-num.fr/shuff-ori
 
 ## Paratoi Data
 
-### `import_audio_archive.py`
+### `import_cv_archive.py`
 
 ```shell
-root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_audio_archive.py --archive /data/commonvoice/cy.tar.gz --target_dir /data/commonvoice/
+root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_cv_archive.py --archive /data/commonvoice/cy.tar.gz --target_dir /data/commonvoice/
 ```
 
-### `analyze_audio.py`
+### `analyze_cv.py`
 
 ```shell
-root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --csv_dir /data/commonvoice/clips/
+root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_cv.py --cv_dir /data/commonvoice/
 /data/commonvoice-cy-v5-20200622/clips/dev.csv                0.91 hours      (3269.93 seconds)
 /data/commonvoice-cy-v5-20200622/clips/test.csv               0.98 hours      (3514.49 seconds)
 /data/commonvoice-cy-v5-20200622/clips/train.csv              1.09 hours      (3941.04 seconds)
@@ -36,14 +38,14 @@ root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --c
 
 ## Model Acwstig
 
-Defnyddiwch y sgript ganlynol i hyfforddi model acwstig. Dyle paramedr `-a` nodi ble mae'r ffeiliau CSV o ganlyniad i fewnforio CommonVoice. Yn yr enghraifft hon, maent wedi'u lleoli yn is-gyfeiriadur `/clips` y `target_dir` gwreiddiol.
+Defnyddiwch y sgript ganlynol i hyfforddi model acwstig gyda data gan gwefan CommonVoice.
 
 ### `run_tl_cv_cy.sh`
 
 Mae'r sgript hon yn defnyddio nodwedd dysgu trosglwyddol (*transfer learning*) DeepSpeech er mwyn cael fudd o ddefnyddio modelau acwstig Saesneg Mozilla, sydd wedi'u hyfforddi ar gasgliadau data llawer mwy o sain, fel man cychwyn ar gyfer hyfforddi adnabod lleferydd Cymraeg.
 
 ```shell
-root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/run_tl_cv_cy.sh -a /data/commonvoice/clips
+root@c67722092f2e:/DeepSpeech# ./bin/bangor_welsh/run_tl_cv_cy.sh --cv_dir /data/commonvoice
 ```
 
 
@@ -58,7 +60,7 @@ Mae angen rhagor o adnoddau gan Brifysgol Bangor er mwyn hyfforddi DeepSpeech ar
 Mae'r sgript isod yn llwytho i lawr rhagor o recordiadau a corpora testun sydd yn galluogi adnabod lleferydd Cymraeg o fewn cynorthwyydd digidol a trawsgrifiwr. Rhaid i chi llwytho i lawr ffeil archif corpws testun OSCAR o flaen llaw er mwyn ei ddefnyddio gyda'r orchymyn isod:
 
 ```shell
-root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/import_bangor_resources.py -o /data/oscar/cy.txt.gzip -c /data/commonvoice/validated.tsv
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/import_bangor_resources.py --target_dir /data/bangor --oscar_archive /data/oscar/cy.txt.gzip --cv_dir /data/commonvoice/
 ```
 
 Mae'r sgript mewnforio hefyd yn hidlo unrhyw testunau sy'n anaddas i'r proses hyfforddi modelau iaith adnabod lleferydd ac yn creu copi 'glan' (`.clean`) o'r corpws. 
@@ -70,12 +72,12 @@ Dyma'r brif sgript ar gyfer hyfforddi model iaith ac yna ei werthuso gyda model
 
 ##### Ar gyfer defnyddio adnabod lleferydd o fewn Macsen:
 ```shell
-root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh -s /data/bangor/lm-data/macsen/corpus.clean.txt -t /data/bangor/testsets/data/macsen/deepspeech.csv -o /data/bangor/lm/macsen
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh --text_file /data/bangor/lm-data/macsen/corpus.clean.txt --domain macsen
 ```
 
 ##### Ar gyfer defnyddio adnabod lleferydd i drawsgrifio:
 ```shell
-root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh -s /data/bangor/lm-data/oscar/corpus.clean.txt -t /data/bangor/testsets/data/trawsgrifio/deepspeech.csv -o /data/bangor/lm/trawsgrifio
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh --text_file /data/bangor/lm-data/oscar/corpus.clean.txt --domain macsen --output_dir /export/macsen --scorer kenlm.scorer
 ```
 
 
@@ -84,8 +86,18 @@ root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh -s /data/ba
 
 Bydd y sgript yma yn arbrofi gyda gwahanol baramedrau modelau iaith nes iddo ddod o hyd i'r gwerthoedd gorau posibl sy'n rhoi'r cyfraddau gwallau adnabod lleferydd isaf posibl.
 
-Gall y broses gymryd amser hir - oriau neu ddiwrnod neu ddau - gan y bydd yn arbrofi miloedd o weithiau. Yn y diwedd, bydd y sgript yn adrodd ar ddau werth gorau posibl ac yn gofyn ichi eu mewnbynnu i'w cynnwys ym mhecyn terfynol y model iaith. (gweler y ffeil `kenlm.scorer` yn y cyfeiriadur a bennir gan y ddadl sgript` -l`)
+Gall y broses gymryd amser hir - oriau neu ddiwrnod neu ddau - gan y bydd yn arbrofi miloedd o weithiau. Yn y diwedd, bydd y sgript yn adrodd ar ddau werth gorau posibl (gelwir yn 'alpha' a 'beta') ac yn gofyn ichi eu mewnbynnu er mwyn eu cynnwys ym mhecyn terfynol y model iaith.
+
+```shell
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/optimize_lm_scorer.sh --csv_test_file /data/bangor/testsets/data/macsen/deepspeech.csv --domain macsen [--checkpoint_dir /checkpoints/cy]
+```
+
+
+## Profi'r modelau
+
+Er mwyn gwybod pa mor dda neu ddim mae'r modelau, mae modd profi erbyn set profi sydd wedi ei fanylu o fewn ffeil CSV. Er enghraifft, er mwyn profi'r modelau trawsgrifio:
 
 ```shell
-root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/optimize_lm_scorer.sh -l /data/bangor/lm/mascen -t /data/bangor/testsets/data/macsen/deepspeech.csv
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/evaluate.sh --csv_test_file /data/bangor/testsets/data/trawsgrifio/arddweud_200617/deepspeech.csv --scorer /export/transcribe/kenlm.transcribe.scorer
 ```
+
diff --git a/local/README_EN.md b/local/README_EN.md
@@ -6,24 +6,27 @@ Documentation by Mozilla on DeepSpeech can be found here: https://deepspeech.rea
 
 The following scripts join up all the steps that are needed to train, generate and evaluate models for Welsh language speech recognition with Mozilla's DeepSpeech. The Welsh datasets from Mozilla's CommonVoice website are the primary resource for training. With some further resources from Bangor University's Language Technologies Unit, the models are viable for voice assistant (e.g. Macsen) and transcriber applications. 
 
+Pre-trained models are, however, available from the releases page: https://github.com/techiaith/docker-deepspeech-cy/releases
 
 ## Prerequisites
 
 Download the Welsh speech data from the Mozilla CommonVoice website: https://voice.mozilla.org/cy/datasets which is provided as a single large compressed file (`.tar.gz`). Save the file into the `data` folder. 
 
+Also download the OSCAR text corpus from https://oscar-public.huma-num.fr/shuff-orig/cy which contains Welsh language texts collected from the world wide web. You will need to register on the website before you are permitted to download it. Save the file in the `data/oscar` folder.
+
 
 ## Prepare Data
 
 ### `import_audio_archive.py`
 
 ```shell
-root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_audio_archive.py --archive /data/cy-v4.tar.gz --target_dir /data/commonvoice-cy-v5-20200622/
+root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_audio_archive.py --archive /data/commonvoice/cy.tar.gz --target_dir /data/commonvoice/
 ```
 
 ### `analyze_audio.py`
 
 ```shell
-root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --csv_dir /data/commonvoice-cy-v5-20200622/clips/
+root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --csv_dir /data/commonvoice/clips/
 /data/commonvoice-cy-v5-20200622/clips/dev.csv                0.91 hours      (3269.93 seconds)
 /data/commonvoice-cy-v5-20200622/clips/test.csv               0.98 hours      (3514.49 seconds)
 /data/commonvoice-cy-v5-20200622/clips/train.csv              1.09 hours      (3941.04 seconds)
@@ -35,15 +38,14 @@ root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --c
 
 ## Acoustic Model
 
-Use the following script to train an acoustic model. The `-a` argument needs to point to where to the CSV files from your CommonVoice import. In this example, they are located in the `/clips` subdirectory of the original `target_dir`.  
-
+Use the following script to train an acoustic model with data from the CommonVoice website.
 
 ### `run_tl_cv_cy.sh`
 
 This script uses DeepSpeech's transfer learning feature in order to benefit from Mozilla's English acoustic models, trained on much larger speech data collections, as a starting point for training Welsh speech recognition.
 
 ```shell
-root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/run_tl_cv_cy.sh -a /data/commonvoice-cy-v5-20200622/clips
+root@c67722092f2e:/DeepSpeech# ./bin/bangor_welsh/run_tl_cv_cy.sh --cv_dir /data/commonvoice
 ```
 
 
@@ -57,30 +59,46 @@ An acoustic model on its own, despite having used transfer learning techniques,
 
 You will need further resources from Bangor University in order to train  DeepSpeech with language models for various Welsh language applications. 
 
-The following script will download further recordings and/or text corpora that facilitate Welsh speech recognition for a simple voice assistant ('macsen') or a transcriber ('transcribe') (as requested in the `-d` argument).
+The following script will download further recordings and/or text corpora that facilitate Welsh speech recognition for a simple voice assistant or a transcriber. You should have downloaded the OSCAR corpus beforehand in order to use the following command:
 
 
 ```shell
-root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/import_bangor_resources.py -t /data/macsen -d macsen
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/import_bangor_resources.py --target_dir /data/bangor --oscar_archive /data/oscar/cy.txt.gzip --cv_dir /data/commonvoice/
 ```
 
+The script filters out texts that are unsuitable for the training process and creates a 'clean' (`.clean`) version of the corpus.
+
 
 ### `build_lm_scorer.sh`
 
 This is the main script for training a language model and evaluation with the acoustic model from the previous steps in training DeepSpeech.
 
-
+##### Training voice assistant Macsen's language model:
 ```shell
-root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/build_lm_scorer.sh -s /data/texts/macsen/corpus.clean.txt -o /data/texts/macsen/ -t /data/macsen/deepspeech.csv
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh --text_file /data/bangor/lm-data/macsen/corpus.clean.txt --domain macsen
 ```
 
+##### Training transcriber language model:
+```shell
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/build_lm_scorer.sh --text_file /data/bangor/lm-data/oscar/corpus.clean.txt --domain macsen --output_dir /export/macsen --scorer kenlm.scorer
+``` 
+
+
 
 ### `optimize_lm_scorer.sh`
 
 This script will experiment with various language model parameters until it finds optimal values that give the lowest possible recognition error rates. 
 
-The process can take a long time - hours or possibly day or two - since it will experiment many thousands of times. In the end, the script will report on two optimal values and ask you to enter them for final inclusion in the finally packaged language model. (`kenlm.scorer` in the directory specified by the `-o` script argument)
+The process can take a long time - hours, or possibly a day or two - since it will experiment many thousands of times. In the end, the script will report on two optimal values and ask you to enter them for inclusion in the final packaged language model.
 
 ```shell
-root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/optimize_lm_scorer.sh -l /data/texts/macsen -t /data/macsen/deepspeech.csv
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/optimize_lm_scorer.sh --csv_test_file /data/bangor/testsets/data/macsen/deepspeech.csv --domain macsen [--checkpoint_dir /checkpoints/cy]
 ```
+
+## Model Evaluation
+
+You can test how well your model will perform if you have a CSV that provides a test set. For example, to test the transcription models:
+
+```shell
+root@6a88b0d59848:/DeepSpeech# ./bin/bangor_welsh/evaluate.sh --csv_test_file /data/bangor/testsets/data/trawsgrifio/arddweud_200617/deepspeech.csv --scorer /export/transcribe/kenlm.transcribe.scorer
+```
diff --git a/local/analyze_audio.py b/local/analyze_audio.py