deepspeech 0.7.3

techiaith · Jun 18, 2020 · dff4947 · dff4947
2 parents 5a1d79d + bd8e6b8
commit dff4947
Show file tree

Hide file tree

Showing 45 changed files with 1,165 additions and 1,556 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,8 @@
+*.pyc
 DeepSpeech
 CorporaCreator
 *.csv
+.vscode/*
 data/*
 export/*
 homedir/*
@@ -9,4 +11,5 @@ checkpoints/*
 tmp/*
 local/bin/commonvoice_url.py
 keep
-local/bin/__pycache__
+local/__pycache__
+local/utils/__pycache__
diff --git a/Dockerfile b/Dockerfile
@@ -2,19 +2,21 @@ ARG BRANCH
 FROM mozilla/deepspeech:$BRANCH
 
 RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash \
-	&& apt-get update && apt-get install -y git-lfs lame sox vim zip file \
-						unzip python3 python3-pip python3-dev  \
-						libffi-dev libssl-dev libxml2-dev \
+	&& apt-get update && apt-get install -y git-lfs lame sox vim zip file locales-all \
+						unzip valgrind libffi-dev libssl-dev libxml2-dev \
 						libxslt1-dev libjpeg8-dev zlib1g-dev dos2unix\
 	&& apt-get clean \
 	&& git lfs install \
-        && pip3 install sox wget sklearn pandas python_speech_features virtualenv requests jiwer tqdm \
+    && pip install sox wget sklearn pandas python_speech_features virtualenv requests tqdm columnize \
 	&& rm -rf /var/lib/apt/lists/* 
 
+ENV LC_ALL cy_GB.UTF-8
+ENV LANG cy_GB.UTF-8
+ENV LANGUAGE cy_GB.UTF-8
+
 WORKDIR /DeepSpeech
 
-#RUN python3 util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --target native_client \
-#	&& chmod +x native_client/convert_graphdef_memmapped_format
+RUN python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .
 
 ENV PATH /DeepSpeech/native_client:/DeepSpeech/native_client/kenlm/build/bin:$PATH
 
diff --git a/Makefile b/Makefile
@@ -1,40 +1,49 @@
 default: build
 DEEPSPEECH_RELEASE := 0.7.3
 DEEPSPEECH_BRANCH := v$(DEEPSPEECH_RELEASE)
-#DEEPSPEECH_RELEASE := 0.5.1
-#DEEPSPEECH_BRANCH := transfer-learning2
+
 
 run: 
 	docker run --gpus all --name techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER} -it \
 		-v ${PWD}/data/:/data \
-                -v ${PWD}/checkpoints/:/checkpoints \
+		-v ${PWD}/checkpoints/:/checkpoints \
+		-v ${PWD}/models/:/models \
 		-v ${PWD}/export/:/export \
 		-v ${PWD}/homedir/:/root \
-		-v ${PWD}/local/bin:/DeepSpeech/bin/bangor_welsh \
+		-v ${PWD}/local/:/DeepSpeech/bin/bangor_welsh \
 		techiaith/deepspeech:${DEEPSPEECH_BRANCH} bash
-
+
+
 build:
 	if [ ! -d "DeepSpeech" ]; then \
 	    git clone --branch $(DEEPSPEECH_BRANCH) https://github.com/mozilla/DeepSpeech.git; \
 	    cd DeepSpeech && docker build --rm -t mozilla/deepspeech:${DEEPSPEECH_BRANCH} .; \
-	fi
+	fi	
 	if [ ! -d "checkpoints/mozilla" ]; then \
 	    mkdir -p checkpoints/mozilla; \
 	    cd checkpoints/mozilla && \
 		wget https://github.com/mozilla/DeepSpeech/releases/download/v$(DEEPSPEECH_RELEASE)/deepspeech-$(DEEPSPEECH_RELEASE)-checkpoint.tar.gz && \
-		tar xvfz deepspeech-$(DEEPSPEECH_RELEASE)-checkpoint.tar.gz;\
+		tar xvfz deepspeech-$(DEEPSPEECH_RELEASE)-checkpoint.tar.gz && \
+		mv deepspeech-$(DEEPSPEECH_RELEASE)-checkpoint deepspeech-en-checkpoint;\
 	fi
+	if [ ! -d "models/mozilla" ]; then \
+	    mkdir -p models/mozilla; \
+	    cd models/mozilla && \
+		wget https://github.com/mozilla/DeepSpeech/releases/download/v$(DEEPSPEECH_RELEASE)/deepspeech-$(DEEPSPEECH_RELEASE)-models.pbmm && \
+		wget https://github.com/mozilla/DeepSpeech/releases/download/v$(DEEPSPEECH_RELEASE)/deepspeech-$(DEEPSPEECH_RELEASE)-models.scorer;\
+	fi		
 	docker build --build-arg BRANCH=${DEEPSPEECH_BRANCH} --rm -t techiaith/deepspeech:${DEEPSPEECH_BRANCH} .
 
+
 clean:
 	-docker rmi techiaith/deepspeech:${DEEPSPEECH_BRANCH}
 	-docker rmi mozilla/deepspeech:${DEEPSPEECH_BRANCH}
 	-docker rmi nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
 	sudo rm -rf DeepSpeech
 	sudo rm -rf homedir
 	sudo rm -rf checkpoints
-
+
+
 stop:
 	-docker stop techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER}
 	-docker rm techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER}
-
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ Gweler/*See also* : https://github.com/NVIDIA/nvidia-docker#quickstart
 
 <br/>
 
-## Cychwyn arni / *Quickstart*
+## Gosod / *Installation*
 
 ```
 $ git clone https://github.com/techiaith/docker-deepspeech-cy
@@ -27,38 +27,13 @@ $ make run
 
 ### Data Cymraeg Mozilla CommonVoice / *Mozilla Common Welsh Data*
 
-Llwythwch y data diweddaraf i lawr o https://voice.mozilla.org/cy/datasets ac yna echdynnwch popeth i ffolder newydd o dan `data`. Er enghraifft.....
+Llwythwch y data diweddaraf i lawr o https://voice.mozilla.org/cy/datasets i'r ffolder `docker-deepspeech-cy/data`.
 
-*Download the latest data from https://voice.mozilla.org/cy/datasets and extract all into a new folder underneath `data`. For example.....*
+*Download the latest data from https://voice.mozilla.org/cy/datasets to the `docker-deepspeech-cy/data` folder.*
 
 
-```bash
-techiaith@gweinydd:/home/techiaith/docker/docker-deepspeech-cy/data/commonvoice-cy-v4-20191210⟫ ls -l
-total 2124544
-drwxr-xr-x 2 techiaith techiaith    6459392 Feb  3 18:35 clips
--rw-r--r-- 1 techiaith techiaith     148342 Dec 10 13:42 dev.tsv
--rw-r--r-- 1 techiaith techiaith     580477 Dec 10 13:42 invalidated.tsv
--rw-r--r-- 1 techiaith techiaith    2568371 Dec 10 13:42 other.tsv
--rw-r--r-- 1 techiaith techiaith     147797 Dec 10 13:42 test.tsv
--rw-r--r-- 1 techiaith techiaith     164667 Dec 10 13:42 train.tsv
--rw-r--r-- 1 techiaith techiaith   10434562 Dec 10 13:42 validated.tsv
-```
-
 ## Hyfforddi / *Training*
 
-Y prif sgriptiau a ddefnyddir ar gyfer hyfforddi yw: 
-
-*The sgripts primarity for training are:*
-
-```
-root@3deb765f2438:/DeepSpeech# ./bin/bangor_welsh/run-tl-cv-macsen.sh
-root@3deb765f2438:/DeepSpeech# ./bin/bangor_welsh/run-tl-cv-arddweud.sh
-```
-
-Gweler y nodyn rhyddhau am wybodaeth am unrhyw ddata pellach y gallai fod eu hangen arnoch
-
-*Please see the release note for information on any further data you might require*
-
-
-
+Gweler [README.md](local/README.md)
 
+*See [README_EN.md](local/README_EN.md)*
diff --git a/local/README.md b/local/README.md
@@ -0,0 +1,71 @@
+# Sgriptiau Hyfforddi DeepSpeech Mozilla
+
+*Click [here](README_EN.md) to read this page in English*
+
+Mae dogfennaeth gan Mozilla ar DeepSpeech ar gael fan hyn: https://deepspeech.readthedocs.io . 
+
+Mae'r sgriptiau canlynol yn enghreifftio ac yn hwyluso defnyddio'r camau cyffredinol a ddisgrifir yn nogfennaeth DeepSpeech Mozilla er mwyn creu modelau adnabod lleferydd Cymraeg ar gyfer rhaglenni cynorthwyydd digidol (e.e. Macsen) a trawsgrifiwr.  
+
+
+## Rhagofynion
+
+Llwythwch i lawr data lleferydd Cymraeg o wefan CommonVoice: https://voice.mozilla.org/cy/datasets sy'n cael ei ddarparu fel un ffeil mawr wedi'i gwasgu (e.e. `cy.tar.gz`) . Cadwch y ffeil o fewn y ffolder `data`. 
+
+
+## Paratoi Data
+
+### `import_audio_archive.py`
+
+```shell
+root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_audio_archive.py --archive /data/cy-v4.tar.gz --target_dir /data/commonvoice-cy-v4-20191210/
+```
+
+### `analyze_audio.py`
+
+```shell
+root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --csv_dir /data/commonvoice-cy-v4-20191210/clips/
+/data/commonvoice-cy-v4-20191210/clips/dev.csv                0.91 hours      (3269.93 seconds)
+/data/commonvoice-cy-v4-20191210/clips/test.csv               0.98 hours      (3514.49 seconds)
+/data/commonvoice-cy-v4-20191210/clips/train.csv              1.09 hours      (3941.04 seconds)
+/data/commonvoice-cy-v4-20191210/clips/train-all.csv          7.48 hours      (26928.55 seconds)
+/data/commonvoice-cy-v4-20191210/clips/other.csv              14.75 hours     (53092.44 seconds)
+/data/commonvoice-cy-v4-20191210/clips/validated.csv          58.16 hours     (209380.97 seconds)
+```
+
+## Model Acwstig
+
+
+### `run_tl_cv_cy.sh`
+
+```shell
+root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/run_tl_cv_cy.sh -c /data/commonvoice-cy-v4-20191210/clips
+```
+
+
+## Modelau Iaith / Parth Penodol
+
+### `import_bangor_resources.py`
+
+Mae angen rhagor o adnoddau gan Brifysgol Bangor er mwyn hyfforddi DeepSpeech ar gyfer adnabod lleferydd Cymraeg mewn gwahanol gyd-destunau defnyddiol. Mae'r sgript isod yn llwytho i lawr rhagor o recordiadau ac/neu chorpora testun sydd yn galluogi adnabod lleferydd Cymraeg o fewn cynorthwyydd digidol ('macsen') neu drawsgrifiwr ('transcribe').
+
+```shell
+root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/import_bangor_resources.py -t /data/macsen -d macsen
+```
+
+### `clean_lm_corpus.py`
+
+```shell
+root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/clean_lm_corpus.sh -s /data/macsen/corpus.txt -o /data/macsen/corpus.clean.txt 
+```
+
+### `build_lm_scorer.sh`
+
+```shell
+root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/build_lm_scorer.sh -s /data/macsen/corpus.clean.txt -o /data/macsen/ -t /data/macsen/deepspeech.csv
+```
+
+### `evaluate_lm_scorer.sh`
+
+```shell
+root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/evaluate_lm_scorer.sh -l /data/macsen -t /data/macsen/deepspeech.csv
+```
diff --git a/local/README_EN.md b/local/README_EN.md
@@ -0,0 +1,70 @@
+# Scripts for Training Mozilla DeepSpeech
+
+*Cliciwch [yma](README.md) i ddarllen y dudalen hon yn Gymraeg*
+
+Documentation by Mozilla on DeepSpeech can be found here: https://deepspeech.readthedocs.io 
+
+The following scripts demonstrate how the general steps described in Mozilla's documentation can be used to create Welsh language speech recognition models for both a voice assistant (e.g. Macsen) and a transcription application.
+
+
+## Prerequisites
+
+Download the Welsh speech data from the Mozilla CommonVoice website: https://voice.mozilla.org/cy/datasets which is provided as a single large compressed file (`.tar.gz`). Save the file into the `data` folder.
+
+
+## Prepare Data
+
+### `import_audio_archive.py`
+
+```shell
+root@c67722092f2e:/DeepSpeech# bin/bangor_welsh/import_audio_archive.py --archive /data/cy-v4.tar.gz --target_dir /data/commonvoice-cy-v4-20191210/
+```
+
+### `analyze_audio.py`
+
+```shell
+root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/analyze_audio.py --csv_dir /data/commonvoice-cy-v4-20191210/clips/
+/data/commonvoice-cy-v4-20191210/clips/dev.csv                0.91 hours      (3269.93 seconds)
+/data/commonvoice-cy-v4-20191210/clips/test.csv               0.98 hours      (3514.49 seconds)
+/data/commonvoice-cy-v4-20191210/clips/train.csv              1.09 hours      (3941.04 seconds)
+/data/commonvoice-cy-v4-20191210/clips/train-all.csv          7.48 hours      (26928.55 seconds)
+/data/commonvoice-cy-v4-20191210/clips/other.csv              14.75 hours     (53092.44 seconds)
+/data/commonvoice-cy-v4-20191210/clips/validated.csv          58.16 hours     (209380.97 seconds)
+```
+
+
+## Acoustic Model
+
+### `run_tl_cv_cy.sh`
+
+```shell
+root@c67722092f2e:/DeepSpeech# /DeepSpeech/bin/bangor_welsh/run_tl_cv_cy.sh -c /data/commonvoice-cy-v4-20191210/clips
+```
+
+## Language Models / Domain Specific
+
+### `import_bangor_resources.py`
+
+You will need further resources from Bangor University in order to train DeepSpeech for various Welsh language applications. The script below downloads further recordings and/or text corpora that facilitate Welsh speech recognition for a simple voice assistant ('macsen') or a transcriber ('transcribe').
+
+```shell
+root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/import_bangor_resources.py -t /data/macsen -d macsen
+```
+
+### `clean_lm_corpus.py`
+
+```shell
+root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/clean_lm_corpus.sh -s /data/texts/macsen/corpus.txt -o /data/texts/macsen/corpus.clean.txt 
+```
+
+### `build_lm_scorer.sh`
+
+```shell
+root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/build_lm_scorer.sh -s /data/texts/macsen/corpus.clean.txt -o /data/texts/macsen/ -t /data/macsen/deepspeech.csv
+```
+
+### `evaluate_lm_scorer.sh`
+
+```shell
+root@6a88b0d59848:/DeepSpeech# bin/bangor_welsh/evaluate_lm_scorer.sh -l /data/texts/macsen -t /data/macsen/deepspeech.csv
+```
diff --git a/local/__init__.py b/local/__init__.py
diff --git a/local/bin/alphabet.txt → local/alphabet.txt b/local/bin/alphabet.txt → local/alphabet.txt
diff --git a/local/analyze_audio.py b/local/analyze_audio.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
+import sys
+import pathlib
+import librosa
+import pandas
+
+from argparse import ArgumentParser, RawTextHelpFormatter
+
+DESCRIPTION = """
+
+"""
+
+def main(csv_root_dir, **args):
+    csv_files = pathlib.Path(csv_root_dir).glob("*.csv")
+
+    for csv_file_path in csv_files:        
+        df = pandas.read_csv(csv_file_path, encoding='utf-8')        
+        total_duration = 0.0
+        for index, row in df.iterrows(): 
+            wav_file_path = os.path.join(csv_root_dir, row["wav_filename"])
+            total_duration = total_duration + librosa.get_duration(filename=wav_file_path)
+
+        print ("%s\t\t%.2f hours\t(%.2f seconds)" % (csv_file_path, total_duration/60.0/60.0, total_duration))
+
+
+if __name__ == "__main__": 
+
+    parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 
+
+    parser.add_argument("--csv_dir", dest="csv_root_dir", required=True, help="path to audio corpus CSV files")
+
+    parser.set_defaults(func=main)
+    args = parser.parse_args()
+    args.func(**vars(args))