From 4183aac4700ce849c20127f34c8ee73d3c5f30a7 Mon Sep 17 00:00:00 2001
From: Josh Meyer
Date: Sun, 3 Oct 2021 15:17:09 -0400
Subject: [PATCH] Add v1.0.0 digits

---
 .../{yesno/v0.0.1 => v1.0.0-digits}/LICENSE |  0
 .../v0.0.1 => v1.0.0-digits}/MODEL_CARD.md  | 38 ++++++++----------
 english/coqui/v1.0.0-digits/digits.scorer   | Bin 0 -> 1648 bytes
 english/coqui/yesno/v0.0.1/alphabet.txt     |  6 ---
 4 files changed, 16 insertions(+), 28 deletions(-)
 rename english/coqui/{yesno/v0.0.1 => v1.0.0-digits}/LICENSE (100%)
 rename english/coqui/{yesno/v0.0.1 => v1.0.0-digits}/MODEL_CARD.md (57%)
 create mode 100644 english/coqui/v1.0.0-digits/digits.scorer
 delete mode 100644 english/coqui/yesno/v0.0.1/alphabet.txt

diff --git a/english/coqui/yesno/v0.0.1/LICENSE b/english/coqui/v1.0.0-digits/LICENSE
similarity index 100%
rename from english/coqui/yesno/v0.0.1/LICENSE
rename to english/coqui/v1.0.0-digits/LICENSE
diff --git a/english/coqui/yesno/v0.0.1/MODEL_CARD.md b/english/coqui/v1.0.0-digits/MODEL_CARD.md
similarity index 57%
rename from english/coqui/yesno/v0.0.1/MODEL_CARD.md
rename to english/coqui/v1.0.0-digits/MODEL_CARD.md
index 1ba3f0a..65ff661 100644
--- a/english/coqui/yesno/v0.0.1/MODEL_CARD.md
+++ b/english/coqui/v1.0.0-digits/MODEL_CARD.md
@@ -1,4 +1,4 @@
-# Model card for English yesno STT
+# Model card for English STT v1.0.0
 
 Jump to section:
 
@@ -15,17 +15,17 @@
 
 - Person or organization developing model: Maintained by [Coqui](https://coqui.ai/).
 - Model language: English / English / `en`
-- Model date: July 26, 2021
-- Model type: `Speech-to-Text` / `constrained vocabulary` / `yesno`
-- Model version: `v0.0.1`
-- Compatible with 🐸 STT version: `v0.9.3`
+- Model date: October 3, 2021
+- Model type: `Small vocabulary Speech-to-Text`
+- Model version: `v1.0.0-digits`
+- Compatible with 🐸 STT version: `v1.0.0`
 - License: Apache 2.0
-- Citation details: `@techreport{english-yesno-stt, author = {Coqui}, title = {English yesno STT v0.0.1}, institution = {Coqui}, address = {\url{https://github.com/coqui-ai/STT-models}} year = {2021}, month = {July}, number = {STT-EN-YESNO-0.0.1} }`
-- Where to send questions or comments about the model: You can leave an issue on [`STT-model` issues](https://github.com/coqui-ai/STT-models/issues), open a new discussion on [`STT-model` discussions](https://github.com/coqui-ai/STT-models/discussions), or chat with us on [Gitter](https://gitter.im/coqui-ai/).
+- Citation details: `@techreport{english-stt, author = {Coqui}, title = {English STT v1.0.0}, institution = {Coqui}, address = {\url{https://coqui.ai/models}} year = {2021}, month = {October}, number = {STT-EN-1.0.0} }`
+- Where to send questions or comments about the model: You can leave an issue on [`STT` issues](https://github.com/coqui-ai/STT/issues), open a new discussion on [`STT` discussions](https://github.com/coqui-ai/STT/discussions), or chat with us on [Gitter](https://gitter.im/coqui-ai/).
 
 ## Intended use
 
-Speech-to-Text `yesno` model for the [English Language](https://en.wikipedia.org/wiki/English_language) on 16kHz, mono-channel audio. This model has been trained to only recognize the two words "yes" and "no" in English.
+Closed vocabulary (digits "zero" through "nine") Speech-to-Text for the [English Language](https://en.wikipedia.org/wiki/English_language) on 16kHz, mono-channel audio.
This acoustic model and language model pair will only be able to recognize the words {"zero", "one", "two", "three", "four", "five", "six", "seven", "eight", and "nine"}, which is a common use case in IVR systems.
 
 ## Performance Factors
 
@@ -33,20 +33,14 @@ Factors relevant to Speech-to-Text performance include but are not limited to sp
 
 ## Metrics
 
-STT models are usually evaluated in terms of their transcription accuracy, deployment Real-Time Factor, and model size on disk.
-
-#### Transcription Accuracy
-
-The model was trained and evaluted on the Common Voice Target Segments Corpus, specifically, only on "yes" and "no" audio clips.
-
-|Test Corpus|Word Error Rate|
-|-------|----------|
-|Common Voice 6.1 (Target Segments Corpus "yes" and "no") | 1.6\% |
-
 #### Model Size
 
-`yesno.pbmm`: 319K
-`yesno.scorer`: 1.7K
+For STT, you must always deploy an acoustic model, and it is often the case that you will also want to deploy an application-specific language model. The acoustic model comes in two forms: quantized and unquantized. There is a size<->accuracy trade-off for acoustic model quantization. For this combination of acoustic model and language model, we optimize for small size.
+
+|Model type|Vocabulary|Filename|Size|
+|----------------|-----|----------------|-----|
+|Acoustic model | open | `model_quantized.tflite` | 46M|
+|Language model | small| `digits.scorer` |1.7K|
 
 ### Approaches to uncertainty and variability
 
@@ -54,11 +48,11 @@ Confidence scores and multiple paths from the decoding beam can be used to measu
 
 ## Training data
 
-The model was trained and evaluted on the Common Voice Target Segments Corpus, specifically, only on "yes" and "no" audio clips.
+This model was trained on the following corpora: Common Voice 7.0 English (custom Coqui train/dev/test splits), LibriSpeech, and Multilingual LibriSpeech. In total, approximately 47,000 hours of data.
 
 ## Evaluation data
 
-The model was trained and evaluted on the Common Voice Target Segments Corpus, specifically, only on "yes" and "no" audio clips.
+The validation ("dev") sets came from CV, LibriSpeech, and MLS. Testing accuracy is reported for MLS and LibriSpeech.
 
 ## Ethical considerations
 
diff --git a/english/coqui/v1.0.0-digits/digits.scorer b/english/coqui/v1.0.0-digits/digits.scorer
new file mode 100644
index 0000000000000000000000000000000000000000..f9d7ac902175efb7e753834e1a10d22a625151de
GIT binary patch
literal 1648
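
For context, the model card above pairs the quantized acoustic model with the application-specific `digits.scorer` added by this commit. Below is a minimal usage sketch, not part of the patch, assuming the Coqui STT Python bindings (`pip install stt`) with their DeepSpeech-style `Model` API, the `model_quantized.tflite` file from the v1.0.0 release, and a hypothetical 16 kHz mono WAV file named `spoken_digit.wav`:

```python
# Hypothetical sketch: transcribe one spoken digit with the v1.0.0-digits pair.
# Assumes the Coqui STT Python package (`pip install stt`) and its
# Model / enableExternalScorer / stt API; file names are illustrative.
import wave

import numpy as np
from stt import Model

model = Model("model_quantized.tflite")      # quantized acoustic model (open vocabulary)
model.enableExternalScorer("digits.scorer")  # small digits-only language model

with wave.open("spoken_digit.wav", "rb") as wav:
    # The model card specifies 16 kHz, mono-channel, 16-bit audio.
    assert wav.getframerate() == 16000 and wav.getnchannels() == 1
    audio = np.frombuffer(wav.readframes(wav.getnframes()), dtype=np.int16)

# With the digits scorer enabled, the decoder should emit only the words
# "zero" through "nine".
print(model.stt(audio))
```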