common-voice · kdavis-mozilla · Jan 28, 2019 · Jan 24, 2019 · Jan 27, 2019 · kdavis-mozilla
diff --git a/README.rst b/README.rst
@@ -19,12 +19,18 @@ Usage
 ===========
 
 
-Given the ``clips.tsv`` file dumped from the Common Voice database one creates a corpora in the directory ``corpora`` as follows
+Given the ``clips.tsv`` file dumped from the Common Voice database, you can create a corpus (for each language in the ``clips.tsv`` file) as follows:
 
 ``CorporaCreator$ create-corpora -d corpora -f clips.tsv``
 
 This will create the corpora in the directory ``corpora`` from the ``clips.tsv`` file.
 
+If you would like to create corpora for only some language(s), you can pass the ``--langs`` flag as follows:
+
+``CorporaCreator$ create-corpora -d corpora -f clips.tsv --langs en fr``
+
+This will create the corpora only for English and French.
+
 Each created corpus will contain the files ``valid.tsv``, containing the validated clips; ``invalid.tsv``, containing the invalidated clips; and ``other.tsv``, containing clips that don't have sufficient votes to be considered valid or invalid. In addition it will contain the files ``train.tsv``, the valid clips in the training set; ``dev.tsv``, the valid clips in the validation set; and ``test.tsv``, the valid clips in test set.
 
 The split of ``valid.tsv`` into ``train.tsv``, ``dev.tsv``, and ``test.tsv`` is done such that the number of clips in ``dev.tsv`` or ``test.tsv`` is a "statistically significant" sample relataive to the number of clips in ``train.tsv``. More specificially, if the population size is the number of clips in ``train.tsv``, then the number of clips in ``dev.tsv`` or ``test.tsv`` is the sample size required for a confidence level of 99% and a margin of error of 1% for the ``train.tsv`` population size.

diff --git a/src/corporacreator/argparse.py b/src/corporacreator/argparse.py
@@ -66,6 +66,13 @@ def parse_args(args):
         help="Path to the Common Voice tsv for all languages",
         dest="tsv_filename",
     )
+    parser.add_argument(
+        "-l",
+        "--langs",
+        required=False,
+        nargs='+',
+        help="Which language(s) you want to make corpora for",
+    )
     parser.add_argument(
         "-d",
         "--directory",

diff --git a/src/corporacreator/corpora.py b/src/corporacreator/corpora.py
@@ -7,6 +7,7 @@
 
 from corporacreator import Corpus
 from corporacreator.preprocessors import common
+import argparse
 
 _logger = logging.getLogger(__name__)
 
@@ -42,7 +43,17 @@ def create(self):
         corpora_data[["sentence", "up_votes", "down_votes"]] = corpora_data[
             ["sentence", "up_votes", "down_votes"]
         ].swifter.apply(func=lambda arg: common_wrapper(*arg), axis=1)
-        for locale in corpora_data.locale.unique():
+        if self.args.langs:
+            # check if all languages provided at command line are actually
+            # in the clips.tsv file, if not, throw error
+            if self.args.langs.issubset(corpora_data.locale.unique()):
+                locales = self.args.langs
+            else:
+                raise argparse.ArgumentTypeError("ERROR: You have requested languages which do not exist in clips.tsv")
+        else:
+            locales = corpora_data.locale.unique()
+
+        for locale in locales:
             _logger.info("Selecting %s corpus data..." % locale)
             corpus_data = corpora_data.loc[
                 lambda df: df.locale == locale,