From 4eae14e00d4ec457b696a9f40445e1d0e1369ed4 Mon Sep 17 00:00:00 2001
From: Aleksandr Seliverstov
Date: Thu, 15 Mar 2018 16:52:58 +0300
Subject: [PATCH 1/2] feat: add support for csv/json classification datasets

---
 deeppavlov/__init__.py                             |  2 +-
 .../configs/intents/intents_sample_csv.json        | 98 +++++++++++++++++++
 .../configs/intents/intents_sample_json.json       | 96 ++++++++++++++++++
 deeppavlov/core/commands/train.py                  | 17 +++-
 ...ader.py => basic_classification_reader.py}      | 36 +++++--
 5 files changed, 237 insertions(+), 12 deletions(-)
 create mode 100644 deeppavlov/configs/intents/intents_sample_csv.json
 create mode 100644 deeppavlov/configs/intents/intents_sample_json.json
 rename deeppavlov/dataset_readers/{csv_classification_reader.py => basic_classification_reader.py} (61%)

diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py
index c33e135cf8..d1e20e8556 100644
--- a/deeppavlov/__init__.py
+++ b/deeppavlov/__init__.py
@@ -10,7 +10,7 @@
 import deeppavlov.dataset_readers.dstc2_reader
 import deeppavlov.dataset_readers.conll2003_reader
 import deeppavlov.dataset_readers.typos_reader
-import deeppavlov.dataset_readers.csv_classification_reader
+import deeppavlov.dataset_readers.basic_classification_reader
 import deeppavlov.dataset_iterators.dialog_iterator
 import deeppavlov.dataset_iterators.dstc2_ner_iterator
 import deeppavlov.dataset_iterators.dstc2_intents_iterator
diff --git a/deeppavlov/configs/intents/intents_sample_csv.json b/deeppavlov/configs/intents/intents_sample_csv.json
new file mode 100644
index 0000000000..af0e35b856
--- /dev/null
+++ b/deeppavlov/configs/intents/intents_sample_csv.json
@@ -0,0 +1,98 @@
+{
+  "dataset": {
+    "type": "classification",
+    "format": "csv",
+    "sep": ",",
+    "header": 0,
+    "names": ["text", "classes"],
+    "class_sep": ",",
+    "train": "sample.csv",
+    "data_path": "sample",
+    "x": "text",
+    "y": "classes",
+    "url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.csv",
+    "seed": 42,
+    "field_to_split": "train",
+    "split_fields": [
+      "train",
+      "valid"
+    ],
+    "split_proportions": [
+      0.9,
+      0.1
+    ]
+  },
+  "chainer": {
+    "in": ["x"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "id": "classes_vocab",
+        "name": "default_vocab",
+        "fit_on": ["y"],
+        "level": "token",
+        "save_path": "vocabs/snips_classes.dict",
+        "load_path": "vocabs/snips_classes.dict"
+      },
+      {
+        "in": ["x"],
+        "in_y": ["y"],
+        "out": ["y_predicted"],
+        "main": true,
+        "name": "intent_model",
+        "save_path": "intents/intent_cnn_snips_v2",
+        "load_path": "intents/intent_cnn_snips_v2",
+        "classes": "#classes_vocab.keys()",
+        "opt": {
+          "kernel_sizes_cnn": [
+            1,
+            2,
+            3
+          ],
+          "filters_cnn": 256,
+          "lear_metrics": [
+            "binary_accuracy",
+            "fmeasure"
+          ],
+          "confident_threshold": 0.5,
+          "optimizer": "Adam",
+          "lear_rate": 0.01,
+          "lear_rate_decay": 0.1,
+          "loss": "binary_crossentropy",
+          "text_size": 15,
+          "coef_reg_cnn": 1e-4,
+          "coef_reg_den": 1e-4,
+          "dropout_rate": 0.5,
+          "epochs": 1000,
+          "dense_size": 100,
+          "model_name": "cnn_model"
+        },
+        "embedder": {
+          "name": "fasttext",
+          "save_path": "embeddings/dstc2_fastText_model.bin",
+          "load_path": "embeddings/dstc2_fastText_model.bin",
+          "emb_module": "fasttext",
+          "dim": 100
+        },
+        "tokenizer": {
+          "name": "nltk_tokenizer",
+          "tokenizer": "wordpunct_tokenize"
+        }
+      }
+    ],
+    "out": ["y_predicted"]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "sets_accuracy"
+    ],
+    "validation_patience": 5,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "validate_best": true,
+    "test_best": false
+  }
+}
\ No newline at end of file
diff --git a/deeppavlov/configs/intents/intents_sample_json.json b/deeppavlov/configs/intents/intents_sample_json.json
new file mode 100644
index 0000000000..e471b9fd1f
--- /dev/null
+++ b/deeppavlov/configs/intents/intents_sample_json.json
@@ -0,0 +1,96 @@
+{
+  "dataset": {
+    "type": "classification",
+    "format": "json",
+    "orient": "records",
+    "lines": true,
+    "data_path": "sample",
+    "train": "sample.json",
+    "x": "text",
+    "y": "intents",
+    "url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.json",
+    "seed": 42,
+    "field_to_split": "train",
+    "split_fields": [
+      "train",
+      "valid"
+    ],
+    "split_proportions": [
+      0.9,
+      0.1
+    ]
+  },
+  "chainer": {
+    "in": ["x"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "id": "classes_vocab",
+        "name": "default_vocab",
+        "fit_on": ["y"],
+        "level": "token",
+        "save_path": "vocabs/snips_classes.dict",
+        "load_path": "vocabs/snips_classes.dict"
+      },
+      {
+        "in": ["x"],
+        "in_y": ["y"],
+        "out": ["y_predicted"],
+        "main": true,
+        "name": "intent_model",
+        "save_path": "intents/intent_cnn_snips_v2",
+        "load_path": "intents/intent_cnn_snips_v2",
+        "classes": "#classes_vocab.keys()",
+        "opt": {
+          "kernel_sizes_cnn": [
+            1,
+            2,
+            3
+          ],
+          "filters_cnn": 256,
+          "lear_metrics": [
+            "binary_accuracy",
+            "fmeasure"
+          ],
+          "confident_threshold": 0.5,
+          "optimizer": "Adam",
+          "lear_rate": 0.01,
+          "lear_rate_decay": 0.1,
+          "loss": "binary_crossentropy",
+          "text_size": 15,
+          "coef_reg_cnn": 1e-4,
+          "coef_reg_den": 1e-4,
+          "dropout_rate": 0.5,
+          "epochs": 1000,
+          "dense_size": 100,
+          "model_name": "cnn_model"
+        },
+        "embedder": {
+          "name": "fasttext",
+          "save_path": "embeddings/dstc2_fastText_model.bin",
+          "load_path": "embeddings/dstc2_fastText_model.bin",
+          "emb_module": "fasttext",
+          "dim": 100
+        },
+        "tokenizer": {
+          "name": "nltk_tokenizer",
+          "tokenizer": "wordpunct_tokenize"
+        }
+      }
+    ],
+    "out": ["y_predicted"]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "sets_accuracy"
+    ],
+    "validation_patience": 5,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "validate_best": true,
+    "test_best": false
+  }
+}
\ No newline at end of file
diff --git a/deeppavlov/core/commands/train.py b/deeppavlov/core/commands/train.py
index f89b1025cd..09702c87ae 100644
--- a/deeppavlov/core/commands/train.py
+++ b/deeppavlov/core/commands/train.py
@@ -73,6 +73,19 @@ def train_model_from_config(config_path: str):
     config = read_json(config_path)
     set_deeppavlov_root(config)
 
+    dataset_config = config.get('dataset', None)
+
+    if dataset_config is not None:
+        del config['dataset']
+        ds_type = dataset_config['type']
+        if ds_type == 'classification':
+            reader = {'name': 'basic_classification_reader'}
+            iterator = {'name': 'basic_classification_iterator'}
+            config['dataset_reader'] = {**dataset_config, **reader}
+            config['dataset_iterator'] = {**dataset_config, **iterator}
+        else:
+            raise Exception("Unsupported dataset type: {}".format(ds_type))
+
     reader_config = config['dataset_reader']
     reader = get_model(reader_config['name'])()
     data_path = expand_path(reader_config.get('data_path', ''))
@@ -81,8 +94,8 @@ def train_model_from_config(config_path: str):
     if "data_path" in kwargs:
         del kwargs["data_path"]
     data = reader.read(data_path, **kwargs)
-    dataset_config = config['dataset_iterator']
-    dataset: BasicDatasetIterator = from_params(dataset_config, data=data)
+    iterator_config = config['dataset_iterator']
+    dataset: BasicDatasetIterator = from_params(iterator_config, data=data)
 
     if 'chainer' in config:
         model = fit_chainer(config, dataset)
diff --git a/deeppavlov/dataset_readers/csv_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py
similarity index 61%
rename from deeppavlov/dataset_readers/csv_classification_reader.py
rename to deeppavlov/dataset_readers/basic_classification_reader.py
index 2d1f64bfab..7fa7dc3251 100644
--- a/deeppavlov/dataset_readers/csv_classification_reader.py
+++ b/deeppavlov/dataset_readers/basic_classification_reader.py
@@ -28,8 +28,8 @@
 log = get_logger(__name__)
 
 
-@register('csv_classification_reader')
-class CsvClassificationDatasetReader(DatasetReader):
+@register('basic_classification_reader')
+class BasicClassificationDatasetReader(DatasetReader):
     """
     Class provides reading dataset in .csv format
     """
@@ -52,20 +52,38 @@ def read(self, data_path, url=None, *args, **kwargs):
         """
         data_types = ["train", "valid", "test"]
 
-        if not Path(data_path, "train.csv").exists():
+        train_file = format(kwargs.get('train', 'train.csv'))
+
+        if not Path(data_path, train_file).exists():
             if url is None:
                 raise Exception(
                     "data path {} is not exists or empty and download url parameter not specified!".format(data_path))
             log.info("Loading train data from {} to {}".format(url, data_path))
-            download(source_url=url, dest_file_path=Path(data_path, "train.csv"))
+            download(source_url=url, dest_file_path=Path(data_path, train_file))
 
         data = {"train": [], "valid": [], "test": []}
         for data_type in data_types:
-            try:
-                df = pd.read_csv(Path(data_path).joinpath(data_type + ".csv"))
-                data[data_type] = [(row['text'], row['intents'].split(',')) for _, row in df.iterrows()]
-            except FileNotFoundError:
-                log.warning("Cannot find {}.csv data file".format(data_type))
+            file_format = kwargs.get('format', 'csv')
+            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, file_format))
+            file = Path(data_path).joinpath(file_name)
+            if file.exists():
+                if file_format == 'csv':
+                    keys = ('sep', 'header', 'names')
+                    options = {k: kwargs[k] for k in keys if k in kwargs}
+                    df = pd.read_csv(file, **options)
+                elif file_format == 'json':
+                    keys = ('orient', 'lines')
+                    options = {k: kwargs[k] for k in keys if k in kwargs}
+                    df = pd.read_json(file, **options)
+                else:
+                    raise Exception('Unsupported file format: {}'.format(file_format))
+
+                x = kwargs.get("x", "text")
+                y = kwargs.get('y', 'label')
+                class_sep = kwargs.get('class_sep', ',')
+                data[data_type] = [(row[x], row[y].split(class_sep)) for _, row in df.iterrows()]
+            else:
+                log.warning("Cannot find {} file".format(file))
 
         return data

From 33f434b3885a1f39f5c8a8b002a837cfc406bdce Mon Sep 17 00:00:00 2001
From: Aleksandr Seliverstov
Date: Thu, 15 Mar 2018 18:40:29 +0300
Subject: [PATCH 2/2] feat: add tests for snips and samples

---
 deeppavlov/configs/intents/intents_snips.json |  4 +-
 .../basic_classification_reader.py            |  2 +-
 tests/test_quick_start.py                     | 41 ++++++++++---------
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/deeppavlov/configs/intents/intents_snips.json b/deeppavlov/configs/intents/intents_snips.json
index 7842d13c62..1fe43bf22a 100644
--- a/deeppavlov/configs/intents/intents_snips.json
+++ b/deeppavlov/configs/intents/intents_snips.json
@@ -1,6 +1,8 @@
 {
   "dataset_reader": {
-    "name": "csv_classification_reader",
+    "name": "basic_classification_reader",
+    "x": "text",
+    "y": "intents",
     "data_path": "snips",
     "url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.csv"
   },
diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py
index 7fa7dc3251..626988ddcd 100644
--- a/deeppavlov/dataset_readers/basic_classification_reader.py
+++ b/deeppavlov/dataset_readers/basic_classification_reader.py
@@ -80,7 +80,7 @@ def read(self, data_path, url=None, *args, **kwargs):
                     raise Exception('Unsupported file format: {}'.format(file_format))
 
                 x = kwargs.get("x", "text")
-                y = kwargs.get('y', 'label')
+                y = kwargs.get('y', 'labels')
                 class_sep = kwargs.get('class_sep', ',')
                 data[data_type] = [(row[x], row[y].split(class_sep)) for _, row in df.iterrows()]
             else:
diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
index fc0c1c699d..31c813276d 100644
--- a/tests/test_quick_start.py
+++ b/tests/test_quick_start.py
@@ -11,17 +11,17 @@
 
 
 # Mapping from model name to config-model_dir and corresponding query-response pairs.
-PARAMS = {"error_model": {("configs/error_model/brillmoore_wikitypos_en.json", "error_model"):
+PARAMS = {"error_model": {("configs/error_model/brillmoore_wikitypos_en.json", "error_model", True):
                               [
                                   ("helllo", "hello"),
                                   ("datha", "data")
                               ],
-                          ("configs/error_model/brillmoore_kartaslov_ru.json", "error_model"):
+                          ("configs/error_model/brillmoore_kartaslov_ru.json", "error_model", True):
                               [
 
                               ]
                           },
-          "go_bot": {("configs/go_bot/gobot_dstc2.json", "go_bot"):
+          "go_bot": {("configs/go_bot/gobot_dstc2.json", "go_bot", True):
                          [
 
                          ],
@@ -30,25 +30,24 @@
 #
 #                      ]
           },
-          "intents": {("configs/intents/intents_dstc2.json", "intents"):
-                          [
-
-                          ]
-                      },
-          "ner": {("configs/ner/ner_conll2003.json", "ner_conll2003_model"):
+          "intents": {("configs/intents/intents_dstc2.json", "intents", True): []},
+          "snips": {("configs/intents/intents_snips.json", "intents", False): []},
+          "sample": {("configs/intents/intents_sample_csv.json", "intents", False): [],
+                     ("configs/intents/intents_sample_json.json", "intents", False): []},
+          "ner": {("configs/ner/ner_conll2003.json", "ner_conll2003_model", True):
                      [
 # ("Albert Einstein and Erwin Schrodinger", "['B-PER', 'I-PER', 'O', 'B-PER', 'I-PER']"),
 # ("Antananarivo is the capital of Madagascar", "['B-LOC', 'O', 'O', 'O', 'O', 'B-LOC']"),
 # ("UN launches new global data collection tool to help reduce disaster",
 # "['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']")
                      ],
-                 ("configs/ner/ner_dstc2.json", "ner"):
+                 ("configs/ner/ner_dstc2.json", "ner", True):
                      [
 # ("chinese food", "['B-food', 'O']"),
 # ("in the west part", "['O', 'O', 'B-area', 'O']"),
 # ("moderate price range", "['B-pricerange', 'O', 'O']")
                      ],
-                 ("configs/ner/slotfill_dstc2.json", "ner"):
+                 ("configs/ner/slotfill_dstc2.json", "ner", True):
                      [
                          ("chinese food", "{'food': 'chinese'}"),
                          ("in the west part", "{'area': 'west'}"),
@@ -67,7 +66,7 @@ def setup_module():
 
     for m_name, conf_dict in PARAMS.items():
         test_configs_path.joinpath(m_name).mkdir()
-        for (conf_file, _), _ in conf_dict.items():
+        for (conf_file, _, _), _ in conf_dict.items():
             with (src_dir / conf_file).open() as fin:
                 config = json.load(fin)
             try:
@@ -91,7 +90,7 @@ def download(full=None):
     pexpect.run(cmd, timeout=None)
 
 
-@pytest.mark.parametrize("model,conf_file,model_dir", [(m, c, md) for m in PARAMS.keys() for c, md in PARAMS[m].keys()])
+@pytest.mark.parametrize("model,conf_file,model_dir,d", [(m, c, md, d) for m in PARAMS.keys() for c, md, d in PARAMS[m].keys()])
 class TestQuickStart(object):
 
     @staticmethod
@@ -108,15 +107,17 @@ def interact(conf_file, model_dir, qr_list=None):
         p.sendline("quit")
         assert p.expect(pexpect.EOF) == 0, f"Error in quitting from deep.py ({conf_file})"
 
-    def test_downloaded_model_existence(self, model, conf_file, model_dir):
-        if not download_path.exists():
-            download()
-        assert download_path.joinpath(model_dir).exists(), f"{model_dir} was not downloaded"
+    def test_downloaded_model_existence(self, model, conf_file, model_dir, d):
+        if d:
+            if not download_path.exists():
+                download()
+            assert download_path.joinpath(model_dir).exists(), f"{model_dir} was not downloaded"
 
-    def test_interacting_pretrained_model(self, model, conf_file, model_dir):
-        self.interact(tests_dir / conf_file, model_dir, PARAMS[model][(conf_file, model_dir)])
+    def test_interacting_pretrained_model(self, model, conf_file, model_dir, d):
+        if d:
+            self.interact(tests_dir / conf_file, model_dir, PARAMS[model][(conf_file, model_dir, d)])
 
-    def test_consecutive_training_and_interacting(self, model, conf_file, model_dir):
+    def test_consecutive_training_and_interacting(self, model, conf_file, model_dir, d):
         c = tests_dir / conf_file
         model_path = download_path / model_dir
         shutil.rmtree(str(model_path), ignore_errors=True)
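
Reviewer note: the behavioral heart of PATCH 1/2 is the expansion in train_model_from_config, where a unified top-level "dataset" block is split into "dataset_reader" and "dataset_iterator" configs. Below is a minimal standalone sketch of that step, assuming it mirrors the patched code; the sample_config literal is an illustrative stub, not a file shipped in this PR.

# Sketch of the `dataset` -> `dataset_reader`/`dataset_iterator` expansion
# introduced in deeppavlov/core/commands/train.py. `sample_config` is a
# made-up stub for illustration.
sample_config = {
    "dataset": {
        "type": "classification",
        "format": "csv",
        "train": "sample.csv",
        "data_path": "sample",
        "x": "text",
        "y": "classes"
    }
}

def expand_dataset_config(config: dict) -> dict:
    """Split a unified `dataset` block into reader and iterator configs."""
    dataset_config = config.pop('dataset', None)
    if dataset_config is None:
        return config  # old-style configs pass through untouched
    if dataset_config['type'] != 'classification':
        raise Exception("Unsupported dataset type: {}".format(dataset_config['type']))
    # Both components receive the full dataset block plus their own `name`,
    # so reader options (format, sep, header, x, y, ...) and iterator options
    # (field_to_split, split_fields, split_proportions, seed) ride along.
    config['dataset_reader'] = {**dataset_config, 'name': 'basic_classification_reader'}
    config['dataset_iterator'] = {**dataset_config, 'name': 'basic_classification_iterator'}
    return config

expanded = expand_dataset_config(sample_config)
assert expanded['dataset_reader']['name'] == 'basic_classification_reader'
assert expanded['dataset_iterator']['name'] == 'basic_classification_iterator'

One consequence of this design is that every key in the block reaches both components as kwargs, which is why the reader whitelists the options it forwards to pandas instead of passing **kwargs through.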
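Likewise, a hedged sketch of the reader's format dispatch and multi-label parsing in BasicClassificationDatasetReader.read: only whitelisted keys are forwarded to pandas, and label cells are split on class_sep. The inline CSV text, its column names, and the intent labels are made-up stand-ins, and io.StringIO stands in for the on-disk file the reader actually opens.

# Mimics the reader's csv/json branch on a tiny in-memory sample.
import io
import pandas as pd

kwargs = {"format": "csv", "sep": ",", "header": 0,
          "names": ["text", "classes"], "x": "text", "y": "classes",
          "class_sep": ","}

raw = io.StringIO('text,classes\n'
                  'book a table for two,BookRestaurant\n'
                  'play some jazz,"PlayMusic,AddToPlaylist"\n')

file_format = kwargs.get('format', 'csv')
if file_format == 'csv':
    # Forward only the pandas-relevant keys, exactly as the reader does.
    options = {k: kwargs[k] for k in ('sep', 'header', 'names') if k in kwargs}
    df = pd.read_csv(raw, **options)
elif file_format == 'json':
    options = {k: kwargs[k] for k in ('orient', 'lines') if k in kwargs}
    df = pd.read_json(raw, **options)
else:
    raise Exception('Unsupported file format: {}'.format(file_format))

x = kwargs.get('x', 'text')
y = kwargs.get('y', 'labels')
class_sep = kwargs.get('class_sep', ',')
samples = [(row[x], row[y].split(class_sep)) for _, row in df.iterrows()]
print(samples[1])  # ('play some jazz', ['PlayMusic', 'AddToPlaylist'])

Note that header=0 combined with explicit names drops the file's own header row, and a quoted CSV cell lets one sample carry several intents, which the split on class_sep turns into a label list.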