From 4eae14e00d4ec457b696a9f40445e1d0e1369ed4 Mon Sep 17 00:00:00 2001
From: Aleksandr Seliverstov
Date: Thu, 15 Mar 2018 16:52:58 +0300
Subject: [PATCH 1/2] feat: add support for csv/json classification datasets

---
 deeppavlov/__init__.py                             |  2 +-
 .../configs/intents/intents_sample_csv.json        | 98 +++++++++++++++++++
 .../configs/intents/intents_sample_json.json       | 96 ++++++++++++++++++
 deeppavlov/core/commands/train.py                  | 17 +++-
 ...ader.py => basic_classification_reader.py}      | 36 +++++--
 5 files changed, 237 insertions(+), 12 deletions(-)
 create mode 100644 deeppavlov/configs/intents/intents_sample_csv.json
 create mode 100644 deeppavlov/configs/intents/intents_sample_json.json
 rename deeppavlov/dataset_readers/{csv_classification_reader.py => basic_classification_reader.py} (61%)

diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py
index c33e135cf8..d1e20e8556 100644
--- a/deeppavlov/__init__.py
+++ b/deeppavlov/__init__.py
@@ -10,7 +10,7 @@
 import deeppavlov.dataset_readers.dstc2_reader
 import deeppavlov.dataset_readers.conll2003_reader
 import deeppavlov.dataset_readers.typos_reader
-import deeppavlov.dataset_readers.csv_classification_reader
+import deeppavlov.dataset_readers.basic_classification_reader
 import deeppavlov.dataset_iterators.dialog_iterator
 import deeppavlov.dataset_iterators.dstc2_ner_iterator
 import deeppavlov.dataset_iterators.dstc2_intents_iterator
diff --git a/deeppavlov/configs/intents/intents_sample_csv.json b/deeppavlov/configs/intents/intents_sample_csv.json
new file mode 100644
index 0000000000..af0e35b856
--- /dev/null
+++ b/deeppavlov/configs/intents/intents_sample_csv.json
@@ -0,0 +1,98 @@
+{
+  "dataset": {
+    "type": "classification",
+    "format": "csv",
+    "sep": ",",
+    "header": 0,
+    "names": ["text", "classes"],
+    "class_sep": ",",
+    "train": "sample.csv",
+    "data_path": "sample",
+    "x": "text",
+    "y": "classes",
+    "url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.csv",
+    "seed": 42,
+    "field_to_split": "train",
+    "split_fields": [
+      "train",
+      "valid"
+    ],
+    "split_proportions": [
+      0.9,
+      0.1
+    ]
+  },
+  "chainer": {
+    "in": ["x"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "id": "classes_vocab",
+        "name": "default_vocab",
+        "fit_on": ["y"],
+        "level": "token",
+        "save_path": "vocabs/snips_classes.dict",
+        "load_path": "vocabs/snips_classes.dict"
+      },
+      {
+        "in": ["x"],
+        "in_y": ["y"],
+        "out": ["y_predicted"],
+        "main": true,
+        "name": "intent_model",
+        "save_path": "intents/intent_cnn_snips_v2",
+        "load_path": "intents/intent_cnn_snips_v2",
+        "classes": "#classes_vocab.keys()",
+        "opt": {
+          "kernel_sizes_cnn": [
+            1,
+            2,
+            3
+          ],
+          "filters_cnn": 256,
+          "lear_metrics": [
+            "binary_accuracy",
+            "fmeasure"
+          ],
+          "confident_threshold": 0.5,
+          "optimizer": "Adam",
+          "lear_rate": 0.01,
+          "lear_rate_decay": 0.1,
+          "loss": "binary_crossentropy",
+          "text_size": 15,
+          "coef_reg_cnn": 1e-4,
+          "coef_reg_den": 1e-4,
+          "dropout_rate": 0.5,
+          "epochs": 1000,
+          "dense_size": 100,
+          "model_name": "cnn_model"
+        },
+        "embedder": {
+          "name": "fasttext",
+          "save_path": "embeddings/dstc2_fastText_model.bin",
+          "load_path": "embeddings/dstc2_fastText_model.bin",
+          "emb_module": "fasttext",
+          "dim": 100
+        },
+        "tokenizer": {
+          "name": "nltk_tokenizer",
+          "tokenizer": "wordpunct_tokenize"
+        }
+      }
+    ],
+    "out": ["y_predicted"]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "sets_accuracy"
+    ],
+    "validation_patience": 5,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "validate_best": true,
+    "test_best": false
+  }
+}
\ No newline at end of file
diff --git a/deeppavlov/configs/intents/intents_sample_json.json b/deeppavlov/configs/intents/intents_sample_json.json
new file mode 100644
index 0000000000..e471b9fd1f
--- /dev/null
+++ b/deeppavlov/configs/intents/intents_sample_json.json
@@ -0,0 +1,96 @@
+{
+  "dataset": {
+    "type": "classification",
+    "format": "json",
+    "orient": "records",
+    "lines": true,
+    "data_path": "sample",
+    "train": "sample.json",
+    "x": "text",
+    "y": "intents",
+    "url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.json",
+    "seed": 42,
+    "field_to_split": "train",
+    "split_fields": [
+      "train",
+      "valid"
+    ],
+    "split_proportions": [
+      0.9,
+      0.1
+    ]
+  },
+  "chainer": {
+    "in": ["x"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "id": "classes_vocab",
+        "name": "default_vocab",
+        "fit_on": ["y"],
+        "level": "token",
+        "save_path": "vocabs/snips_classes.dict",
+        "load_path": "vocabs/snips_classes.dict"
+      },
+      {
+        "in": ["x"],
+        "in_y": ["y"],
+        "out": ["y_predicted"],
+        "main": true,
+        "name": "intent_model",
+        "save_path": "intents/intent_cnn_snips_v2",
+        "load_path": "intents/intent_cnn_snips_v2",
+        "classes": "#classes_vocab.keys()",
+        "opt": {
+          "kernel_sizes_cnn": [
+            1,
+            2,
+            3
+          ],
+          "filters_cnn": 256,
+          "lear_metrics": [
+            "binary_accuracy",
+            "fmeasure"
+          ],
+          "confident_threshold": 0.5,
+          "optimizer": "Adam",
+          "lear_rate": 0.01,
+          "lear_rate_decay": 0.1,
+          "loss": "binary_crossentropy",
+          "text_size": 15,
+          "coef_reg_cnn": 1e-4,
+          "coef_reg_den": 1e-4,
+          "dropout_rate": 0.5,
+          "epochs": 1000,
+          "dense_size": 100,
+          "model_name": "cnn_model"
+        },
+        "embedder": {
+          "name": "fasttext",
+          "save_path": "embeddings/dstc2_fastText_model.bin",
+          "load_path": "embeddings/dstc2_fastText_model.bin",
+          "emb_module": "fasttext",
+          "dim": 100
+        },
+        "tokenizer": {
+          "name": "nltk_tokenizer",
+          "tokenizer": "wordpunct_tokenize"
+        }
+      }
+    ],
+    "out": ["y_predicted"]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "sets_accuracy"
+    ],
+    "validation_patience": 5,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "validate_best": true,
+    "test_best": false
+  }
+}
\ No newline at end of file
diff --git a/deeppavlov/core/commands/train.py b/deeppavlov/core/commands/train.py
index f89b1025cd..09702c87ae 100644
--- a/deeppavlov/core/commands/train.py
+++ b/deeppavlov/core/commands/train.py
@@ -73,6 +73,19 @@ def train_model_from_config(config_path: str):
     config = read_json(config_path)
     set_deeppavlov_root(config)
 
+    dataset_config = config.get('dataset', None)
+
+    if dataset_config is not None:
+        del config['dataset']
+        ds_type = dataset_config['type']
+        if ds_type == 'classification':
+            reader = {'name': 'basic_classification_reader'}
+            iterator = {'name': 'basic_classification_iterator'}
+            config['dataset_reader'] = {**dataset_config, **reader}
+            config['dataset_iterator'] = {**dataset_config, **iterator}
+        else:
+            raise Exception("Unsupported dataset type: {}".format(ds_type))
+
     reader_config = config['dataset_reader']
     reader = get_model(reader_config['name'])()
     data_path = expand_path(reader_config.get('data_path', ''))
@@ -81,8 +94,8 @@ def train_model_from_config(config_path: str):
     if "data_path" in kwargs:
         del kwargs["data_path"]
     data = reader.read(data_path, **kwargs)
-    dataset_config = config['dataset_iterator']
-    dataset: BasicDatasetIterator = from_params(dataset_config, data=data)
+    iterator_config = config['dataset_iterator']
+    dataset: BasicDatasetIterator = from_params(iterator_config, data=data)
 
     if 'chainer' in config:
         model = fit_chainer(config, dataset)
diff --git a/deeppavlov/dataset_readers/csv_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py
similarity index 61%
rename from deeppavlov/dataset_readers/csv_classification_reader.py
rename to deeppavlov/dataset_readers/basic_classification_reader.py
index 2d1f64bfab..7fa7dc3251 100644
--- a/deeppavlov/dataset_readers/csv_classification_reader.py
+++ b/deeppavlov/dataset_readers/basic_classification_reader.py
@@ -28,8 +28,8 @@
 log = get_logger(__name__)
 
 
-@register('csv_classification_reader')
-class CsvClassificationDatasetReader(DatasetReader):
+@register('basic_classification_reader')
+class BasicClassificationDatasetReader(DatasetReader):
     """
     Class provides reading dataset in .csv format
     """
@@ -52,20 +52,38 @@ def read(self, data_path, url=None, *args, **kwargs):
         """
         data_types = ["train", "valid", "test"]
 
-        if not Path(data_path, "train.csv").exists():
+        train_file = format(kwargs.get('train', 'train.csv'))
+
+        if not Path(data_path, train_file).exists():
             if url is None:
                 raise Exception(
                     "data path {} is not exists or empty and download url parameter not specified!".format(data_path))
             log.info("Loading train data from {} to {}".format(url, data_path))
-            download(source_url=url, dest_file_path=Path(data_path, "train.csv"))
+            download(source_url=url, dest_file_path=Path(data_path, train_file))
 
         data = {"train": [], "valid": [], "test": []}
         for data_type in data_types:
-            try:
-                df = pd.read_csv(Path(data_path).joinpath(data_type + ".csv"))
-                data[data_type] = [(row['text'], row['intents'].split(',')) for _, row in df.iterrows()]
-            except FileNotFoundError:
-                log.warning("Cannot find {}.csv data file".format(data_type))
+            file_format = kwargs.get('format', 'csv')
+            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, file_format))
+            file = Path(data_path).joinpath(file_name)
+            if file.exists():
+                if file_format == 'csv':
+                    keys = ('sep', 'header', 'names')
+                    options = {k: kwargs[k] for k in keys if k in kwargs}
+                    df = pd.read_csv(file, **options)
+                elif file_format == 'json':
+                    keys = ('orient', 'lines')
+                    options = {k: kwargs[k] for k in keys if k in kwargs}
+                    df = pd.read_json(file, **options)
+                else:
+                    raise Exception('Unsupported file format: {}'.format(file_format))
+
+                x = kwargs.get("x", "text")
+                y = kwargs.get('y', 'label')
+                class_sep = kwargs.get('class_sep', ',')
+                data[data_type] = [(row[x], row[y].split(class_sep)) for _, row in df.iterrows()]
+            else:
+                log.warning("Cannot find {} file".format(file))
 
         return data

From 33f434b3885a1f39f5c8a8b002a837cfc406bdce Mon Sep 17 00:00:00 2001
From: Aleksandr Seliverstov
Date: Thu, 15 Mar 2018 18:40:29 +0300
Subject: [PATCH 2/2] feat: add tests for snips and samples

---
 deeppavlov/configs/intents/intents_snips.json |  4 +-
 .../basic_classification_reader.py            |  2 +-
 tests/test_quick_start.py                     | 41 ++++++++++---------
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/deeppavlov/configs/intents/intents_snips.json b/deeppavlov/configs/intents/intents_snips.json
index 7842d13c62..1fe43bf22a 100644
--- a/deeppavlov/configs/intents/intents_snips.json
+++ b/deeppavlov/configs/intents/intents_snips.json
@@ -1,6 +1,8 @@
 {
   "dataset_reader": {
-    "name": "csv_classification_reader",
+    "name": "basic_classification_reader",
+    "x": "text",
+    "y": "intents",
     "data_path": "snips",
     "url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.csv"
   },
diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py
index 7fa7dc3251..626988ddcd 100644
--- a/deeppavlov/dataset_readers/basic_classification_reader.py
+++ b/deeppavlov/dataset_readers/basic_classification_reader.py
@@ -80,7 +80,7 @@ def read(self, data_path, url=None, *args, **kwargs):
                     raise Exception('Unsupported file format: {}'.format(file_format))
 
                 x = kwargs.get("x", "text")
-                y = kwargs.get('y', 'label')
+                y = kwargs.get('y', 'labels')
                 class_sep = kwargs.get('class_sep', ',')
                 data[data_type] = [(row[x], row[y].split(class_sep)) for _, row in df.iterrows()]
             else:
diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
index fc0c1c699d..31c813276d 100644
--- a/tests/test_quick_start.py
+++ b/tests/test_quick_start.py
@@ -11,17 +11,17 @@
 
 
 # Mapping from model name to config-model_dir and corresponding query-response pairs.
-PARAMS = {"error_model": {("configs/error_model/brillmoore_wikitypos_en.json", "error_model"):
+PARAMS = {"error_model": {("configs/error_model/brillmoore_wikitypos_en.json", "error_model", True):
                               [
                                   ("helllo", "hello"),
                                   ("datha", "data")
                               ],
-                          ("configs/error_model/brillmoore_kartaslov_ru.json", "error_model"):
+                          ("configs/error_model/brillmoore_kartaslov_ru.json", "error_model", True):
                               [
 
                               ]
                           },
-          "go_bot": {("configs/go_bot/gobot_dstc2.json", "go_bot"):
+          "go_bot": {("configs/go_bot/gobot_dstc2.json", "go_bot", True):
                          [
 
                          ],
@@ -30,25 +30,24 @@
 #
 #                      ]
           },
-          "intents": {("configs/intents/intents_dstc2.json", "intents"):
-                          [
-
-                          ]
-                      },
-          "ner": {("configs/ner/ner_conll2003.json", "ner_conll2003_model"):
+          "intents": {("configs/intents/intents_dstc2.json", "intents", True): []},
+          "snips": {("configs/intents/intents_snips.json", "intents", False): []},
+          "sample": {("configs/intents/intents_sample_csv.json", "intents", False): [],
+                     ("configs/intents/intents_sample_json.json", "intents", False): []},
+          "ner": {("configs/ner/ner_conll2003.json", "ner_conll2003_model", True):
                      [
 # ("Albert Einstein and Erwin Schrodinger", "['B-PER', 'I-PER', 'O', 'B-PER', 'I-PER']"),
 # ("Antananarivo is the capital of Madagascar", "['B-LOC', 'O', 'O', 'O', 'O', 'B-LOC']"),
 # ("UN launches new global data collection tool to help reduce disaster",
 # "['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']")
                      ],
-                 ("configs/ner/ner_dstc2.json", "ner"):
+                 ("configs/ner/ner_dstc2.json", "ner", True):
                      [
 # ("chinese food", "['B-food', 'O']"),
 # ("in the west part", "['O', 'O', 'B-area', 'O']"),
 # ("moderate price range", "['B-pricerange', 'O', 'O']")
                      ],
-                 ("configs/ner/slotfill_dstc2.json", "ner"):
+                 ("configs/ner/slotfill_dstc2.json", "ner", True):
                      [
                          ("chinese food", "{'food': 'chinese'}"),
                          ("in the west part", "{'area': 'west'}"),
@@ -67,7 +66,7 @@ def setup_module():
 
     for m_name, conf_dict in PARAMS.items():
         test_configs_path.joinpath(m_name).mkdir()
-        for (conf_file, _), _ in conf_dict.items():
+        for (conf_file, _, _), _ in conf_dict.items():
             with (src_dir / conf_file).open() as fin:
                 config = json.load(fin)
             try:
@@ -91,7 +90,7 @@ def download(full=None):
     pexpect.run(cmd, timeout=None)
 
 
-@pytest.mark.parametrize("model,conf_file,model_dir", [(m, c, md) for m in PARAMS.keys() for c, md in PARAMS[m].keys()])
+@pytest.mark.parametrize("model,conf_file,model_dir,d", [(m, c, md, d) for m in PARAMS.keys() for c, md, d in PARAMS[m].keys()])
 class TestQuickStart(object):
 
     @staticmethod
@@ -108,15 +107,17 @@ def interact(conf_file, model_dir, qr_list=None):
         p.sendline("quit")
         assert p.expect(pexpect.EOF) == 0, f"Error in quitting from deep.py ({conf_file})"
 
-    def test_downloaded_model_existence(self, model, conf_file, model_dir):
-        if not download_path.exists():
-            download()
-        assert download_path.joinpath(model_dir).exists(), f"{model_dir} was not downloaded"
+    def test_downloaded_model_existence(self, model, conf_file, model_dir, d):
+        if d:
+            if not download_path.exists():
+                download()
+            assert download_path.joinpath(model_dir).exists(), f"{model_dir} was not downloaded"
 
-    def test_interacting_pretrained_model(self, model, conf_file, model_dir):
-        self.interact(tests_dir / conf_file, model_dir, PARAMS[model][(conf_file, model_dir)])
+    def test_interacting_pretrained_model(self, model, conf_file, model_dir, d):
+        if d:
+            self.interact(tests_dir / conf_file, model_dir, PARAMS[model][(conf_file, model_dir, d)])
 
-    def test_consecutive_training_and_interacting(self, model, conf_file, model_dir):
+    def test_consecutive_training_and_interacting(self, model, conf_file, model_dir, d):
         c = tests_dir / conf_file
         model_path = download_path / model_dir
         shutil.rmtree(str(model_path), ignore_errors=True)
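
Reviewer note: the behavioral heart of PATCH 1/2 is the expansion in train_model_from_config, where a unified top-level "dataset" block is split into "dataset_reader" and "dataset_iterator" configs. Below is a minimal standalone sketch of that step, assuming it mirrors the patched code; the sample_config literal is an illustrative stub, not a file shipped in this PR.

# Sketch of the `dataset` -> `dataset_reader`/`dataset_iterator` expansion
# introduced in deeppavlov/core/commands/train.py. `sample_config` is a
# made-up stub for illustration.
sample_config = {
    "dataset": {
        "type": "classification",
        "format": "csv",
        "train": "sample.csv",
        "data_path": "sample",
        "x": "text",
        "y": "classes"
    }
}

def expand_dataset_config(config: dict) -> dict:
    """Split a unified `dataset` block into reader and iterator configs."""
    dataset_config = config.pop('dataset', None)
    if dataset_config is None:
        return config  # old-style configs pass through untouched
    if dataset_config['type'] != 'classification':
        raise Exception("Unsupported dataset type: {}".format(dataset_config['type']))
    # Both components receive the full dataset block plus their own `name`,
    # so reader options (format, sep, header, x, y, ...) and iterator options
    # (field_to_split, split_fields, split_proportions, seed) ride along.
    config['dataset_reader'] = {**dataset_config, 'name': 'basic_classification_reader'}
    config['dataset_iterator'] = {**dataset_config, 'name': 'basic_classification_iterator'}
    return config

expanded = expand_dataset_config(sample_config)
assert expanded['dataset_reader']['name'] == 'basic_classification_reader'
assert expanded['dataset_iterator']['name'] == 'basic_classification_iterator'

One consequence of this design is that every key in the block reaches both components as kwargs, which is why the reader whitelists the options it forwards to pandas instead of passing **kwargs through.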
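Likewise, a hedged sketch of the reader's format dispatch and multi-label parsing in BasicClassificationDatasetReader.read: only whitelisted keys are forwarded to pandas, and label cells are split on class_sep. The inline CSV text, its column names, and the intent labels are made-up stand-ins, and io.StringIO stands in for the on-disk file the reader actually opens.

# Mimics the reader's csv/json branch on a tiny in-memory sample.
import io
import pandas as pd

kwargs = {"format": "csv", "sep": ",", "header": 0,
          "names": ["text", "classes"], "x": "text", "y": "classes",
          "class_sep": ","}

raw = io.StringIO('text,classes\n'
                  'book a table for two,BookRestaurant\n'
                  'play some jazz,"PlayMusic,AddToPlaylist"\n')

file_format = kwargs.get('format', 'csv')
if file_format == 'csv':
    # Forward only the pandas-relevant keys, exactly as the reader does.
    options = {k: kwargs[k] for k in ('sep', 'header', 'names') if k in kwargs}
    df = pd.read_csv(raw, **options)
elif file_format == 'json':
    options = {k: kwargs[k] for k in ('orient', 'lines') if k in kwargs}
    df = pd.read_json(raw, **options)
else:
    raise Exception('Unsupported file format: {}'.format(file_format))

x = kwargs.get('x', 'text')
y = kwargs.get('y', 'labels')
class_sep = kwargs.get('class_sep', ',')
samples = [(row[x], row[y].split(class_sep)) for _, row in df.iterrows()]
print(samples[1])  # ('play some jazz', ['PlayMusic', 'AddToPlaylist'])

Note that header=0 combined with explicit names drops the file's own header row, and a quoted CSV cell lets one sample carry several intents, which the split on class_sep turns into a label list.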