-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
Copy pathbasic_classification_reader.py
112 lines (96 loc) · 4.73 KB
/
basic_classification_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logging import getLogger
from pathlib import Path
import pandas as pd
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download
# Module-level logger; the reader below uses it to report the train-data download and missing dataset files.
log = getLogger(__name__)
@register('basic_classification_reader')
class BasicClassificationDatasetReader(DatasetReader):
    """
    Class provides reading dataset in .csv or .json format
    """

    def read(self, data_path: str, url: str = None,
             format: str = "csv", class_sep: str = None,
             *args, **kwargs) -> dict:
        """
        Read dataset from data_path directory.
        Reading files are all data_types + extension
        (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" from
        data_path will be read)

        Args:
            data_path: directory with files
            url: download the train file from this url if it does not exist in data_path
            format: extension of files. Set of Values: ``"csv", "json"``
            class_sep: string separator of labels in column with labels.
                Default: None -> only one class per sample
            sep (str): delimiter for ``"csv"`` files
            header (int): row number to use as the column names
            names (array): list of column names to use
            orient (str): indication of expected JSON string format
            lines (boolean): read the file as a json object per line. Default: ``False``
            x (str or list): name of the column with inputs, or a list of such column names
                (then every x_i is a list of values). Default: ``"text"``
            y (str): name of the column with labels. Default: ``"labels"``
            train (str): file name of the train split. Default: ``"train." + format``.
                ``None`` skips the split
            valid (str): file name of the valid split. Default: ``"valid." + format``.
                ``None`` skips the split
            test (str): file name of the test split. Default: ``"test." + format``.
                ``None`` skips the split

        Returns:
            dictionary with keys ``"train"``, ``"valid"`` and ``"test"``.
            Each field of dictionary is a list of tuples (x_i, y_i), where y_i is a string,
            or a list of strings if ``class_sep`` is given.
            A split whose file is skipped or not found is an empty list.

        Raises:
            Exception: if the train file does not exist and ``url`` is not specified,
                or if ``format`` is not supported for a file that exists
        """
        data_types = ["train", "valid", "test"]

        # The default train file name has to follow ``format`` exactly like the reading loop
        # below does, otherwise a "json" dataset is checked (and downloaded) as "train.csv".
        train_file = kwargs.get('train', 'train.{}'.format(format))
        # ``train=None`` means "no train split" for the loop below, so there is nothing to check.
        if train_file is not None and not Path(data_path, train_file).exists():
            if url is None:
                raise Exception(
                    "data path {} does not exist or is empty, and download url parameter not specified!".format(
                        data_path))
            log.info("Loading train data from {} to {}".format(url, data_path))
            download(source_url=url, dest_file_path=Path(data_path, train_file))

        # Column selectors do not depend on the split, so resolve them once.
        x = kwargs.get("x", "text")
        y = kwargs.get('y', 'labels')

        data = {data_type: [] for data_type in data_types}
        for data_type in data_types:
            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format))
            if file_name is None:
                continue

            file = Path(data_path).joinpath(file_name)
            if not file.exists():
                log.warning("Cannot find {} file".format(file))
                continue

            df = self._load_frame(file, format, kwargs)
            data[data_type] = self._build_samples(df, x, y, class_sep)

        return data

    @staticmethod
    def _load_frame(file, format, kwargs):
        """Load ``file`` into a DataFrame, forwarding only the pandas options relevant to ``format``."""
        if format == 'csv':
            keys = ('sep', 'header', 'names')
            reader = pd.read_csv
        elif format == 'json':
            keys = ('orient', 'lines')
            reader = pd.read_json
        else:
            raise Exception('Unsupported file format: {}'.format(format))
        options = {k: kwargs[k] for k in keys if k in kwargs}
        return reader(file, **options)

    @staticmethod
    def _build_samples(df, x, y, class_sep):
        """Turn DataFrame rows into a list of (x_i, y_i) tuples."""
        multiple_inputs = isinstance(x, list)
        samples = []
        for _, row in df.iterrows():
            # a list of column names yields a list of values, a single name yields one value
            features = [row[x_] for x_ in x] if multiple_inputs else row[x]
            # labels are always stringified; with class_sep they become ["label", "label", ...]
            labels = str(row[y])
            if class_sep is not None:
                labels = labels.split(class_sep)
            samples.append((features, labels))
        return samples