# loaddata.py
#-----IMPORTANT CONSTANTS-------------------------------------------------------
taskTypeMap = {
'c': 'classification',
'ic': 'imageclassification',
'tc': 'textclassification',
'r': 'regression'
}
datasetTaskMap = {
'iris': 'c',
'diabetes': 'r',
'digits': 'ic',
'wine': 'c',
'breast_cancer': 'c',
'california_housing': 'r',
'rcv1': 'tc',
'covtype': 'c'
}
DOWNLOAD_PATH = './datasets'
ONLINE_DOWNLOAD_PATH = DOWNLOAD_PATH + '/online_data.csv'
#-----------------------------------------------------------------------------
#-------UTILITY FUNCTIONS FOR DATA--------------------------------------------
def generateReturnDict(features, labels):
return {
'features': features,
'labels': labels,
}
# Reads a local CSV file and splits its columns into feature and label arrays.
def manipulateLocalData(file_path=ONLINE_DOWNLOAD_PATH, feature_attributes=None, label_attributes=None):
    import pandas as pd
    if feature_attributes is None or label_attributes is None:
        raise ValueError('Both feature and label attributes must be specified.')
    df = pd.read_csv(file_path)
    return generateReturnDict(df[feature_attributes].to_numpy(), df[label_attributes].to_numpy())
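# A minimal usage sketch for manipulateLocalData; the file name and column
# names below are hypothetical, not files shipped with this repository:
#   data = manipulateLocalData('./datasets/my_data.csv',
#                              feature_attributes=['height', 'weight'],
#                              label_attributes=['species'])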
# Converts a multiclass classification problem into a binary one by keeping
# only the two requested classes, relabelled as 0 and 1.
def extractZeroOneClasses(datadict, zeroClassLabel=0, oneClassLabel=1):
    features = datadict['features']
    labels = datadict['labels']
    newfeatures = []
    newlabels = []
    for feature, label in zip(features, labels):
        if label == zeroClassLabel:
            newlabels.append(0)
            newfeatures.append(feature)
        elif label == oneClassLabel:
            newlabels.append(1)
            newfeatures.append(feature)
    return generateReturnDict(newfeatures, newlabels)
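# Usage sketch: reduce the 10-class sklearn digits dataset to a binary
# 3-vs-8 problem (the chosen class labels are illustrative):
#   digits = loadData('sklearn', 'digits', task='imageclassification')
#   binary = extractZeroOneClasses(digits, zeroClassLabel=3, oneClassLabel=8)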
# Relabels a binary dataset in place; for example, to label one class -1 and
# the other +1 instead of 0 and 1 respectively.
def convertZeroOne(datadict, newZeroLabel, newOneLabel, oldZeroLabel=0, oldOneLabel=1):
    labels = datadict['labels']
    for index, curr_label in enumerate(labels):
        if curr_label == oldZeroLabel:
            labels[index] = newZeroLabel
        elif curr_label == oldOneLabel:
            labels[index] = newOneLabel
    return generateReturnDict(datadict['features'], labels)
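# Usage sketch: relabel the 0/1 classes as -1/+1, e.g. for SVM-style losses
# (continuing the hypothetical `binary` example above):
#   svm_ready = convertZeroOne(binary, newZeroLabel=-1, newOneLabel=1)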
#-------------------------------------------------------------------------------
#-------------CAN LOAD DATA FROM A VARIETY OF SOURCES---------------------------
def loadData(source, identifier, task='classification', **kwargs):
local_file_path = None
    if source == 'sklearn':
        if identifier in datasetTaskMap:
            if taskTypeMap[datasetTaskMap[identifier]] != task:
                raise ValueError('The specified sklearn dataset does not fit the specified machine learning task.')
        else:
            raise ValueError('The specified sklearn dataset is not recognised.')
        import sklearn.datasets as dat
        # Dispatch table mapping dataset names to their sklearn loader functions.
        loaders = {
            'iris': dat.load_iris,
            'diabetes': dat.load_diabetes,
            'digits': dat.load_digits,
            'wine': dat.load_wine,
            'breast_cancer': dat.load_breast_cancer,
            'california_housing': lambda: dat.fetch_california_housing(data_home=DOWNLOAD_PATH),
            'rcv1': lambda: dat.fetch_rcv1(data_home=DOWNLOAD_PATH),
            'covtype': lambda: dat.fetch_covtype(data_home=DOWNLOAD_PATH),
        }
        data = loaders[identifier]()
        # Generate the dictionary in the required format.
        return generateReturnDict(data['data'], data['target'])
    elif source == 'online':
        import os
        import requests
        # Download the file from the given URL.
        req = requests.get(identifier)
        req.raise_for_status()
        # Save it into the online download path, creating the directory if necessary.
        os.makedirs(DOWNLOAD_PATH, exist_ok=True)
        with open(ONLINE_DOWNLOAD_PATH, 'wb') as csv_file:
            csv_file.write(req.content)
        # The local data file path is now the place the file was downloaded to.
        local_file_path = ONLINE_DOWNLOAD_PATH
elif source == 'local':
        # The local data file path is simply the path passed in as the identifier.
local_file_path = identifier
else:
raise ValueError('The specified source of data is not recognised.')
    # If the function has not returned by this point, the data lives in a local CSV file
    # and needs to be manipulated into the desired format.
    # The kwargs specify which columns are features and which are labels.
    if 'feature_attributes' in kwargs and 'label_attributes' in kwargs:
        return manipulateLocalData(local_file_path, kwargs['feature_attributes'], kwargs['label_attributes'])
    else:
        raise ValueError('The feature and label attributes of the data both need to be specified.')
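# Usage sketches for each source; the URL and column names below are
# illustrative placeholders rather than a real endpoint:
#   iris = loadData('sklearn', 'iris', task='classification')
#   remote = loadData('online', 'https://example.com/data.csv',
#                     feature_attributes=['x1', 'x2'], label_attributes=['y'])
#   local = loadData('local', './datasets/my_data.csv',
#                    feature_attributes=['x1', 'x2'], label_attributes=['y'])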
#-------------------------------------------------------------------------------
#---------------SPLITS LOADED DATA INTO TRAIN, VALIDATION & TEST SETS-----------
# The test proportion defaults to zero because typically only training and validation sets are needed.
def trainTestSplit(data_dictionary, method='separate', validation_proportion=0.2, test_proportion=0.0):
    from sklearn.model_selection import train_test_split
    # Split off the test set, keeping the joint training-and-validation set together.
    if test_proportion <= 0:
        split1 = data_dictionary['features'], [], data_dictionary['labels'], []
    else:
        split1 = train_test_split(data_dictionary['features'], data_dictionary['labels'], test_size=test_proportion)
    X_train_and_validation, X_test, y_train_and_validation, y_test = split1
    # Rescale the validation proportion so it is relative to the remaining (non-test) data.
    rescaled_validation_proportion = validation_proportion/(1 - test_proportion)
if method == 'separate':
        # Generate the training and validation sets.
        if rescaled_validation_proportion <= 0:
            split2 = X_train_and_validation, [], y_train_and_validation, []
        else:
            split2 = train_test_split(X_train_and_validation, y_train_and_validation, test_size=rescaled_validation_proportion)
X_train, X_validation, y_train, y_validation = split2
return {
'training_features': X_train,
'training_labels': y_train,
'validation_features': X_validation,
'validation_labels': y_validation,
'test_features': X_test,
'test_labels': y_test
}
elif method == 'cross_validation':
if validation_proportion <= 0:
raise ValueError('Validation data size cannot be zero for cross validation.')
from sklearn.model_selection import KFold
# Calculate number of folds from proportion of validation data
n_splits = int(1/rescaled_validation_proportion)
# Create generator object
kf = KFold(n_splits=n_splits, shuffle=False)
return {
'trainval_features': X_train_and_validation,
'trainval_labels': y_train_and_validation,
'index_generator': kf.split(X_train_and_validation),
'no_splits': kf.get_n_splits(X_train_and_validation),
'test_features': X_test,
'test_labels': y_test
}
else:
raise ValueError('The specified method to split the data is not recognised.')
#------------------------------------------------------------------------
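#---------------EXAMPLE USAGE----------------------------------------------------
# A minimal end-to-end sketch, guarded so that importing this module stays
# side-effect free. The proportions chosen here are illustrative.
if __name__ == '__main__':
    iris = loadData('sklearn', 'iris', task='classification')
    # Plain hold-out split: 70% train, 20% validation, 10% test.
    splits = trainTestSplit(iris, method='separate', validation_proportion=0.2, test_proportion=0.1)
    print('training examples:', len(splits['training_features']))
    print('validation examples:', len(splits['validation_features']))
    print('test examples:', len(splits['test_features']))
    # Cross-validation split: iterate over the generated fold indices.
    cv = trainTestSplit(iris, method='cross_validation', validation_proportion=0.2)
    for train_idx, val_idx in cv['index_generator']:
        print('fold sizes:', len(train_idx), len(val_idx))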