mnist_cca.py (forked from TenninYan/quantum-mnist)
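Reduces MNIST images to 16-dimensional feature vectors with CCA, normalizes each vector to the range [0, 1], and caches the results as pickle files.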
import tensorflow as tf
# Note: tensorflow.examples.tutorials.mnist ships with TensorFlow 1.x only;
# it was removed in TensorFlow 2.x.
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.cross_decomposition import CCA
import numpy as np
import os
import pickle

cca_vectors_base_path = "data/cca_reduced_vectors_normalized"
def get_data(data_set, one_hot=False):
    """ Fetches and returns data and labels from the MNIST data set.
    Args:
        data_set: one of "train", "test", "validation"
        one_hot: whether to generate one-hot labels
    Returns:
        data: (n, 784) matrix of flattened images, e.g. (55000, 784) for train
        labels: (n,) vector of digits, or (n, 10) matrix if one_hot is True
    """
    options = ["train", "test", "validation"]
    if data_set not in options:
        raise ValueError("data_set must be one of train, test, or validation")
    # Downloads MNIST into MNIST_data/ on first use.
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=one_hot)
    if data_set == "train":
        return mnist.train.images, mnist.train.labels
    elif data_set == "test":
        return mnist.test.images, mnist.test.labels
    elif data_set == "validation":
        return mnist.validation.images, mnist.validation.labels
def CCA_reduction(data_set="train"):
    """ Reduces dimensions of MNIST data using CCA.
    Returns:
        X_scores: data reduced to 16-dimensional vectors
        labels: one-hot data labels
    """
    data, labels = get_data(data_set, one_hot=True)
    # With one-hot labels Y has 10 columns; recent scikit-learn versions cap
    # n_components at min(n_features, n_targets), so requesting 16 components
    # may require an older scikit-learn release.
    X_scores, _ = CCA(n_components=16).fit_transform(data, labels)
    return X_scores, labels
def normalize_row(row):
    """ Normalizes values in a row to the range [0, 1].
    Args:
        row: list of values
    Returns:
        list of values normalized between 0 and 1
    """
    max_val = max(row)
    min_val = min(row)
    diff = max_val - min_val
    # np.float is deprecated in NumPy 1.20+; plain float behaves the same here.
    return [float(x - min_val) / float(diff) for x in row]
def normalize_scores(data):
    """ Normalizes a matrix row by row.
    Args:
        data: matrix to normalize
    Returns:
        matrix where each row is normalized between 0 and 1
    """
    return [normalize_row(row) for row in data]
def test_normalize_scores():
    """ Tests that rows are normalized correctly.
    """
    # CCA_reduction returns (scores, labels); the labels are not needed here.
    X_scores, _ = CCA_reduction()
    normalized = normalize_scores(X_scores)
    for row in normalized:
        assert max(row) <= 1.0
        assert min(row) >= 0.0
    print("Rows normalized correctly")
def get_cca_data_as_matrices(data_set="train"):
    """ Normalizes data and returns data and labels as separate lists.
    Args:
        data_set: data set to retrieve
    Returns:
        data: list of normalized feature vectors
        labels: list of one-hot label vectors
    """
    X_scores, labels = CCA_reduction(data_set=data_set)
    data = normalize_scores(X_scores)
    return data, labels
def get_cca_vectors(data_set="train", load_files=True):
    """ Retrieves pickled CCA vectors, generating and caching them if needed.
    Args:
        data_set: one of train, test, validation
        load_files: whether to load the pkl files or generate new ones
    Returns:
        data: list of dictionaries with two keys, "features" and "labels".
            "features" holds the normalized vector and "labels" the one-hot list.
    """
    # The base path and data_set concatenate directly, yielding names like
    # "data/cca_reduced_vectors_normalizedtrain.pkl".
    full_path = cca_vectors_base_path + data_set + ".pkl"
    if os.path.isfile(full_path) and load_files:
        with open(full_path, "rb") as fp:
            data = pickle.load(fp)
        return data
    else:
        X_scores, labels = CCA_reduction(data_set=data_set)
        normalized = normalize_scores(X_scores)
        data = []
        assert len(labels) == len(normalized)
        for idx, row in enumerate(normalized):
            data.append({
                "features": row,
                "labels": labels[idx]
            })
        # Make sure the target directory exists before writing the cache.
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        with open(full_path, "wb") as fp:
            pickle.dump(data, fp)
        return data
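

# A minimal usage sketch (not part of the original module): builds the cached
# training vectors and runs the normalization check when this file is executed
# directly. Assumes MNIST_data/ is downloadable and data/ is writable.
if __name__ == "__main__":
    vectors = get_cca_vectors(data_set="train", load_files=True)
    print("Loaded {} vectors; first has {} features".format(
        len(vectors), len(vectors[0]["features"])))
    test_normalize_scores()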