-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathphysics.py
125 lines (108 loc) · 4.04 KB
/
physics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Pylearn2 dataset for physics data.
#__authors__ = "Peter Sadowski"
# May 2014
from pylearn2.datasets import dense_design_matrix
from pylearn2.datasets import control
from pylearn2.utils import serial
import os
import numpy as np
import pickle as pkl
class PHYSICS(dense_design_matrix.DenseDesignMatrix):
    """Pylearn2 dense dataset for the HIGGS and SUSY physics benchmarks.

    The data is read from a CSV file located in the directory named by the
    ``PYLEARN2_DATA_PATH`` environment variable. Column 0 of the file is
    used as the target and the remaining columns as features.

    Parameters
    ----------
    which_set : str
        'train', 'valid' or 'test' selects the corresponding contiguous
        slice of the file. Any other value keeps every row.
    benchmark : int
        1 for HIGGS (``HIGGS.csv``), 2 for SUSY (``SUSY.csv``).
    derived_feat : bool or str
        True (default) keeps all features; False keeps only the low-level
        features; 'only' keeps only the high-level features; 'regress'
        (HIGGS only) uses the low-level features as inputs and the
        high-level features as targets. The strings 'True' and 'False' are
        accepted as aliases for the booleans.
    version : str
        Accepted for compatibility; not used by this class.
    seed : int or None
        If not None, the rows of the selected set are shuffled with a
        ``numpy.random.RandomState`` seeded with this value.
    start, stop : int
        After set selection and shuffling, only rows ``start:stop`` are
        kept. ``stop`` is clamped to the number of available rows.

    Raises
    ------
    KeyError
        If ``PYLEARN2_DATA_PATH`` is not set.
    ValueError
        If ``benchmark`` is not a supported benchmark id.
    """

    # Per-benchmark layout: CSV file name, size of each split (rows are
    # taken in file order: train, then valid, then test), and the number of
    # low-level / high-level feature columns (low-level columns come first).
    # NOTE: only benchmarks 1 and 2 have a data file and split sizes, so no
    # other benchmark id can be loaded.
    _BENCHMARKS = {
        1: {'file': 'HIGGS.csv',
            'ntrain': 10000000, 'nvalid': 500000, 'ntest': 500000,
            'n_low': 21, 'n_high': 7},
        2: {'file': 'SUSY.csv',
            'ntrain': 4000000, 'nvalid': 500000, 'ntest': 500000,
            'n_low': 8, 'n_high': 10},
    }

    def __init__(self,
                 which_set,
                 benchmark,
                 derived_feat=True,
                 version='',
                 seed=None,  # Randomize data order if seed is not None
                 start=0,
                 stop=np.inf):
        # Must stay the first statement so that only the constructor
        # arguments (and self) are captured.
        self.args = locals()

        path = os.environ['PYLEARN2_DATA_PATH']

        # Allow the boolean flag to be given as a string.
        if derived_feat == 'False':
            derived_feat = False
        elif derived_feat == 'True':
            derived_feat = True

        # Fail early with a clear message instead of an unrelated NameError
        # further down when the benchmark id is unknown.
        try:
            spec = self._BENCHMARKS[benchmark]
        except (KeyError, TypeError):
            raise ValueError(
                'Unsupported benchmark %r: expected 1 (HIGGS) or 2 (SUSY).'
                % (benchmark,))

        inputfile = os.path.join(path, spec['file'])
        raw = np.loadtxt(inputfile, dtype='f4', delimiter=',')
        # Column 0 is the target; everything else is a feature. The copies
        # detach X and y from the raw array so it can be released.
        y = np.array(raw[:, 0].reshape((-1, 1)), dtype='float32')
        X = np.array(raw[:, 1:], dtype='float32')
        del raw
        # Single-argument call form prints identically on Python 2 and 3.
        print('Data loaded: benchmark%d.' % (benchmark))

        # Select a subset of examples.
        ntrain = spec['ntrain']
        nvalid = spec['nvalid']
        ntest = spec['ntest']
        if which_set == 'train':
            X = X[0:ntrain, :]
            y = y[0:ntrain, :]
        elif which_set == 'valid':
            X = X[ntrain:ntrain + nvalid, :]
            y = y[ntrain:ntrain + nvalid, :]
        elif which_set == 'test':
            X = X[ntrain + nvalid:ntrain + nvalid + ntest, :]
            y = y[ntrain + nvalid:ntrain + nvalid + ntest, :]
        # Any other value of which_set keeps all rows.

        # Decide which feature set to use.
        n_low = spec['n_low']
        n_all = n_low + spec['n_high']
        if derived_feat == 'only':
            # Only the high-level features.
            X = X[:, n_low:n_all]
        elif not derived_feat:
            # Only the low-level (raw) features.
            X = X[:, 0:n_low]
        elif derived_feat == 'regress' and benchmark == 1:
            # Predict high-level features from low-level ones (HIGGS only).
            y = X[:, n_low:n_all]
            X = X[:, 0:n_low]

        # Randomize data order. Use the caller's seed (0 is a valid seed)
        # rather than a hard-coded constant.
        if seed is not None:
            rng = np.random.RandomState(seed)
            indices = np.arange(X.shape[0])
            rng.shuffle(indices)
            X = X[indices, :]
            y = y[indices, :]

        # Limit number of samples. int() turns the default np.inf (after
        # clamping) or a float stop into a valid slice bound.
        stop = int(min(stop, X.shape[0]))
        X = X[start:stop, :]
        y = y[start:stop, :]

        # Initialize the superclass. DenseDesignMatrix
        super(PHYSICS, self).__init__(X=X, y=y)

    def standardize(self, X):
        """Rescale each column of ``X`` in place and return ``X``.

        For every feature column:

        1) If it contains negative values, it is assumed to be roughly
           Gaussian or uniform: it is centered to zero mean and scaled to
           unit standard deviation (scaling is skipped when the standard
           deviation is zero, to avoid producing NaN).
        2) Otherwise, if its maximum exceeds 1, it is assumed to be roughly
           exponential and is divided by its mean so the mean becomes 1.
        3) Otherwise it is left unchanged.

        Parameters
        ----------
        X : 2-D numpy array, one row per example.

        Returns
        -------
        The same array object ``X``, modified in place.
        """
        if X.shape[0] == 0:
            # Nothing to rescale; min/max of an empty column would raise.
            return X
        for j in range(X.shape[1]):
            col = X[:, j]
            if np.min(col) < 0:
                # Assume data is Gaussian or uniform -- center and
                # standardize.
                col = col - np.mean(col)
                spread = np.std(col)
                if spread > 0:
                    col = col / spread
            elif np.max(col) > 1.0:
                # Assume data is exponential -- just set mean to 1.
                col = col / np.mean(col)
            X[:, j] = col
        return X