utils.py
import math
import csv
import copy
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.cluster import estimate_bandwidth, MeanShift
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import gridspec
# Coordinates of Porto's city centre
porto = [41.1579, -8.6291]

def np_haversine(latlon1, latlon2):
    """
    Numpy version of the Haversine function to calculate distances between two sets of points.
    Converted to Python from the R version provided in the competition's evaluation script.
    Returns the distance in km.
    """
    lat1 = latlon1[:, 0]
    lon1 = latlon1[:, 1]
    lat2 = latlon2[:, 0]
    lon2 = latlon2[:, 1]
    REarth = 6371
    lat = np.abs(lat1 - lat2) * np.pi / 180
    lon = np.abs(lon1 - lon2) * np.pi / 180
    lat1 = lat1 * np.pi / 180
    lat2 = lat2 * np.pi / 180
    a = np.sin(lat / 2) * np.sin(lat / 2) + np.cos(lat1) * np.cos(lat2) * np.sin(lon / 2) * np.sin(lon / 2)
    d = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return REarth * d
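

# Illustrative sketch of how np_haversine can be called: the hard-coded points
# below are the approximate city centres of Porto and Lisbon, so the returned
# distance should come out at roughly 274 km.
def _example_np_haversine():
    porto_coords = np.array([[41.1579, -8.6291]])
    lisbon_coords = np.array([[38.7223, -9.1393]])
    return np_haversine(porto_coords, lisbon_coords)
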

def tf_haversine(latlon1, latlon2):
    """
    Tensorflow version of the Haversine function to calculate distances between two sets of points.
    """
    lat1 = latlon1[:, 0]
    lon1 = latlon1[:, 1]
    lat2 = latlon2[:, 0]
    lon2 = latlon2[:, 1]
    REarth = 6371
    lat = tf.abs(lat1 - lat2) * np.pi / 180
    lon = tf.abs(lon1 - lon2) * np.pi / 180
    lat1 = lat1 * np.pi / 180
    lat2 = lat2 * np.pi / 180
    a = tf.sin(lat / 2) * tf.sin(lat / 2) + tf.cos(lat1) * tf.cos(lat2) * tf.sin(lon / 2) * tf.sin(lon / 2)
    d = 2 * tf_atan2(tf.sqrt(a), tf.sqrt(1 - a))
    return REarth * d

def tf_atan2(y, x):
    """
    Tensorflow doesn't have an Atan2 function (at least not yet, see: https://github.com/tensorflow/tensorflow/issues/6095).
    So we define it here ourselves.
    """
    angle = tf.select(tf.greater(x, 0.0), tf.atan(y / x), tf.zeros_like(x))
    angle = tf.select(tf.logical_and(tf.less(x, 0.0), tf.greater_equal(y, 0.0)), tf.atan(y / x) + np.pi, angle)
    angle = tf.select(tf.logical_and(tf.less(x, 0.0), tf.less(y, 0.0)), tf.atan(y / x) - np.pi, angle)
    angle = tf.select(tf.logical_and(tf.equal(x, 0.0), tf.greater(y, 0.0)), 0.5 * np.pi * tf.ones_like(x), angle)
    angle = tf.select(tf.logical_and(tf.equal(x, 0.0), tf.less(y, 0.0)), -0.5 * np.pi * tf.ones_like(x), angle)
    angle = tf.select(tf.logical_and(tf.equal(x, 0.0), tf.equal(y, 0.0)), np.nan * tf.zeros_like(x), angle)
    return angle
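

# Illustrative sketch of how tf_haversine can be evaluated. It assumes the same
# pre-1.0 TensorFlow API used above (tf.select, tf.Session); the two hard-coded
# points are the approximate city centres of Porto and Lisbon.
def _example_tf_haversine():
    latlon1 = tf.constant([[41.1579, -8.6291]])
    latlon2 = tf.constant([[38.7223, -9.1393]])
    with tf.Session() as session:
        return session.run(tf_haversine(latlon1, latlon2))
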

def get_clusters(coords):
    """
    Estimate clusters for the given list of coordinates.
    """
    # First, coarsely reduce the spatial dataset by rounding the coordinates to the 4th decimal
    # place (i.e. roughly 11 meters of precision. See: https://en.wikipedia.org/wiki/Decimal_degrees)
    clusters = pd.DataFrame({
        'approx_latitudes': coords[:, 0].round(4),
        'approx_longitudes': coords[:, 1].round(4)
    })
    clusters = clusters.drop_duplicates(['approx_latitudes', 'approx_longitudes'])
    # .values replaces the deprecated DataFrame.as_matrix(), which recent pandas versions no longer provide
    clusters = clusters.values
    # Further reduce the number of clusters with mean-shift clustering
    # (Note: the quantile parameter was tuned to find a significant and reasonable number of clusters)
    bandwidth = estimate_bandwidth(clusters, quantile=0.0002)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(clusters)
    return ms.cluster_centers_
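

# Illustrative sketch of how get_clusters can be called: it expects an (N, 2)
# array of [latitude, longitude] pairs. The synthetic cloud below merely stands
# in for the competition's trip destinations; the hard-coded quantile was tuned
# for that much larger dataset, so the cluster count obtained here is not
# meaningful in itself.
def _example_get_clusters():
    rng = np.random.RandomState(0)
    coords = np.array(porto) + rng.normal(scale=0.02, size=(20000, 2))
    return get_clusters(coords)
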

def density_map(latitudes, longitudes, center=porto, bins=1000, radius=0.1):
    """
    Displays a density map in a matplotlib histogram for all the points
    defined by the given latitudes and longitudes.
    """
    # LogNorm will flag 0-value pixels as "bad" and color them in white.
    # So we tweak the color map to color them in black.
    cmap = copy.copy(plt.cm.jet)
    cmap.set_bad((0, 0, 0))  # Fill background with black
    # Center the map around the provided center coordinates
    histogram_range = [
        [center[1] - radius, center[1] + radius],
        [center[0] - radius, center[0] + radius]
    ]
    plt.figure(figsize=(5, 5))
    plt.hist2d(longitudes, latitudes, bins=bins, norm=LogNorm(),
               cmap=cmap, range=histogram_range)
    # Remove all axes and annotations to keep the map clean and simple
    plt.grid(False)  # Boolean form; recent matplotlib no longer accepts the 'off' string here
    plt.axis('off')
    plt.gca().xaxis.set_visible(False)
    plt.gca().yaxis.set_visible(False)
    plt.tight_layout()
    plt.show()
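

# Illustrative sketch of how density_map can be called: it only needs two
# equally sized arrays of latitudes and longitudes. The synthetic points below
# are scattered around Porto's city centre purely for demonstration.
def _example_density_map():
    rng = np.random.RandomState(0)
    latitudes = porto[0] + rng.normal(scale=0.02, size=10000)
    longitudes = porto[1] + rng.normal(scale=0.02, size=10000)
    density_map(latitudes, longitudes)
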

def plot_embeddings(embeddings):
    """
    Plot all the given embeddings on a 2D plane using t-SNE.
    Each element of `embeddings` is a (title, layer) pair, where the layer's
    first weight matrix (as returned by `get_weights()`) holds the embedding.
    """
    # Determine the dimension of the subplots' grid
    N = len(embeddings)
    cols = 2
    rows = int(math.ceil(N / float(cols)))
    gs = gridspec.GridSpec(rows, cols)
    fig = plt.figure(figsize=(8, 7))
    for i, embedding in enumerate(embeddings):
        ax = fig.add_subplot(gs[i])
        weights = embedding[1].get_weights()[0]
        names = range(weights.shape[0])
        # Compute t-SNE to project the embedding's weights onto a 2D plane
        tsne = TSNE(n_components=2, random_state=0)
        tsne = tsne.fit_transform(weights)
        x, y = tsne[:, 0], tsne[:, 1]
        # Plot the points
        scatter = ax.scatter(x, y, alpha=0.7, c=names, s=40, cmap="jet")
        fig.colorbar(scatter, ax=ax)
        # Display the scatter point annotations
        for j, name in enumerate(names):
            ax.annotate(name, (x[j], y[j]), size=6)
        # Make sure all points fit nicely within the space
        x_delta = x.max() - x.min()
        x_margin = x_delta / 10
        y_delta = y.max() - y.min()
        y_margin = y_delta / 10
        ax.set_xlim(x.min() - x_margin, x.max() + x_margin)
        ax.set_ylim(y.min() - y_margin, y.max() + y_margin)
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.set_title(embedding[0])
    fig.tight_layout()
    plt.show()
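

# Illustrative sketch of the input format plot_embeddings expects: a list of
# (title, layer) pairs whose layers expose get_weights(). A tiny stand-in class
# is used here so the sketch does not depend on a trained Keras model; the
# random weight matrices are purely for demonstration.
class _FakeEmbeddingLayer(object):
    def __init__(self, weights):
        self._weights = weights

    def get_weights(self):
        return [self._weights]


def _example_plot_embeddings():
    rng = np.random.RandomState(0)
    plot_embeddings([
        ('Embedding A', _FakeEmbeddingLayer(rng.normal(size=(52, 10)))),
        ('Embedding B', _FakeEmbeddingLayer(rng.normal(size=(40, 10)))),
    ])
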

def export_answers(model, competition_test, filename='answers.csv'):
    """
    Export the predictions for the test dataset to be
    submitted as the answer for the competition.
    """
    from code.training import process_features
    predictions = model.predict(process_features(competition_test))
    # Write through a context manager so the file is properly flushed and closed
    with open(filename, 'w') as answer_file:
        answer_csv = csv.writer(answer_file, quoting=csv.QUOTE_NONNUMERIC)
        answer_csv.writerow(['TRIP_ID', 'LATITUDE', 'LONGITUDE'])
        for index, (latitude, longitude) in enumerate(predictions):
            answer_csv.writerow([competition_test['TRIP_ID'][index], latitude, longitude])