stdbscan.py

from datetime import timedelta, datetime

import pandas as pd
from geopy.distance import great_circle


def st_dbscan(df, spatial_threshold, temporal_threshold, min_neighbors):
    """
    Python ST-DBSCAN implementation.

    INPUTS:
        df = {o1,o2,...,on} Set of objects (a DataFrame with DATATIME,
            LATITUDE and LONGITUDE columns)
        spatial_threshold = Maximum geographical (spatial) distance between
            two points, in meters
        temporal_threshold = Maximum non-spatial (temporal) distance between
            two points, in minutes
        min_neighbors = Minimum number of points within the Eps1 (spatial)
            and Eps2 (temporal) thresholds for a point to be a core point
    OUTPUT:
        C = {c1,c2,...,ck} Set of clusters (the input DataFrame with an added
            'cluster' column; -1 marks noise)
    """
    cluster_label = 0
    noise = -1
    unmarked = 777777
    stack = []

    # initialize each point as unmarked
    df['cluster'] = unmarked

    # for each point in the database
    for index, point in df.iterrows():
        if df.at[index, 'cluster'] == unmarked:
            neighborhood = retrieve_neighbors(index, df, spatial_threshold,
                                              temporal_threshold)

            if len(neighborhood) < min_neighbors:
                df.at[index, 'cluster'] = noise
            else:  # found a core point
                cluster_label += 1
                # assign a label to the core point
                df.at[index, 'cluster'] = cluster_label

                # assign the core's label to its neighborhood
                for neig_index in neighborhood:
                    df.at[neig_index, 'cluster'] = cluster_label
                    stack.append(neig_index)  # append neighborhood to stack

                # find new neighbors from the core point's neighborhood
                while len(stack) > 0:
                    current_point_index = stack.pop()
                    new_neighborhood = retrieve_neighbors(
                        current_point_index, df, spatial_threshold,
                        temporal_threshold)

                    # current_point is a new core point
                    if len(new_neighborhood) >= min_neighbors:
                        for neig_index in new_neighborhood:
                            neig_cluster = df.at[neig_index, 'cluster']
                            # only unmarked points are absorbed; points
                            # already labelled as noise keep their label
                            if neig_cluster == unmarked:
                                # TODO: verify cluster average
                                # before adding a new point
                                df.at[neig_index, 'cluster'] = cluster_label
                                stack.append(neig_index)

    return df
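

# Minimal usage sketch (illustrative only): st_dbscan() expects a DataFrame
# with DATATIME, LATITUDE and LONGITUDE columns.  The synthetic points and
# thresholds below are made-up assumptions, not values from data/ms.csv:
#
#   points = pd.DataFrame({
#       'DATATIME': [datetime(2017, 1, 2, 13, 34, s) for s in range(5)],
#       'LATITUDE': [-8.0500, -8.0501, -8.0502, -8.0503, -9.0000],
#       'LONGITUDE': [-34.9000, -34.9001, -34.9002, -34.9003, -35.0000],
#   })
#   labelled = st_dbscan(points, spatial_threshold=500,   # meters
#                        temporal_threshold=60,           # minutes
#                        min_neighbors=2)
#   print(labelled['cluster'])  # the first four points form cluster 1,
#                               # the distant fifth point is noise (-1)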


def retrieve_neighbors(index_center, df, spatial_threshold, temporal_threshold):
    """
    Return the indices of the points that lie within spatial_threshold
    meters and temporal_threshold minutes of the point at index_center
    (the center point itself is excluded).
    """
    neighborhood = []
    center_point = df.loc[index_center]

    # filter by time
    min_time = center_point['DATATIME'] - timedelta(minutes=temporal_threshold)
    max_time = center_point['DATATIME'] + timedelta(minutes=temporal_threshold)
    df = df[(df['DATATIME'] >= min_time) & (df['DATATIME'] <= max_time)]

    # filter by distance
    for index, point in df.iterrows():
        if index != index_center:
            distance = great_circle(
                (center_point['LATITUDE'], center_point['LONGITUDE']),
                (point['LATITUDE'], point['LONGITUDE'])).meters
            if distance <= spatial_threshold:
                neighborhood.append(index)

    return neighborhood


def parse_dates(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')


def main():
    filename = 'data/ms.csv'
    names = ['id', 'DATATIME', 'LATITUDE', 'LONGITUDE']
    df = pd.read_csv(filename, sep=",", skiprows=[0], names=names,
                     converters={'DATATIME': parse_dates})
    # print(df)
    # print(df[df['DATATIME'].dt.day == 1])

    # keep only the records from day 2 at 13:34
    msss = df[(df['DATATIME'].dt.hour == 13) &
              (df['DATATIME'].dt.day == 2) &
              (df['DATATIME'].dt.minute == 34)]
    print(msss)
    print("Len:{}\n---".format(len(msss)))

    # ST-DBSCAN parameters
    spatial_threshold = 500   # meters
    temporal_threshold = 60   # minutes
    minPts = 20

    result_df = st_dbscan(msss, spatial_threshold,
                          temporal_threshold, minPts)
    print("Finished")
    print(result_df)

    # import time
    # timestr = time.strftime("%Y%m%d-%H%M%S")
    # result_df['cluster'].to_csv("result_{}_{}_{}_{}.csv".format(spatial_threshold,
    #                                                             temporal_threshold,
    #                                                             minPts,
    #                                                             timestr))


if __name__ == "__main__":
    main()