-
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
Copy pathclustering.py
267 lines (231 loc) · 11.5 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
"""Extension template for clusterers.
Purpose of this implementation template:
quick implementation of new estimators following the template
NOT a concrete class to import! This is NOT a base class or concrete class!
This is to be used as a "fill-in" coding template.
How to use this implementation template to implement a new estimator:
- make a copy of the template in a suitable location, give it a descriptive name.
- work through all the "todo" comments below
- fill in code for mandatory methods, and optionally for optional methods
- do not write to reserved variables: is_fitted, _is_fitted, fit_time_,
_class_dictionary, _threads_to_use, n_clusters, _tags, _tags_dynamic
- you can add more private methods, but do not override BaseEstimator's private methods
an easy way to be safe is to prefix your methods with "_custom"
- change docstrings for functions and the file
- ensure interface compatibility by testing clustering/tests
- once complete: use as a local library, or contribute to sktime via PR
- more details:
https://www.sktime.net/en/stable/developer_guide/add_estimators.html
Mandatory implements:
fitting - _fit(self, X)
Optional implements:
cluster assignment - _predict(self, X)
fitted parameter inspection - _get_fitted_params()
Testing - required for sktime test framework and check_estimator usage:
get default parameters for test instance(s) - get_test_params()
copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""
# todo: write an informative docstring for the file or module, remove the above
# todo: add an appropriate copyright notice for your estimator
# estimators contributed to sktime should have the copyright notice at the top
# estimators of your own do not need to have permissive or BSD-3 copyright
# todo: uncomment the following line, enter authors' GitHub IDs
# __author__ = [authorGitHubID, anotherAuthorGitHubID]
from sktime.clustering import BaseClusterer
# todo: add any necessary imports here
# todo: for imports of sktime soft dependencies:
# make sure to fill in the "python_dependencies" tag with the package import name
# import soft dependencies only inside methods of the class, not at the top of the file
# todo: change class name and write docstring
class MyClusterer(BaseClusterer):
    """Custom clusterer. todo: write docstring.

    todo: describe your custom clusterer here

    Hyper-parameters
    ----------------
    parama : int
        descriptive explanation of parama
    paramb : string, optional (default='default')
        descriptive explanation of paramb
    paramc : boolean, optional (default= whether paramb is not the default)
        descriptive explanation of paramc
    and so on

    Components
    ----------
    est : sktime.estimator, BaseEstimator descendant
        descriptive explanation of est
    est2 : another estimator, optional (default=None)
        descriptive explanation of est2
    and so on
    """

    # optional todo: override base class estimator default tags here if necessary
    # these are the default values, only add if different to these.
    _tags = {
        # packaging info
        # --------------
        "authors": ["author1", "author2"],  # authors, GitHub handles
        "maintainers": ["maintainer1", "maintainer2"],  # maintainers, GitHub handles
        # author = significant contribution to code at some point
        #     if interfacing a 3rd party estimator, ensure to give credit to the
        #     authors of the interfaced estimator
        # maintainer = algorithm maintainer role, "owner" of the sktime class
        #     for 3rd party interfaces, the scope is the sktime class only
        # specify one or multiple authors and maintainers, only for sktime contribution
        # remove maintainer tag if maintained by sktime core team
        #
        "python_version": None,  # PEP 440 python version specifier to limit versions
        "python_dependencies": None,  # PEP 440 python dependencies specifier,
        # e.g., "numba>0.53", or a list, e.g., ["numba>0.53", "numpy>=1.19.0"]
        # delete if no python dependencies or version limitations
        #
        # estimator tags
        # --------------
        "X_inner_mtype": "numpy3D",  # which type do _fit/_predict accept, usually
        # this is one of "numpy3D" (instance, variable, time point),
        # "pd-multiindex" (row index: instance, time; column index: variable) or other
        # machine types, see datatypes/panel/_registry.py for options.
        "capability:multivariate": False,
        "capability:unequal_length": False,
        "capability:missing_values": False,
        "capability:multithreading": False,
        "capability:predict": True,  # implements _predict for cluster assignment?
        "capability:predict_proba": False,  # implements non-default _predict_proba?
        "capability:out_of_sample": True,  # implements _predict for new data?
    }

    # todo: add any hyper-parameters and components to constructor
    def __init__(self, est, parama, est2=None, paramb="default", paramc=None):
        # estimators should precede parameters
        #  if estimators have default values, set None and initialize below

        # todo: write any hyper-parameters and components to self
        # every constructor argument is written to self under its own name,
        # unchanged - this includes components with a None default such as est2
        self.est = est
        self.est2 = est2
        self.parama = parama
        self.paramb = paramb
        self.paramc = paramc
        # IMPORTANT: the self.params should never be overwritten or mutated from now on
        # for handling defaults etc, write to other attributes, e.g., self._parama

        # leave this as is
        super().__init__()

        # todo: optional, parameter checking logic (if applicable) should happen here
        # if writes derived values to self, should *not* overwrite self.parama etc
        # instead, write to self._parama, self._newparam (starting with _)

        # todo: default estimators should have None arg defaults
        #  and be initialized here
        #  do this only with default estimators, not with parameters
        #  the resolved component goes to a private attribute, self.est2 stays as is
        # if est2 is None:
        #     self._est2 = MyDefaultEstimator()
        # else:
        #     self._est2 = est2

    # todo: implement this abstract class, mandatory
    def _fit(self, X):
        """Fit time series clusterer to training data.

        Parameters
        ----------
        X : Data to cluster, of type self.get_tag("X_inner_mtype")

        Returns
        -------
        self:
            Fitted estimator.
        """
        # implement here
        # IMPORTANT: avoid side effects to X

    # todo: implement this, mandatory
    # at least one of _predict and _get_fitted_params should be implemented
    def _predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : data to cluster based on model formed in _fit, of type self.get_tag(
            "X_inner_mtype")

        Returns
        -------
        np.ndarray (1d array of shape (n_instances,))
            Index of the cluster each time series in X belongs to.
        """
        # implement here
        # IMPORTANT: avoid side effects to X

    # todo: consider implementing this, optional
    # implement only if different from default:
    #   default retrieves all self attributes ending in "_"
    #   and returns them with keys that have the "_" removed
    # if not implementing, delete the method
    #   avoid overriding get_fitted_params
    # this is typically important for clustering
    # at least one of _predict and _get_fitted_params should be functional
    def _get_fitted_params(self):
        """Get fitted parameters.

        private _get_fitted_params, called from get_fitted_params

        State required:
            Requires state to be "fitted".

        Returns
        -------
        fitted_params : dict with str keys
            fitted parameters, keyed by names of fitted parameter
        """
        # implement here
        #
        # when this function is reached, it is already guaranteed that self is fitted
        #   this does not need to be checked separately
        #
        # parameters of components should follow the sklearn convention:
        #   separate component name from parameter name by double-underscore
        #   e.g., componentname__paramname

    # todo: return default parameters, so that a test instance can be created
    #   required for automated unit and integration testing of estimator
    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.
            There are currently no reserved values for clusterers.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`
        """
        # todo: set the testing parameters for the estimators
        # Testing parameters can be dictionary or list of dictionaries
        # Testing parameter choice should cover internal cases well.
        #
        # this method can, if required, use:
        #   class properties (e.g., inherited); parent class test case
        #   imported objects such as estimators from sktime or sklearn
        # important: all such imports should be *inside get_test_params*, not at the top
        #            since imports are used only at testing time
        #
        # The parameter_set argument is not used for automated, module level tests.
        #   It can be used in custom, estimator specific tests, for "special" settings.
        # A parameter dictionary must be returned *for all values* of parameter_set,
        #   i.e., "parameter_set not available" errors should never be raised.
        #
        # A good parameter set should primarily satisfy two criteria,
        #   1. Chosen set of parameters should have a low testing time,
        #      ideally in the magnitude of few seconds for the entire test suite.
        #       This is vital for the cases where default values result in
        #       "big" models which not only increases test time but also
        #       run into the risk of test workers crashing.
        #   2. There should be a minimum two such parameter sets with different
        #      sets of values to ensure a wide range of code coverage is provided.
        #
        # example 1: specify params as dictionary
        # any number of params can be specified
        # params = {"est": value0, "parama": value1, "paramb": value2}
        # return params
        #
        # example 2: specify params as list of dictionary
        # note: Only first dictionary will be used by create_test_instance
        # params = [{"est": value1, "parama": value2},
        #           {"est": value3, "parama": value4}]
        # return params
        #
        # example 3: parameter set depending on param_set value
        #   note: only needed if a separate parameter set is needed in tests
        # if parameter_set == "special_param_set":
        #     params = {"est": value1, "parama": value2}
        #     return params
        #
        # # "default" params - always returned except for "special_param_set" value
        # params = {"est": value3, "parama": value4}
        # return params