-
Notifications
You must be signed in to change notification settings - Fork 38
/
algorithms.go
323 lines (268 loc) · 9.3 KB
/
algorithms.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
package anomalyzer
import (
"fmt"
"github.com/drewlanenga/govector"
"math"
)
type Algorithm func(govector.Vector, AnomalyzerConf) float64
var (
Algorithms = map[string]Algorithm{
"magnitude": MagnitudeTest,
"diff": DiffTest,
"highrank": RankTest,
"lowrank": ReverseRankTest,
"cdf": CDFTest,
"fence": FenceTest,
"ks": BootstrapKsTest,
}
)
// Identity function
func identity(anything interface{}) interface{} {
return anything
}
// Returns a value within a given window (xmin and xmax).
func cap(x, min, max float64) float64 {
return math.Max(math.Min(x, max), min)
}
// Returns a contant
func constant(x float64) float64 {
return 0.2
}
// Return integer math comparisons
func max(x, y int) int {
if x > y {
return x
}
return y
}
func min(x, y int) int {
if y < x {
return y
}
return x
}
// Return a vector slice for the active window and reference window.
// Some tests require different minimum thresholds for sizes of reference windows.
// This can be specified in the minRefSize parameter. If size isn't important, use -1
func extractWindows(vector govector.Vector, refSize, activeSize, minRefSize int) (govector.Vector, govector.Vector, error) {
n := len(vector)
activeSize = min(activeSize, n)
refSize = min(refSize, n-activeSize)
// make sure the reference size is at least as big as the active size
// note that this penalty might be overly severe for some tests
if refSize < minRefSize {
return nil, nil, fmt.Errorf("Reference size must be at least as big as active size")
}
// return reference and active windows
return vector[n-activeSize-refSize : n-activeSize], vector[n-activeSize:], nil
}
// This function can be used to test whether or not data is getting close to a
// specified upper or lower bound.
func FenceTest(vector govector.Vector, conf AnomalyzerConf) float64 {
// we don't really care about a reference window for this one
_, active, _ := extractWindows(vector, conf.referenceSize, conf.ActiveSize, -1)
x := active.Mean()
distance := 0.0
if conf.LowerBound == NA {
// we only care about distance from the upper bound
distance = x / conf.UpperBound
} else {
// we care about both bounds, so measure distance
// from midpoint
bound := (conf.UpperBound - conf.LowerBound) / 2
mid := conf.LowerBound + bound
distance = (math.Abs(x - mid)) / bound
}
return weightExp(cap(distance, 0, 1), 10)
}
// This is a function will sharply scale values between 0 and 1 such that
// smaller values are weighted more towards 0. A larger base value means a
// more horshoe type function.
func weightExp(x, base float64) float64 {
return (math.Pow(base, x) - 1) / (math.Pow(base, 1) - 1)
}
// Generates permutations of reference and active window values to determine
// whether or not data is anomalous. The number of permutations desired has
// been set to 500 but can be increased for more precision.
func DiffTest(vector govector.Vector, conf AnomalyzerConf) float64 {
// Find the differences between neighboring elements and rank those differences.
ranks := vector.RelDiff().Apply(math.Abs).Rank()
// The indexing runs to length-1 because after applying .Diff(), We have
// decreased the length of out vector by 1.
_, active, err := extractWindows(ranks, conf.referenceSize-1, conf.ActiveSize, conf.ActiveSize)
if err != nil {
return NA
}
// Consider the sum of the ranks across the active data. This is the sum that
// we will compare our permutations to.
activeSum := active.Sum()
i := 0
significant := 0
// Permute the active and reference data and compute the sums across the tail
// (from the length of the reference data to the full length).
for i < conf.PermCount {
permRanks := vector.Shuffle().RelDiff().Apply(math.Abs).Rank()
_, permActive, _ := extractWindows(permRanks, conf.referenceSize-1, conf.ActiveSize, conf.ActiveSize)
// If we find a sum that is less than the initial sum across the active data,
// this implies our initial sum might be uncharacteristically high. We increment
// our count.
if permActive.Sum() < activeSum {
significant++
}
i++
}
// We return the percentage of the number of iterations where we found our initial
// sum to be high.
return float64(significant) / float64(conf.PermCount)
}
func RankTest(vector govector.Vector, conf AnomalyzerConf) float64 {
return rankTest(vector, conf, lessThan)
}
func ReverseRankTest(vector govector.Vector, conf AnomalyzerConf) float64 {
return rankTest(vector, conf, greaterThan)
}
type compare func(x, y float64) bool
func greaterThan(x, y float64) bool {
if x > y {
return true
}
return false
}
func lessThan(x, y float64) bool {
if x < y {
return true
}
return false
}
// Generates permutations of reference and active window values to determine
// whether or not data is anomalous. The number of permutations desired defaults
// to 500 but can be increased for more precision. A comparison function above
// can be specified to create Rank and ReverseRank tests.
func rankTest(vector govector.Vector, conf AnomalyzerConf, comparison compare) float64 {
// Rank the elements of a vector
ranks := vector.Rank()
_, active, err := extractWindows(ranks, conf.referenceSize, conf.ActiveSize, conf.ActiveSize)
if err != nil {
return NA
}
// Consider the sum of the ranks across the active data. This is the sum that
// we will compare our permutations to.
activeSum := active.Sum()
i := 0
significant := 0
// Permute the active and reference data and compute the sums across the tail
// (from the length of the reference data to the full length).
for i < conf.PermCount {
permRanks := vector.Shuffle().Rank()
_, permActive, _ := extractWindows(permRanks, conf.referenceSize, conf.ActiveSize, conf.ActiveSize)
// If we find a sum that is less than the initial sum across the active data,
// this implies our initial sum might be uncharacteristically high. We increment
// our count.
permSum := permActive.Sum()
if comparison(permSum, activeSum) {
significant++
}
i++
}
// We return the percentage of the number of iterations where we found our initial
// sum to be high.
return float64(significant) / float64(conf.PermCount)
}
// Generates the cumulative distribution function using the difference in the means
// for the data.
func CDFTest(vector govector.Vector, conf AnomalyzerConf) float64 {
diffs := vector.Diff().Apply(math.Abs)
reference, active, err := extractWindows(diffs, conf.referenceSize-1, conf.ActiveSize, conf.ActiveSize)
if err != nil {
return NA
}
// Find the empircal distribution function using the reference window.
refEcdf := reference.Ecdf()
// Difference between the active and reference means.
activeDiff := active.Mean() - reference.Mean()
// Apply the empirical distribution function to that difference.
percentile := refEcdf(activeDiff)
// Scale so max probability is in tails and prob at 0.5 is 0.
return (2 * math.Abs(0.5-percentile))
}
// Generates the percent difference between the means of the reference and active
// data. Returns a value scaled such that it lies between 0 and 1.
func MagnitudeTest(vector govector.Vector, conf AnomalyzerConf) float64 {
reference, active, err := extractWindows(vector, conf.referenceSize, conf.ActiveSize, 1)
if err != nil {
return NA
}
activeMean := active.Mean()
refMean := reference.Mean()
// If the baseline is 0, then the magnitude should be Inf, but we'll
// round to 1.
if refMean == 0 {
if activeMean == 0 {
return 0
} else {
return 1
}
}
pdiff := math.Abs(activeMean-refMean) / refMean
//return weightExp(pdiff, 10)
return pdiff
}
// Calculate a Kolmogorov-Smirnov test statistic.
func KsStat(vector govector.Vector, conf AnomalyzerConf) float64 {
reference, active, err := extractWindows(vector, conf.referenceSize, conf.ActiveSize, conf.ActiveSize)
if err != nil {
return NA
}
n1 := len(reference)
n2 := len(active)
if n1%n2 != 0 {
return NA
}
// First sort the active data and generate a cummulative distribution function
// using that data. Do the same for the reference data.
activeEcdf := active.Ecdf()
refEcdf := reference.Ecdf()
// We want the reference and active vectors to have the same length n, so we
// consider the min and max for each and interpolated the points between.
min := math.Min(reference.Min(), active.Min())
max := math.Max(reference.Max(), active.Max())
interpolated := interpolate(min, max, n1+n2)
// Then we apply the distribution function over the interpolated data.
activeDist := interpolated.Apply(activeEcdf)
refDist := interpolated.Apply(refEcdf)
// Find the maximum displacement between both distributions.
d := 0.0
for i := 0; i < n1+n2; i++ {
d = math.Max(d, math.Abs(activeDist[i]-refDist[i]))
}
return d
}
func BootstrapKsTest(vector govector.Vector, conf AnomalyzerConf) float64 {
dist := KsStat(vector, conf)
if dist == NA {
return NA
}
i := 0
significant := 0
for i < conf.PermCount {
permVector := vector.Shuffle()
permDist := KsStat(permVector, conf)
if permDist < dist {
significant++
}
i++
}
return float64(significant) / float64(conf.PermCount)
}
// A helper function for KS that rescales a vector to the desired length npoints.
func interpolate(min, max float64, npoints int) govector.Vector {
interp := make(govector.Vector, npoints)
step := (max - min) / (float64(npoints) - 1)
interp[0] = min
i := 1
for i < npoints {
interp[i] = interp[i-1] + step
i++
}
return interp
}