Skip to content

Commit

Permalink
Implement kNN imputation
Browse files Browse the repository at this point in the history
  • Loading branch information
appleparan committed Mar 12, 2020
1 parent 4546b9a commit c67e2fd
Show file tree
Hide file tree
Showing 5 changed files with 260 additions and 4 deletions.
6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ authors = ["Invenia Technical Computing"]
version = "0.4.0"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand All @@ -26,7 +29,8 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Query = "1a8c2f83-1ff3-5112-b086-8aa67b057ba1"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["AxisArrays", "Combinatorics", "DataFrames", "Dates", "Distances", "RDatasets", "Test"]
test = ["AxisArrays", "Combinatorics", "DataFrames", "Dates", "Distances", "RDatasets", "Query", "Test"]
4 changes: 4 additions & 0 deletions src/Impute.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
module Impute

using Distances
using IterTools
using Missings
using NearestNeighbors
using Random
using Statistics
using StatsBase
Expand Down Expand Up @@ -67,6 +70,7 @@ const global imputation_methods = (
nocb = NOCB,
srs = SRS,
svd = SVD,
knn = KNN,
)

include("deprecated.jl")
Expand Down
2 changes: 1 addition & 1 deletion src/imputors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,6 @@ function impute!(table, imp::Imputor)
return table
end

for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl", "svd.jl")
for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl", "svd.jl", "knn.jl")
include(joinpath("imputors", file))
end
86 changes: 86 additions & 0 deletions src/imputors/knn.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
KNN <: Imputor
Imputation using k-Nearest Neighbor algorithm.
# Keyword Arguments
* `num_nn::Int`: number of nearest neighbors
* `dist::MinkowskiMetric`: distance metric suppports by `NearestNeighbors.jl` (Euclidean, Chebyshev, Minkowski and Cityblock)
* `on_complete::Function`: a function to run when imputation is complete
# Reference
* Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525.
"""
# TODO : Support Categorical Distance (NearestNeighbors.jl support needed)
struct KNN{M} <: Imputor where M <: NearestNeighbors.MinkowskiMetric
num_nn::Int
dist::M
missing_neighbors_threshold::AbstractFloat
context::AbstractContext
end

function KNN(; num_nn=1, missing_neighbors_threshold=0.5,
dist=Euclidean(), context=Context())

if num_nn < 1
throw(ArgumentError("The number of nearset neighbors should be greater than 0"))
end

# to exclude missing value itself
KNN(num_nn + 1, dist, missing_neighbors_threshold, context)
end

function impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::KNN) where T<:Real

imp.context() do ctx
# transpose to D x N for KDTree
transposed = float.(collect(transpose(data)))

# Get our before and after views of our missing and non-missing data
mmask = ismissing.(transposed)
omask = .!mmask

mdata = transposed[mmask]
odata = transposed[omask]

# fill missing value as mean first
impute!(transposed, Fill(; value=mean, context=ctx))

# disallow missing to compute kdtree
transposed = disallowmissing(transposed)

kdtree = KDTree(transposed, imp.dist)
idxs, dists = NearestNeighbors.knn(kdtree, transposed, imp.num_nn, true)

idxes = CartesianIndices(transposed)

# iterate all value
for I in CartesianIndices(transposed)
if mmask[I] == 1
w = 1.0 ./ dists[I[2]]
#@show idxs[I[2]][2], dists[I[2]][2], sum(w[2:end])

missing_neighbors = ismissing.(transposed[:, idxs[I[2]]][:, 2:end])
# exclude missing value itself because distance would be zero
if isnan(sum(w[2:end])) || isinf(sum(w[2:end])) || sum(w[2:end]) == 0.0
# if distance is zero, keep mean imputation
transposed[I] = transposed[I]
elseif count(x -> !=(0, x), mapslices(sum, missing_neighbors, dims=[1])) >
imp.num_nn * imp.missing_neighbors_threshold
# If too many neighbors are also missing, fallback to mean imputation
# get column and check how many neighbors are also missing
transposed[I] = transposed[I]
else
# Inverse distance weighting
# idxs = num_points x num_nearestpoints
wsum = w .* transposed[I[1], idxs[I[2]]]
transposed[I] = sum(wsum[2:end]) / sum(w[2:end])
end
end
end

data = transposed'

data
end
end
166 changes: 164 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@

using AxisArrays
using Combinatorics
using DataFrames
using Dates
using Distances
using LinearAlgebra
using Query
using RDatasets
using Random
using Statistics
Expand All @@ -30,7 +30,6 @@ using Impute:
interp,
chain


function add_missings(X, ratio=0.1)
result = Matrix{Union{Float64, Missing}}(X)

Expand All @@ -41,6 +40,17 @@ function add_missings(X, ratio=0.1)
return result
end

function add_missings_single(X, ratio=0.1)
result = Matrix{Union{Float64, Missing}}(X)

randcols = 1:floor(Int, size(X, 2) * ratio)
for col in randcols
result[rand(1:size(X, 1)), col] = missing
end

return result
end

@testset "Impute" begin
# Defining our missing datasets
a = allowmissing(1.0:1.0:20.0)
Expand Down Expand Up @@ -531,6 +541,158 @@ end
end
end

@testset "KNN" begin
@testset "Iris" begin
# Reference
# P. Schimitt, et. al
# A comparison of six methods for missing data imputation
iris = dataset("datasets", "iris")
iris2 = @from i in iris begin
@where i.Species == "versicolor" || i.Species == "virginica"
@select {i.SepalLength, i.SepalWidth, i.PetalLength, i.PetalWidth}
@collect DataFrame
end
data = Array(iris2)
num_tests = 100

@testset "Iris - 0.15" begin
X = add_missings(data', 0.15)

knn_nrmsd, mean_nrmsd = 0.0, 0.0

for i = 1:num_tests
knn_imputed = impute(copy(X), Impute.KNN(; num_nn=1, missing_neighbors_threshold=0.5,
dist=Euclidean(), context=Context(; limit=1.0)))
mean_imputed = impute(copy(X), :fill; limit=1.0)

knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i
mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i
if ismissing(mean_nrmsd)
@show mean_imputed
end
end

@test knn_nrmsd < mean_nrmsd
end

@testset "Iris - 0.25" begin
X = add_missings(data', 0.25)

knn_nrmsd, mean_nrmsd = 0.0, 0.0

for i = 1:num_tests
knn_imputed = impute(copy(X), Impute.KNN(; num_nn=1, missing_neighbors_threshold=0.5,
dist=Euclidean(), context=Context(; limit=1.0)))
mean_imputed = impute(copy(X), :fill; limit=1.0)

knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i
mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i
if ismissing(mean_nrmsd)
@show mean_imputed
end
end

@test knn_nrmsd < mean_nrmsd
end

@testset "Iris - 0.35" begin
X = add_missings(data', 0.35)

knn_nrmsd, mean_nrmsd = 0.0, 0.0

for i = 1:num_tests
knn_imputed = impute(copy(X), Impute.KNN(; num_nn=1, missing_neighbors_threshold=0.5,
dist=Euclidean(), context=Context(; limit=1.0)))
mean_imputed = impute(copy(X), :fill; limit=1.0)

knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i
mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i
if ismissing(mean_nrmsd)
@show mean_imputed
end
end

@test knn_nrmsd < mean_nrmsd
end
end

# Test a case where we expect kNN to perform well (e.g., many variables, )
@testset "Data match" begin
data = mapreduce(hcat, 1:1000) do i
seeds = [sin(i), cos(i), tan(i), atan(i)]
mapreduce(vcat, combinations(seeds)) do args
[
+(args...),
*(args...),
+(args...) * 100,
+(abs.(args)...),
(+(args...) * 10) ^ 2,
(+(abs.(args)...) * 10) ^ 2,
log(+(abs.(args)...) * 100),
+(args...) * 100 + rand(-10:0.1:10),
]
end
end

X = add_missings(data')
num_tests = 100

knn_nrmsd, mean_nrmsd = 0.0, 0.0

for i = 1:num_tests
knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5,
dist=Euclidean(), context=Context(; limit=1.0)))
mean_imputed = impute(copy(X), :fill; limit=1.0)

knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i
mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i
end

@test knn_nrmsd < mean_nrmsd
end

# Test a case with few variable
# (e.g., only a few variables, only )
@testset "Data with few variables" begin
data = Matrix(dataset("Ecdat", "Electricity"))
X = add_missings(data)
num_tests = 100

knn_nrmsd, mean_nrmsd = 0.0, 0.0

for i = 1:num_tests
knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5,
dist=Euclidean(), context=Context(; limit=1.0)))
mean_imputed = impute(copy(X), :fill; limit=1.0)

knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i
mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i
end

@test knn_nrmsd < mean_nrmsd
end

@testset "Data with random values" begin
M = rand(100, 200)
data = M * M'
X = add_missings(data)
num_tests = 100

knn_nrmsd, mean_nrmsd = 0.0, 0.0

for i = 1:num_tests
knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5,
dist=Euclidean(), context=Context(; limit=1.0)))
mean_imputed = impute(copy(X), :fill; limit=1.0)

knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i
mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i
end

@test knn_nrmsd < mean_nrmsd
end
end

include("deprecated.jl")
include("testutils.jl")

Expand Down

0 comments on commit c67e2fd

Please sign in to comment.