Skip to content

Commit

Permalink
Implement kNN imputation
Browse files
  • Loading branch information
appleparan committed Mar 12, 2020
1 parent 4546b9a commit 28e8203
Show file tree
Hide file tree
Showing 5 changed files with 260 additions and 4 deletions.
6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ authors = ["Invenia Technical Computing"]
version = "0.4.0"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand All @@ -26,7 +29,8 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Query = "1a8c2f83-1ff3-5112-b086-8aa67b057ba1"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["AxisArrays", "Combinatorics", "DataFrames", "Dates", "Distances", "RDatasets", "Test"]
test = ["AxisArrays", "Combinatorics", "DataFrames", "Dates", "Distances", "RDatasets", "Query", "Test"]
4 changes: 4 additions & 0 deletions src/Impute.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
module Impute

using Distances
using IterTools
using Missings
using NearestNeighbors
using Random
using Statistics
using StatsBase
Expand Down Expand Up @@ -67,6 +70,7 @@ const global imputation_methods = (
nocb = NOCB,
srs = SRS,
svd = SVD,
knn = KNN,
)

include("deprecated.jl")
Expand Down
2 changes: 1 addition & 1 deletion src/imputors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,6 @@ function impute!(table, imp::Imputor)
return table
end

for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl", "svd.jl")
for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl", "svd.jl", "knn.jl")
include(joinpath("imputors", file))
end
86 changes: 86 additions & 0 deletions src/imputors/knn.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
    KNN <: Imputor

Imputation using the k-Nearest Neighbors algorithm.

# Keyword Arguments
* `num_nn::Int`: number of nearest neighbors to average over (must be ≥ 1)
* `dist::MinkowskiMetric`: distance metric supported by `NearestNeighbors.jl`
  (Euclidean, Chebyshev, Minkowski and Cityblock)
* `missing_neighbors_threshold::AbstractFloat`: fraction of neighbors allowed to
  also be missing before the imputation falls back to the mean fill
* `context::AbstractContext`: imputation context

# Reference
* Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525.
"""
# TODO : Support Categorical Distance (NearestNeighbors.jl support needed)
struct KNN{M<:NearestNeighbors.MinkowskiMetric} <: Imputor
    num_nn::Int
    dist::M
    missing_neighbors_threshold::AbstractFloat
    context::AbstractContext
end

function KNN(; num_nn=1, missing_neighbors_threshold=0.5,
               dist=Euclidean(), context=Context())
    num_nn >= 1 || throw(ArgumentError("The number of nearest neighbors should be greater than 0"))

    # Store `num_nn + 1` because a point is always its own nearest neighbor
    # (at distance zero) and must be excluded during imputation.
    KNN(num_nn + 1, dist, missing_neighbors_threshold, context)
end

"""
    impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::KNN) where T<:Real

Impute missing entries of `data` in place using inverse-distance-weighted
k-nearest-neighbor averaging. Entries whose neighborhoods are degenerate
(zero/duplicate distances) or contain too many missing neighbors keep a
mean-filled value instead.
"""
function impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::KNN) where T<:Real

    imp.context() do ctx
        # Transpose to D x N because KDTree treats columns as points.
        transposed = float.(collect(transpose(data)))

        # Record which entries were originally missing before any filling.
        mmask = ismissing.(transposed)

        # Seed missing values with the column mean so a KDTree can be built.
        impute!(transposed, Fill(; value=mean, context=ctx))

        # disallow missing to compute kdtree
        transposed = disallowmissing(transposed)

        kdtree = KDTree(transposed, imp.dist)
        # `true` sorts neighbors by distance; the nearest is the point itself.
        idxs, dists = NearestNeighbors.knn(kdtree, transposed, imp.num_nn, true)

        for I in CartesianIndices(transposed)
            # Only re-impute entries that were originally missing.
            mmask[I] || continue

            # Inverse-distance weights. Index 1 is always the query point
            # itself (distance zero -> infinite weight), so it is excluded
            # from every sum below.
            w = 1.0 ./ dists[I[2]]
            wsum = sum(w[2:end])

            # Neighbors (excluding the point itself) that were originally
            # missing in any dimension. NOTE(review): the original computed
            # this from `transposed` after `disallowmissing`, which is never
            # missing, so the fallback below was dead code; `mmask` reflects
            # the pre-fill state.
            missing_neighbors = mmask[:, idxs[I[2]][2:end]]

            if isnan(wsum) || isinf(wsum) || wsum == 0.0
                # Degenerate distances (e.g. duplicate points): keep the mean fill.
                continue
            elseif count(!=(0), mapslices(sum, missing_neighbors, dims=[1])) >
                   imp.num_nn * imp.missing_neighbors_threshold
                # Too many neighbors were themselves missing: keep the mean fill.
                continue
            else
                # Inverse distance weighting over the true neighbors.
                wsummed = w .* transposed[I[1], idxs[I[2]]]
                transposed[I] = sum(wsummed[2:end]) / wsum
            end
        end

        # Write the result back in place: the original rebound the local
        # `data`, leaving the caller's matrix unmodified despite the `!`.
        data .= transposed'

        data
    end
end
166 changes: 164 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@

using AxisArrays
using Combinatorics
using DataFrames
using Dates
using Distances
using LinearAlgebra
using Query
using RDatasets
using Random
using Statistics
Expand All @@ -30,7 +30,6 @@ using Impute:
interp,
chain


function add_missings(X, ratio=0.1)
result = Matrix{Union{Float64, Missing}}(X)

Expand All @@ -41,6 +40,17 @@ function add_missings(X, ratio=0.1)
return result
end

# Copy `X` into a missing-capable matrix and knock out one random entry in
# each of the first `floor(ncols * ratio)` columns (at most one per column).
function add_missings_single(X, ratio=0.1)
    out = Matrix{Union{Float64, Missing}}(X)

    nrows = size(X, 1)
    ncols_hit = floor(Int, size(X, 2) * ratio)
    for j in 1:ncols_hit
        out[rand(1:nrows), j] = missing
    end

    return out
end

@testset "Impute" begin
# Defining our missing datasets
a = allowmissing(1.0:1.0:20.0)
Expand Down Expand Up @@ -531,6 +541,158 @@ end
end
end

@testset "KNN" begin
    @testset "Iris" begin
        # Reference
        # P. Schimitt, et. al
        # A comparison of six methods for missing data imputation
        iris = dataset("datasets", "iris")
        iris2 = @from i in iris begin
            @where i.Species == "versicolor" || i.Species == "virginica"
            @select {i.SepalLength, i.SepalWidth, i.PetalLength, i.PetalWidth}
            @collect DataFrame
        end
        data = Array(iris2)
        num_tests = 100

        # kNN imputation should beat plain mean imputation across a range of
        # missing-data ratios (was three identical copy-pasted testsets).
        @testset "Iris - $ratio" for ratio in (0.15, 0.25, 0.35)
            X = add_missings(data', ratio)

            knn_nrmsd, mean_nrmsd = 0.0, 0.0

            for i = 1:num_tests
                knn_imputed = impute(copy(X), Impute.KNN(; num_nn=1, missing_neighbors_threshold=0.5,
                                                           dist=Euclidean(), context=Context(; limit=1.0)))
                mean_imputed = impute(copy(X), :fill; limit=1.0)

                # Running average of the error over repetitions.
                knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i
                mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i
                if ismissing(mean_nrmsd)
                    @show mean_imputed
                end
            end

            @test knn_nrmsd < mean_nrmsd
        end
    end

    # Test a case where we expect kNN to perform well (e.g., many correlated variables)
    @testset "Data match" begin
        data = mapreduce(hcat, 1:1000) do i
            seeds = [sin(i), cos(i), tan(i), atan(i)]
            mapreduce(vcat, combinations(seeds)) do args
                [
                    +(args...),
                    *(args...),
                    +(args...) * 100,
                    +(abs.(args)...),
                    (+(args...) * 10) ^ 2,
                    (+(abs.(args)...) * 10) ^ 2,
                    log(+(abs.(args)...) * 100),
                    +(args...) * 100 + rand(-10:0.1:10),
                ]
            end
        end

        X = add_missings(data')
        num_tests = 100

        knn_nrmsd, mean_nrmsd = 0.0, 0.0

        for i = 1:num_tests
            knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5,
                                                       dist=Euclidean(), context=Context(; limit=1.0)))
            mean_imputed = impute(copy(X), :fill; limit=1.0)

            knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i
            mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i
        end

        @test knn_nrmsd < mean_nrmsd
    end

    # Test a case with only a few variables
    @testset "Data with few variables" begin
        data = Matrix(dataset("Ecdat", "Electricity"))
        X = add_missings(data)
        num_tests = 100

        knn_nrmsd, mean_nrmsd = 0.0, 0.0

        for i = 1:num_tests
            knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5,
                                                       dist=Euclidean(), context=Context(; limit=1.0)))
            mean_imputed = impute(copy(X), :fill; limit=1.0)

            # NOTE(review): `X` comes from the untransposed `data`, so errors
            # must be measured against `data`, not `data'` (the original
            # `nrmsd(data', ...)` is a dimension mismatch for non-square data).
            knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data, knn_imputed)) / i
            mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data, mean_imputed)) / i
        end

        @test knn_nrmsd < mean_nrmsd
    end

    @testset "Data with random values" begin
        M = rand(100, 200)
        # M * M' is symmetric, so `data` and `data'` are interchangeable here.
        data = M * M'
        X = add_missings(data)
        num_tests = 100

        knn_nrmsd, mean_nrmsd = 0.0, 0.0

        for i = 1:num_tests
            knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5,
                                                       dist=Euclidean(), context=Context(; limit=1.0)))
            mean_imputed = impute(copy(X), :fill; limit=1.0)

            knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data, knn_imputed)) / i
            mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data, mean_imputed)) / i
        end

        @test knn_nrmsd < mean_nrmsd
    end
end

include("deprecated.jl")
include("testutils.jl")

Expand Down

0 comments on commit 28e8203

Please sign in to comment.