diff --git a/Project.toml b/Project.toml index 64a4d55..f9de25a 100644 --- a/Project.toml +++ b/Project.toml @@ -4,8 +4,11 @@ authors = ["Invenia Technical Computing"] version = "0.4.0" [deps] +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" @@ -26,7 +29,8 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" +Query = "1a8c2f83-1ff3-5112-b086-8aa67b057ba1" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["AxisArrays", "Combinatorics", "DataFrames", "Dates", "Distances", "RDatasets", "Test"] +test = ["AxisArrays", "Combinatorics", "DataFrames", "Dates", "Distances", "RDatasets", "Query", "Test"] diff --git a/src/Impute.jl b/src/Impute.jl index 7c21d8f..1ec1c8c 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -1,6 +1,9 @@ module Impute +using Distances using IterTools +using Missings +using NearestNeighbors using Random using Statistics using StatsBase @@ -67,6 +70,7 @@ const global imputation_methods = ( nocb = NOCB, srs = SRS, svd = SVD, + knn = KNN, ) include("deprecated.jl") diff --git a/src/imputors.jl b/src/imputors.jl index d67114c..cda9f06 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -175,6 +175,6 @@ function impute!(table, imp::Imputor) return table end -for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl", "svd.jl") +for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl", "svd.jl", "knn.jl") include(joinpath("imputors", file)) end diff --git 
a/src/imputors/knn.jl b/src/imputors/knn.jl new file mode 100644 index 0000000..3a01f3c --- /dev/null +++ b/src/imputors/knn.jl @@ -0,0 +1,86 @@ +""" + KNN <: Imputor + +Imputation using k-Nearest Neighbor algorithm. + +# Keyword Arguments +* `num_nn::Int`: number of nearest neighbors +* `dist::MinkowskiMetric`: distance metric supported by `NearestNeighbors.jl` (Euclidean, Chebyshev, Minkowski and Cityblock) +* `missing_neighbors_threshold::AbstractFloat`: fall back to mean imputation when this fraction of neighbors is also missing + +# Reference +* Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525. +""" +# TODO : Support Categorical Distance (NearestNeighbors.jl support needed) +struct KNN{M} <: Imputor where M <: NearestNeighbors.MinkowskiMetric + num_nn::Int + dist::M + missing_neighbors_threshold::AbstractFloat + context::AbstractContext +end + +function KNN(; num_nn=1, missing_neighbors_threshold=0.5, + dist=Euclidean(), context=Context()) + + if num_nn < 1 + throw(ArgumentError("The number of nearest neighbors should be greater than 0")) + end + + # to exclude missing value itself + KNN(num_nn + 1, dist, missing_neighbors_threshold, context) +end + +function impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::KNN) where T<:Real + + imp.context() do ctx + # transpose to D x N for KDTree + transposed = float.(collect(transpose(data))) + + # Get our before and after views of our missing and non-missing data + mmask = ismissing.(transposed) + omask = .!mmask + + mdata = transposed[mmask] + odata = transposed[omask] + + # fill missing value as mean first + impute!(transposed, Fill(; value=mean, context=ctx)) + + # disallow missing to compute kdtree + transposed = disallowmissing(transposed) + + kdtree = KDTree(transposed, imp.dist) + idxs, dists = NearestNeighbors.knn(kdtree, transposed, imp.num_nn, true) + + idxes = CartesianIndices(transposed) + + # iterate over all values + for I in CartesianIndices(transposed) + if mmask[I] == 1 + w = 1.0 ./ 
dists[I[2]] + #@show idxs[I[2]][2], dists[I[2]][2], sum(w[2:end]) + + missing_neighbors = ismissing.(transposed[:, idxs[I[2]]][:, 2:end]) + # exclude missing value itself because distance would be zero + if isnan(sum(w[2:end])) || isinf(sum(w[2:end])) || sum(w[2:end]) == 0.0 + # if distance is zero, keep mean imputation + transposed[I] = transposed[I] + elseif count(x -> !=(0, x), mapslices(sum, missing_neighbors, dims=[1])) > + imp.num_nn * imp.missing_neighbors_threshold + # If too many neighbors are also missing, fallback to mean imputation + # get column and check how many neighbors are also missing + transposed[I] = transposed[I] + else + # Inverse distance weighting + # idxs = num_points x num_nearestpoints + wsum = w .* transposed[I[1], idxs[I[2]]] + transposed[I] = sum(wsum[2:end]) / sum(w[2:end]) + end + end + end + + data = transposed' + + data + end +end diff --git a/test/runtests.jl b/test/runtests.jl index f625834..8a6859b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,10 +1,10 @@ - using AxisArrays using Combinatorics using DataFrames using Dates using Distances using LinearAlgebra +using Query using RDatasets using Random using Statistics @@ -30,7 +30,6 @@ using Impute: interp, chain - function add_missings(X, ratio=0.1) result = Matrix{Union{Float64, Missing}}(X) @@ -41,6 +40,17 @@ function add_missings(X, ratio=0.1) return result end +function add_missings_single(X, ratio=0.1) + result = Matrix{Union{Float64, Missing}}(X) + + randcols = 1:floor(Int, size(X, 2) * ratio) + for col in randcols + result[rand(1:size(X, 1)), col] = missing + end + + return result +end + @testset "Impute" begin # Defining our missing datasets a = allowmissing(1.0:1.0:20.0) @@ -531,6 +541,158 @@ end end end + @testset "KNN" begin + @testset "Iris" begin + # Reference + # P. Schimitt, et. 
al + # A comparison of six methods for missing data imputation + iris = dataset("datasets", "iris") + iris2 = @from i in iris begin + @where i.Species == "versicolor" || i.Species == "virginica" + @select {i.SepalLength, i.SepalWidth, i.PetalLength, i.PetalWidth} + @collect DataFrame + end + data = Array(iris2) + num_tests = 100 + + @testset "Iris - 0.15" begin + X = add_missings(data', 0.15) + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; num_nn=1, missing_neighbors_threshold=0.5, + dist=Euclidean(), context=Context(; limit=1.0))) + mean_imputed = impute(copy(X), :fill; limit=1.0) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i + if ismissing(mean_nrmsd) + @show mean_imputed + end + end + + @test knn_nrmsd < mean_nrmsd + end + + @testset "Iris - 0.25" begin + X = add_missings(data', 0.25) + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; num_nn=1, missing_neighbors_threshold=0.5, + dist=Euclidean(), context=Context(; limit=1.0))) + mean_imputed = impute(copy(X), :fill; limit=1.0) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i + if ismissing(mean_nrmsd) + @show mean_imputed + end + end + + @test knn_nrmsd < mean_nrmsd + end + + @testset "Iris - 0.35" begin + X = add_missings(data', 0.35) + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; num_nn=1, missing_neighbors_threshold=0.5, + dist=Euclidean(), context=Context(; limit=1.0))) + mean_imputed = impute(copy(X), :fill; limit=1.0) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i + if ismissing(mean_nrmsd) + @show mean_imputed + end + end + + @test knn_nrmsd < mean_nrmsd + end 
+ end + + # Test a case where we expect kNN to perform well (e.g., many variables) + @testset "Data match" begin + data = mapreduce(hcat, 1:1000) do i + seeds = [sin(i), cos(i), tan(i), atan(i)] + mapreduce(vcat, combinations(seeds)) do args + [ + +(args...), + *(args...), + +(args...) * 100, + +(abs.(args)...), + (+(args...) * 10) ^ 2, + (+(abs.(args)...) * 10) ^ 2, + log(+(abs.(args)...) * 100), + +(args...) * 100 + rand(-10:0.1:10), + ] + end + end + + X = add_missings(data') + num_tests = 100 + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5, + dist=Euclidean(), context=Context(; limit=1.0))) + mean_imputed = impute(copy(X), :fill; limit=1.0) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i + end + + @test knn_nrmsd < mean_nrmsd + end + + # Test a case with few variables + # (e.g., only a few variables) + @testset "Data with few variables" begin + data = Matrix(dataset("Ecdat", "Electricity")) + X = add_missings(data) + num_tests = 100 + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5, + dist=Euclidean(), context=Context(; limit=1.0))) + mean_imputed = impute(copy(X), :fill; limit=1.0) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i + end + + @test knn_nrmsd < mean_nrmsd + end + + @testset "Data with random values" begin + M = rand(100, 200) + data = M * M' + X = add_missings(data) + num_tests = 100 + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; num_nn=3, missing_neighbors_threshold=0.5, + dist=Euclidean(), context=Context(; limit=1.0))) + mean_imputed = impute(copy(X), :fill; limit=1.0) + + knn_nrmsd = ((i - 1) * 
knn_nrmsd + nrmsd(data', knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i + end + + @test knn_nrmsd < mean_nrmsd + end + end + include("deprecated.jl") include("testutils.jl")