invenia · rofinn · Feb 24, 2020 · Dec 16, 2018 · Dec 28, 2018 · Jan 2, 2019
diff --git a/Project.toml b/Project.toml
@@ -5,6 +5,7 @@ version = "0.4.0"
 
 [deps]
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
@@ -20,10 +21,12 @@ julia = "1"
 
 [extras]
 AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9"
+Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["AxisArrays", "DataFrames", "Dates", "RDatasets", "Test"]
+test = ["AxisArrays", "Combinatorics", "DataFrames", "Dates", "Distances", "RDatasets", "Test"]
diff --git a/src/Impute.jl b/src/Impute.jl
@@ -6,8 +6,11 @@ using Statistics
 using StatsBase
 using Tables: Tables, materializer, istable
 
-import Base.Iterators: drop
+using Base.Iterators
+using LinearAlgebra
+using LinearAlgebra: Diagonal
 
+import Base.Iterators: drop
 
 """
     ImputeError{T} <: Exception
@@ -63,6 +66,7 @@ const global imputation_methods = (
     locf = LOCF,
     nocb = NOCB,
     srs = SRS,
+    svd = SVD,
 )
 
 include("deprecated.jl")
@@ -315,4 +319,18 @@ julia> Impute.srs(df; rng=MersenneTwister(1234), context=Context(; limit=1.0))
 ```
 """ srs
 
+"""
+    svd!(data::AbstractMatrix; limit=1.0)
+
+Utility method for `impute!(data, :svd; limit=limit)`
+"""
+svd!(data::AbstractMatrix; limit=1.0) = impute!(data, :svd; limit=limit)
+
+"""
+    svd(data::AbstractMatrix; limit=1.0)
+
+Utility method for `impute(data, :svd; limit=limit)`
+"""
+svd(data::AbstractMatrix; limit=1.0) = impute(data, :svd; limit=limit)
+
 end  # module
diff --git a/src/imputors.jl b/src/imputors.jl
@@ -175,7 +175,6 @@ function impute!(table, imp::Imputor)
     return table
 end
 
-
-for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl")
+for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl", "svd.jl")
     include(joinpath("imputors", file))
 end
diff --git a/src/imputors/svd.jl b/src/imputors/svd.jl
@@ -0,0 +1,97 @@
+"""
+    SVD <: Imputor
+
+Imputes the missing values in a matrix using an expectation maximization (EM) algorithm
+over low-rank SVD approximations.
+
+# Keyword Arguments
+* `init::Imputor`: initialization method for missing values (default: Fill())
+* `rank::Union{Int, Nothing}`: rank of the SVD approximation (default: nothing meaning start and 0 and increase)
+* `tol::Float64`: convergence tolerance (default: 1e-10)
+* `maxiter::Int`: Maximum number of iterations if convergence is not achieved (default: 100)
+* `limits::Unoin{Tuple{Float64, Float64}, Nothing}`: Bound the possible approximation values (default: nothing)
+* `verbose::Bool`: Whether to display convergence progress (default: true)
+* `context::Context`: Missing data context settings (default: Context())
+
+# References
+* Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525.
+"""
+struct SVD <: Imputor
+    init::Fill
+    rank::Union{Int, Nothing}
+    tol::Float64
+    maxiter::Int
+    limits::Union{Tuple{Float64, Float64}, Nothing}
+    verbose::Bool
+    context::AbstractContext
+end
+
+function SVD(;
+    init=Fill(), rank=nothing, tol=1e-10, maxiter=100, limits=nothing, verbose=true, context=Context()
+)
+    SVD(init, rank, tol, maxiter, limits, verbose, context)
+end
+
+function impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::SVD) where T<:Real
+    n, p = size(data)
+    k = imp.rank === nothing ? 0 : min(imp.rank, p-1)
+    S = zeros(T, min(n, p))
+    X = zeros(T, n, p)
+
+    ctx = imp.context
+    # Get our before and after views of our missing and non-missing data
+    mmask = ismissing.(data)
+    omask = .!mmask
+
+    mdata = data[mmask]
+    mX = X[mmask]
+    odata = data[omask]
+    oX = X[omask]
+
+    # Fill in the original data
+    impute!(data, imp.init)
+
+    C = sum(abs2, mdata - mX) / sum(abs2, mdata)
+    err = mean(abs.(odata - oX))
+    @info("Before: Diff=$(sum(mdata - mX)), MAE=$err, convergence=$C, normsq=$(sum(abs2, mdata)), $(mX[1])")
+
+    for i in 1:imp.maxiter
+        if imp.rank === nothing
+            k = min(k + 1, p - 1, n - 1)
+        end
+
+        # Compute the SVD and produce a low-rank approximation of the data
+        F = LinearAlgebra.svd(data)
+        # println(join([size(S), size(F.S), size(F.U), size(F.Vt)], ", "))
+
+        S[1:k] .= F.S[1:k]
+        X = F.U * Diagonal(S) * F.Vt
+
+        # Clamp the values if necessary
+        imp.limits !== nothing && clamp!(X, imp.limits...)
+
+        # Test for convergence
+        mdata = data[mmask]
+        mX = X[mmask]
+        odata = data[omask]
+        oX = X[omask]
+
+        # println(join([size(mdata), size(mX)], ", "))
+        C = sum(abs2, mdata - mX) / sum(abs2, mdata)
+
+        # Print the error between reconstruction and observed inputs
+        if imp.verbose
+            err = mean(abs.(odata - oX))
+            @info("Iteration $i: Diff=$(sum(mdata - mX)), MAE=$err, MSS=$(sum(abs2, mdata)), convergence=$C")
+        end
+
+        # Update missing values
+        data[mmask] .= X[mmask]
+
+        if isfinite(C) && C < imp.tol
+            break
+        end
+    end
+
+    return data
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,14 +1,18 @@
-using Impute
-using Tables
-using Test
+
 using AxisArrays
+using Combinatorics
 using DataFrames
 using Dates
+using Distances
+using LinearAlgebra
 using RDatasets
+using Random
 using Statistics
 using StatsBase
-using Random
+using Tables
+using Test
 
+using Impute
 using Impute:
     Impute,
     Imputor,
@@ -27,6 +31,16 @@ using Impute:
     chain
 
 
+function add_missings(X, ratio=0.1)
+    result = Matrix{Union{Float64, Missing}}(X)
+
+    for i in 1:floor(Int, length(X) * ratio)
+        result[rand(1:length(X))] = missing
+    end
+
+    return result
+end
+
 @testset "Impute" begin
     # Defining our missing datasets
     a = allowmissing(1.0:1.0:20.0)
@@ -523,4 +537,63 @@ using Impute:
     @testset "$T" for T in (DropObs, DropVars, Interpolate, Fill, LOCF, NOCB)
         test_all(ImputorTester(T))
     end
+
+    @testset "SVD" begin
+        # Test a case where we expect SVD to perform well (e.g., many variables, )
+        @testset "Data match" begin
+            data = mapreduce(hcat, 1:1000) do i
+                seeds = [sin(i), cos(i), tan(i), atan(i)]
+                mapreduce(vcat, combinations(seeds)) do args
+                    [
+                        +(args...),
+                        *(args...),
+                        +(args...) * 100,
+                        +(abs.(args)...),
+                        (+(args...) * 10) ^ 2,
+                        (+(abs.(args)...) * 10) ^ 2,
+                        log(+(abs.(args)...) * 100),
+                        +(args...) * 100 + rand(-10:0.1:10),
+                    ]
+                end
+            end
+
+            # println(svd(data').S)
+            X = add_missings(data')
+
+            svd_imputed = Impute.svd(X)
+            mean_imputed = impute(copy(X), :fill; limit=1.0)
+
+            # With sufficient correlation between the variables and enough observation we
+            # expect the svd imputation to perform severl times better than mean imputation.
+            @test nrmsd(svd_imputed, data') < nrmsd(mean_imputed, data') * 0.5
+        end
+
+        # Test a case where we know SVD imputation won't perform well
+        # (e.g., only a few variables, only )
+        @testset "Data mismatch - too few variables" begin
+            data = Matrix(dataset("Ecdat", "Electricity"))
+            X = add_missings(data)
+
+            svd_imputed = Impute.svd(X)
+            mean_imputed = impute(copy(X), :fill; limit=1.0)
+
+            # If we don't have enough variables then SVD imputation will probably perform
+            # about as well as mean imputation.
+            @test nrmsd(svd_imputed, data) > nrmsd(mean_imputed, data) * 0.9
+        end
+
+        @testset "Data mismatch - poor low rank approximations" begin
+            M = rand(100, 200)
+            data = M * M'
+            X = add_missings(data)
+
+            svd_imputed = Impute.svd(X)
+            mean_imputed = impute(copy(X), :fill; limit=1.0)
+
+            # If most of the variance in the original data can't be explained by a small
+            # subset of the eigen values in the svd decomposition then our low rank approximations
+            # won't perform very well.
+            @test nrmsd(svd_imputed, data) > nrmsd(mean_imputed, data) * 0.9
+        end
+    end
 end