-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathsvd.jl
100 lines (78 loc) · 3.18 KB
/
svd.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
SVD(; kwargs...)
Imputes the missing values in a matrix using an expectation maximization (EM) algorithm
over low-rank SVD approximations.
# Keyword Arguments
* `init::Imputor`: initialization method for missing values (default: Substitute())
* `rank::Union{Int, Nothing}`: rank of the SVD approximation (default: nothing meaning start and 0 and increase)
* `tol::Float64`: convergence tolerance (default: 1e-10)
* `maxiter::Int`: Maximum number of iterations if convergence is not achieved (default: 100)
* `limits::Union{Tuple{Float64, Float64}, Nothing}`: Bound the possible approximation values (default: nothing)
* `verbose::Bool`: Whether to display convergence progress (default: true)
# References
* Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525.
"""
struct SVD <: Imputor
init::Imputor
rank::Union{Int, Nothing}
tol::Float64
maxiter::Int
limits::Union{Tuple{Float64, Float64}, Nothing}
verbose::Bool
end
function SVD(;
init=Substitute(), rank=nothing, tol=1e-10, maxiter=100, limits=nothing, verbose=true
)
SVD(init, rank, tol, maxiter, limits, verbose)
end
function impute!(data::AbstractMatrix{Union{T, Missing}}, imp::SVD; dims=nothing) where T<:Real
n, p = size(data)
k = imp.rank === nothing ? 0 : min(imp.rank, p-1)
S = zeros(T, min(n, p))
X = zeros(T, n, p)
# Get our before and after views of our missing and non-missing data
mmask = ismissing.(data)
omask = .!mmask
mdata = data[mmask]
mX = X[mmask]
odata = data[omask]
oX = X[omask]
# Fill in the original data
impute!(data, imp.init; dims=dims)
C = sum(abs2, mdata - mX) / sum(abs2, mdata)
err = mean(abs.(odata - oX))
@debug("Before", Diff=sum(mdata - mX), MAE=err, convergence=C, normsq=sum(abs2, mdata), mX[1])
for i in 1:imp.maxiter
if imp.rank === nothing
k = min(k + 1, p - 1, n - 1)
end
# Compute the SVD and produce a low-rank approximation of the data
F = LinearAlgebra.svd(data)
# println(join([size(S), size(F.S), size(F.U), size(F.Vt)], ", "))
S[1:k] .= F.S[1:k]
X = F.U * Diagonal(S) * F.Vt
# Clamp the values if necessary
imp.limits !== nothing && clamp!(X, imp.limits...)
# Test for convergence
mdata = data[mmask]
mX = X[mmask]
odata = data[omask]
oX = X[omask]
# println(join([size(mdata), size(mX)], ", "))
C = sum(abs2, mdata - mX) / sum(abs2, mdata)
# Print the error between reconstruction and observed inputs
if imp.verbose
err = mean(abs.(odata - oX))
@debug("Iteration", i, Diff=sum(mdata - mX), MAE=err, MSS=sum(abs2, mdata), convergence=C)
end
# Update missing values
data[mmask] .= X[mmask]
isfinite(C) && C < imp.tol && break
end
return data
end
impute!(data::AbstractMatrix{Missing}, imp::SVD; kwargs...) = data
function impute(data::AbstractMatrix{Union{T, Missing}}, imp::SVD; kwargs...) where T<:Real
return impute!(trycopy(data), imp; kwargs...)
end
impute(data::AbstractMatrix{Missing}, imp::SVD; kwargs...) = trycopy(data)