Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add limit keyword to LOCF, NOCB and Interpolation #117

Merged
merged 5 commits into from
Apr 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Impute"
uuid = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
authors = ["Invenia Technical Computing"]
version = "0.6.4"
version = "0.6.5"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down
46 changes: 30 additions & 16 deletions src/imputors/interp.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Interpolate()
Interpolate(; limit=nothing)

Performs linear interpolation between the nearest values in a vector.
The current implementation is univariate, so each variable in a table or matrix will
Expand All @@ -9,22 +9,34 @@ be handled independently.
are no existing values on both sides. As a result, this method does not guarantee
that all missing values will be imputed.

# Keyword Arguments
* `limit::Union{UInt, Nothing}`: Optionally limit the gap sizes that can be interpolated.

# Example
```jldoctest
julia> using Impute: Interpolate, impute

julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing 5.0
1.1 2.2 3.3 missing 5.5
julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing missing 6.0
1.1 missing missing 4.4 5.5 6.6

julia> impute(M, Interpolate(); dims=:rows)
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 3.0 4.0 5.0
1.1 2.2 3.3 4.4 5.5
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 3.0 4.0 5.0 6.0
1.1 2.2 3.3 4.4 5.5 6.6

julia> impute(M, Interpolate(; limit=2); dims=:rows)
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing missing 6.0
1.1 2.2 3.3 4.4 5.5 6.6
```
"""
struct Interpolate <: Imputor end
struct Interpolate <: Imputor
# Largest gap (run of consecutive missing values between two existing values)
# that will be interpolated; `nothing` disables the check so gaps of any size
# are filled.
limit::Union{UInt, Nothing}
end

# Keyword constructor: `limit` defaults to `nothing`, i.e. no gap-size limit.
Interpolate(; limit=nothing) = Interpolate(limit)

function _impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) where T
@assert !all(ismissing, data)
Expand All @@ -38,14 +50,16 @@ function _impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) w
if next_idx !== nothing
gap_sz = (next_idx - prev_idx) - 1

diff = data[next_idx] - data[prev_idx]
incr = diff / T(gap_sz + 1)
val = data[prev_idx] + incr
if imp.limit === nothing || gap_sz <= imp.limit
diff = data[next_idx] - data[prev_idx]
incr = diff / T(gap_sz + 1)
val = data[prev_idx] + incr

# Iteratively fill in the values
for j in i:(next_idx - 1)
data[j] = val
val += incr
# Iteratively fill in the values
for j in i:(next_idx - 1)
data[j] = val
val += incr
end
end

i = next_idx
Expand Down
45 changes: 33 additions & 12 deletions src/imputors/locf.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
LOCF()
LOCF(; limit=nothing)

Last observation carried forward (LOCF) iterates forwards through the `data` and fills
missing data with the last existing observation. The current implementation is univariate,
Expand All @@ -12,30 +12,51 @@ See also:
existing observation to carry forward. As a result, this method does not guarantee
that all missing values will be imputed.

# Keyword Arguments
* `limit::Union{UInt, Nothing}`: Optionally limits the number of consecutive missing values to replace.

# Example
```jldoctest
julia> using Impute: LOCF, impute

julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing 5.0
1.1 2.2 3.3 missing 5.5
julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing missing 6.0
1.1 missing missing 4.4 5.5 6.6

julia> impute(M, LOCF(); dims=:rows)
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 2.0 2.0 5.0
1.1 2.2 3.3 3.3 5.5
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 2.0 2.0 2.0 6.0
1.1 1.1 1.1 4.4 5.5 6.6

julia> impute(M, LOCF(; limit=2); dims=:rows)
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 2.0 2.0 missing 6.0
1.1 1.1 1.1 4.4 5.5 6.6
```
"""
struct LOCF <: Imputor end
struct LOCF <: Imputor
# Maximum number of consecutive missing values to fill from the last existing
# observation; `nothing` means every missing value after an observation is filled.
limit::Union{UInt, Nothing}
end

# Keyword constructor: `limit` defaults to `nothing`, i.e. no limit.
LOCF(; limit=nothing) = LOCF(limit)

function _impute!(data::AbstractVector{Union{T, Missing}}, imp::LOCF) where T
@assert !all(ismissing, data)
start_idx = findfirst(!ismissing, data) + 1
start_idx = findfirst(!ismissing, data)
count = 1

for i in start_idx:lastindex(data)
for i in start_idx + 1:lastindex(data)
if ismissing(data[i])
data[i] = data[i-1]
if imp.limit === nothing
data[i] = data[i-1]
elseif count <= imp.limit
data[i] = data[start_idx]
count += 1
end
else
start_idx = i
count = 1
end
end

Expand Down
43 changes: 31 additions & 12 deletions src/imputors/nocb.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
NOCB()
NOCB(; limit=nothing)

Next observation carried backward (NOCB) iterates backwards through the `data` and fills
missing data with the next existing observation.
Expand All @@ -12,31 +12,50 @@ existing observation to carry backward. As a result, this method does not guaran
that all missing values will be imputed.

# Keyword Arguments
* `limit::Union{UInt, Nothing}`: Optionally limits the number of consecutive missing values to replace.

# Example
```jldoctest
julia> using Impute: NOCB, impute

julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing 5.0
1.1 2.2 3.3 missing 5.5
julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing missing 6.0
1.1 missing missing 4.4 5.5 6.6

julia> impute(M, NOCB(); dims=:rows)
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 5.0 5.0 5.0
1.1 2.2 3.3 5.5 5.5
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 6.0 6.0 6.0 6.0
1.1 4.4 4.4 4.4 5.5 6.6

julia> impute(M, NOCB(; limit=2); dims=:rows)
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing 6.0 6.0 6.0
1.1 4.4 4.4 4.4 5.5 6.6
```
"""
struct NOCB <: Imputor end
struct NOCB <: Imputor
# Maximum number of consecutive missing values to fill from the next existing
# observation; `nothing` means every missing value before an observation is filled.
limit::Union{UInt, Nothing}
end

# Keyword constructor: `limit` defaults to `nothing`, i.e. no limit.
NOCB(; limit=nothing) = NOCB(limit)

function _impute!(data::AbstractVector{Union{T, Missing}}, imp::NOCB) where T
@assert !all(ismissing, data)
end_idx = findlast(!ismissing, data) - 1
end_idx = findlast(!ismissing, data)
count = 1

for i in end_idx:-1:firstindex(data)
for i in end_idx - 1:-1:firstindex(data)
if ismissing(data[i])
data[i] = data[i+1]
if imp.limit === nothing
data[i] = data[i+1]
elseif count <= imp.limit
data[i] = data[end_idx]
count += 1
end
else
end_idx = i
count = 1
end
end

Expand Down
12 changes: 12 additions & 0 deletions test/imputors/interp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@
result = Impute.interp(b)
@test ismissing(result[1])
@test ismissing(result[20])

# Test limiting
c = allowmissing(1.0:1.0:20.0)
c[13:15] .= missing

# Limit too small for gap
expected = copy(c)
@test isequal(impute(c, Interpolate(; limit=2)), expected)

# Limit matches gap size
expected[13:15] .= [13.0, 14.0, 15.0]
@test isequal(impute(c, Interpolate(; limit=3)), expected)
end

@testset "Ints" begin
Expand Down
15 changes: 15 additions & 0 deletions test/imputors/locf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
test_columntable(tester)
test_rowtable(tester)

test_limited(tester)

@testset "Cube" begin
a = allowmissing(1.0:1.0:60.0)
a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing
Expand Down Expand Up @@ -48,6 +50,19 @@
result = Impute.locf(b)
@test ismissing(result[1])
@test result[20] == 1.0

# Test limiting
a[11:15] .= missing

expected = copy(a)
@test isequal(impute(a, LOCF(; limit=0)), expected)

expected[2] = 1.0
expected[3] = 1.0
expected[7] = 6.0
expected[11:13] .= 10.0

@test isequal(impute(a, LOCF(; limit=3)), expected)
end

@testset "Ints" begin
Expand Down
15 changes: 15 additions & 0 deletions test/imputors/nocb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
test_columntable(tester)
test_rowtable(tester)

test_limited(tester)

@testset "Cube" begin
a = allowmissing(1.0:1.0:60.0)
a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing
Expand Down Expand Up @@ -48,6 +50,19 @@
result = Impute.nocb(b)
@test result[1] == 1.0
@test ismissing(result[20])

# Test limiting
a[11:15] .= missing

expected = copy(a)
@test isequal(impute(a, NOCB(; limit=0)), expected)

expected[2] = 4.0
expected[3] = 4.0
expected[7] = 8.0
expected[13:15] .= 16.0

@test isequal(impute(a, NOCB(; limit=3)), expected)
end

@testset "Ints" begin
Expand Down
40 changes: 40 additions & 0 deletions test/testutils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,46 @@ function test_equality(tester::ImputorTester)
end
end

# Exercise the `limit` keyword of the imputor wrapped by `tester`.
#
# The fixture is the vector 1.0:20.0 with missing runs of length 2 (indices 2:3),
# 1 (index 7) and 5 (indices 11:15). Three things are checked:
#   * a limit equal to the longest run gives the same result as no limit at all;
#   * a smaller limit fills some, but not all, of the missing values;
#   * `tester.f!` returns the same result as `tester.f` (a warning is logged if the
#     input vector itself was not updated).
function test_limited(tester::ImputorTester)
    @testset "Limited" begin
        nmissing(x) = count(ismissing, x)

        a = allowmissing(1.0:1.0:20.0)
        a[vcat(2:3, 7, 11:15)] .= missing

        # Reference result with no limit applied.
        unlimited = impute(a, tester.imp(; tester.kwargs...))

        @testset "Limit equals missings" begin
            limited = impute(a, tester.imp(; limit=5, tester.kwargs...))

            @test nmissing(limited) < nmissing(a)

            # The limit matches the longest run, so nothing should be held back.
            @test isequal(limited, unlimited)
            @test isequal(limited, tester.f(a; limit=5, tester.kwargs...))
        end

        @testset "Limit less than missings" begin
            limited = impute(a, tester.imp(; limit=2, tester.kwargs...))

            # Strictly between "nothing imputed" and "everything imputed".
            @test nmissing(limited) < nmissing(a)
            @test nmissing(unlimited) < nmissing(limited)

            @test isequal(limited, tester.f(a; limit=2, tester.kwargs...))
        end

        @testset "In-place" begin
            # The mutating variant must return the same values as the non-mutating
            # one; whether it actually updated its argument is only reported.
            scratch = deepcopy(a)
            expected = tester.f(scratch; limit=1, tester.kwargs...)

            returned = tester.f!(scratch; limit=1, tester.kwargs...)
            @test isequal(returned, expected)
            isequal(scratch, expected) ||
                @warn "$(tester.f!) did not mutate input data when limited"
        end
    end
end

function test_vector(tester::ImputorTester)
@testset "Vector" begin
if tester.imp != DropVars
Expand Down