Skip to content

Commit

Permalink
Merge pull request #117 from invenia/rf/limit-locf
Browse files Browse the repository at this point in the history
Add limit keyword to LOCF, NOCB and Interpolation
  • Loading branch information
rofinn authored Apr 16, 2021
2 parents ad306f1 + 53b1bcb commit 5f688b9
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 41 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Impute"
uuid = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
authors = ["Invenia Technical Computing"]
version = "0.6.4"
version = "0.6.5"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down
46 changes: 30 additions & 16 deletions src/imputors/interp.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Interpolate()
Interpolate(; limit=nothing)
Performs linear interpolation between the nearest values in a vector.
The current implementation is univariate, so each variable in a table or matrix will
Expand All @@ -9,22 +9,34 @@ be handled independently.
are no existing values on both sides. As a result, this method does not guarantee
that all missing values will be imputed.
# Keyword Arguments
* `limit::Union{UInt, Nothing}`: Optionally limit the gap sizes that can be interpolated.
# Example
```jldoctest
julia> using Impute: Interpolate, impute
julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing 5.0
1.1 2.2 3.3 missing 5.5
julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing missing 6.0
1.1 missing missing 4.4 5.5 6.6
julia> impute(M, Interpolate(); dims=:rows)
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 3.0 4.0 5.0
1.1 2.2 3.3 4.4 5.5
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 3.0 4.0 5.0 6.0
1.1 2.2 3.3 4.4 5.5 6.6
julia> impute(M, Interpolate(; limit=2); dims=:rows)
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing missing 6.0
1.1 2.2 3.3 4.4 5.5 6.6
```
"""
struct Interpolate <: Imputor end
struct Interpolate <: Imputor
# Largest gap size that may be interpolated; `nothing` means gaps of any size are filled.
limit::Union{UInt, Nothing}
end

# Keyword constructor: `limit` defaults to `nothing` and is forwarded to the positional
# constructor.
Interpolate(; limit=nothing) = Interpolate(limit)

function _impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) where T
@assert !all(ismissing, data)
Expand All @@ -38,14 +50,16 @@ function _impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) w
if next_idx !== nothing
gap_sz = (next_idx - prev_idx) - 1

diff = data[next_idx] - data[prev_idx]
incr = diff / T(gap_sz + 1)
val = data[prev_idx] + incr
if imp.limit === nothing || gap_sz <= imp.limit
diff = data[next_idx] - data[prev_idx]
incr = diff / T(gap_sz + 1)
val = data[prev_idx] + incr

# Iteratively fill in the values
for j in i:(next_idx - 1)
data[j] = val
val += incr
# Iteratively fill in the values
for j in i:(next_idx - 1)
data[j] = val
val += incr
end
end

i = next_idx
Expand Down
45 changes: 33 additions & 12 deletions src/imputors/locf.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
LOCF()
LOCF(; limit=nothing)
Last observation carried forward (LOCF) iterates forwards through the `data` and fills
missing data with the last existing observation. The current implementation is univariate,
Expand All @@ -12,30 +12,51 @@ See also:
existing observation to carry forward. As a result, this method does not guarantee
that all missing values will be imputed.
# Keyword Arguments
* `limit::Union{UInt, Nothing}`: Optionally limits the number of consecutive missing values to replace.
# Example
```jldoctest
julia> using Impute: LOCF, impute
julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing 5.0
1.1 2.2 3.3 missing 5.5
julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing missing 6.0
1.1 missing missing 4.4 5.5 6.6
julia> impute(M, LOCF(); dims=:rows)
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 2.0 2.0 5.0
1.1 2.2 3.3 3.3 5.5
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 2.0 2.0 2.0 6.0
1.1 1.1 1.1 4.4 5.5 6.6
julia> impute(M, LOCF(; limit=2); dims=:rows)
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 2.0 2.0 missing 6.0
1.1 1.1 1.1 4.4 5.5 6.6
```
"""
struct LOCF <: Imputor end
struct LOCF <: Imputor
# Maximum number of consecutive missing values to fill after an observation;
# `nothing` means every missing value following an observation is filled.
limit::Union{UInt, Nothing}
end

# Keyword constructor: `limit` defaults to `nothing` and is forwarded to the positional
# constructor.
LOCF(; limit=nothing) = LOCF(limit)

function _impute!(data::AbstractVector{Union{T, Missing}}, imp::LOCF) where T
@assert !all(ismissing, data)
start_idx = findfirst(!ismissing, data) + 1
start_idx = findfirst(!ismissing, data)
count = 1

for i in start_idx:lastindex(data)
for i in start_idx + 1:lastindex(data)
if ismissing(data[i])
data[i] = data[i-1]
if imp.limit === nothing
data[i] = data[i-1]
elseif count <= imp.limit
data[i] = data[start_idx]
count += 1
end
else
start_idx = i
count = 1
end
end

Expand Down
43 changes: 31 additions & 12 deletions src/imputors/nocb.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
NOCB()
NOCB(; limit=nothing)
Next observation carried backward (NOCB) iterates backwards through the `data` and fills
missing data with the next existing observation.
Expand All @@ -12,31 +12,50 @@ existing observation to carry backward. As a result, this method does not guaran
that all missing values will be imputed.
# Keyword Arguments
* `limit::Union{UInt, Nothing}`: Optionally limits the number of consecutive missing values to replace.
# Example
```jldoctest
julia> using Impute: NOCB, impute
julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing 5.0
1.1 2.2 3.3 missing 5.5
julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing missing missing 6.0
1.1 missing missing 4.4 5.5 6.6
julia> impute(M, NOCB(); dims=:rows)
2×5 Matrix{Union{Missing, Float64}}:
1.0 2.0 5.0 5.0 5.0
1.1 2.2 3.3 5.5 5.5
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 6.0 6.0 6.0 6.0
1.1 4.4 4.4 4.4 5.5 6.6
julia> impute(M, NOCB(; limit=2); dims=:rows)
2×6 Matrix{Union{Missing, Float64}}:
1.0 2.0 missing 6.0 6.0 6.0
1.1 4.4 4.4 4.4 5.5 6.6
```
"""
struct NOCB <: Imputor end
struct NOCB <: Imputor
# Maximum number of consecutive missing values to fill before an observation;
# `nothing` means every missing value preceding an observation is filled.
limit::Union{UInt, Nothing}
end

# Keyword constructor: `limit` defaults to `nothing` and is forwarded to the positional
# constructor.
NOCB(; limit=nothing) = NOCB(limit)

function _impute!(data::AbstractVector{Union{T, Missing}}, imp::NOCB) where T
@assert !all(ismissing, data)
end_idx = findlast(!ismissing, data) - 1
end_idx = findlast(!ismissing, data)
count = 1

for i in end_idx:-1:firstindex(data)
for i in end_idx - 1:-1:firstindex(data)
if ismissing(data[i])
data[i] = data[i+1]
if imp.limit === nothing
data[i] = data[i+1]
elseif count <= imp.limit
data[i] = data[end_idx]
count += 1
end
else
end_idx = i
count = 1
end
end

Expand Down
12 changes: 12 additions & 0 deletions test/imputors/interp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@
result = Impute.interp(b)
@test ismissing(result[1])
@test ismissing(result[20])

# Test limiting
c = allowmissing(1.0:1.0:20.0)
c[13:15] .= missing

# Limit too small for gap
expected = copy(c)
@test isequal(impute(c, Interpolate(; limit=2)), expected)

# Limit matches gap size
expected[13:15] .= [13.0, 14.0, 15.0]
@test isequal(impute(c, Interpolate(; limit=3)), expected)
end

@testset "Ints" begin
Expand Down
15 changes: 15 additions & 0 deletions test/imputors/locf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
test_columntable(tester)
test_rowtable(tester)

test_limited(tester)

@testset "Cube" begin
a = allowmissing(1.0:1.0:60.0)
a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing
Expand Down Expand Up @@ -48,6 +50,19 @@
result = Impute.locf(b)
@test ismissing(result[1])
@test result[20] == 1.0

# Test limiting
a[11:15] .= missing

expected = copy(a)
@test isequal(impute(a, LOCF(; limit=0)), expected)

expected[2] = 1.0
expected[3] = 1.0
expected[7] = 6.0
expected[11:13] .= 10.0

@test isequal(impute(a, LOCF(; limit=3)), expected)
end

@testset "Ints" begin
Expand Down
15 changes: 15 additions & 0 deletions test/imputors/nocb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
test_columntable(tester)
test_rowtable(tester)

test_limited(tester)

@testset "Cube" begin
a = allowmissing(1.0:1.0:60.0)
a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing
Expand Down Expand Up @@ -48,6 +50,19 @@
result = Impute.nocb(b)
@test result[1] == 1.0
@test ismissing(result[20])

# Test limiting
a[11:15] .= missing

expected = copy(a)
@test isequal(impute(a, NOCB(; limit=0)), expected)

expected[2] = 4.0
expected[3] = 4.0
expected[7] = 8.0
expected[13:15] .= 16.0

@test isequal(impute(a, NOCB(; limit=3)), expected)
end

@testset "Ints" begin
Expand Down
40 changes: 40 additions & 0 deletions test/testutils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,46 @@ function test_equality(tester::ImputorTester)
end
end

# Shared checks for imputors that accept a `limit` keyword.
#
# Builds a 20-element vector with missing runs of length 2 (2:3), 1 (7) and 5 (11:15),
# then compares imputing with and without a limit through the imputor type
# (`tester.imp`) and the functional forms (`tester.f` / `tester.f!`).
function test_limited(tester::ImputorTester)
    @testset "Limited" begin
        data = allowmissing(1.0:1.0:20.0)
        data[vcat(2, 3, 7, 11:15)] .= missing

        # Baseline: no limit supplied.
        unlimited = impute(data, tester.imp(; tester.kwargs...))

        @testset "Limit equals missings" begin
            # A limit equal to the longest run (5) must behave like no limit at all.
            filled = impute(data, tester.imp(; limit=5, tester.kwargs...))

            @test count(ismissing, filled) < count(ismissing, data)

            @test isequal(filled, unlimited)
            @test isequal(filled, tester.f(data; limit=5, tester.kwargs...))
        end

        @testset "Limit less than missings" begin
            # A limit of 2 fills some values but must leave part of the run of 5 missing.
            partial = impute(data, tester.imp(; limit=2, tester.kwargs...))

            @test count(ismissing, partial) < count(ismissing, data)
            @test count(ismissing, unlimited) < count(ismissing, partial)

            @test isequal(partial, tester.f(data; limit=2, tester.kwargs...))
        end

        @testset "In-place" begin
            # Test that the in-place function returns the new results, and log whether
            # it actually mutated its input.
            scratch = deepcopy(data)
            expected = tester.f(scratch; limit=1, tester.kwargs...)

            returned = tester.f!(scratch; limit=1, tester.kwargs...)
            @test isequal(returned, expected)
            isequal(scratch, expected) ||
                @warn "$(tester.f!) did not mutate input data when limited"
        end
    end
end

function test_vector(tester::ImputorTester)
@testset "Vector" begin
if tester.imp != DropVars
Expand Down

2 comments on commit 5f688b9

@rofinn
Copy link
Member Author

@rofinn rofinn commented on 5f688b9 Apr 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/34521

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.6.5 -m "<description of version>" 5f688b9aca772f8f7418ac504091f4df103bacf5
git push origin v0.6.5

Please sign in to comment.