Merge pull request #117 from invenia/rf/limit-locf

Add limit keyword to LOCF, NOCB and Interpolation
invenia · Apr 16, 2021 · 5f688b9 · 5f688b9 · rofinn · Apr 16, 2021
2 parents ad306f1 + 53b1bcb
commit 5f688b9
Show file tree

Hide file tree

Showing 8 changed files with 177 additions and 41 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Impute"
 uuid = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
 authors = ["Invenia Technical Computing"]
-version = "0.6.4"
+version = "0.6.5"
 
 [deps]
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"

diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl
@@ -1,5 +1,5 @@
 """
-    Interpolate()
+    Interpolate(; limit=nothing)
 
 Performs linear interpolation between the nearest values in a vector.
 The current implementation is univariate, so each variable in a table or matrix will
@@ -9,22 +9,34 @@ be handled independently.
 are no existing values on both sides. As a result, this method does not guarantee
 that all missing values will be imputed.
 
+# Keyword Arguments
+* `limit::Union{UInt, Nothing}`: Optionally limit the size of the gaps (number of consecutive missing values) that can be interpolated; larger gaps are left missing.
+
 # Example
 ```jldoctest
 julia> using Impute: Interpolate, impute
 
-julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
-2×5 Matrix{Union{Missing, Float64}}:
- 1.0  2.0   missing  missing  5.0
- 1.1  2.2  3.3       missing  5.5
+julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0       missing   missing   missing  6.0
+ 1.1   missing  missing  4.4       5.5       6.6
 
 julia> impute(M, Interpolate(); dims=:rows)
-2×5 Matrix{Union{Missing, Float64}}:
- 1.0  2.0  3.0  4.0  5.0
- 1.1  2.2  3.3  4.4  5.5
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0  3.0  4.0  5.0  6.0
+ 1.1  2.2  3.3  4.4  5.5  6.6
+
+julia> impute(M, Interpolate(; limit=2); dims=:rows)
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0   missing   missing   missing  6.0
+ 1.1  2.2  3.3       4.4       5.5       6.6
 ```
 """
-struct Interpolate <: Imputor end
+struct Interpolate <: Imputor
+    limit::Union{UInt, Nothing}
+end
+
+Interpolate(; limit=nothing) = Interpolate(limit)
 
 function _impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) where T
     @assert !all(ismissing, data)
@@ -38,14 +50,16 @@ function _impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) w
             if next_idx !== nothing
                 gap_sz = (next_idx - prev_idx) - 1
 
-                diff = data[next_idx] - data[prev_idx]
-                incr = diff / T(gap_sz + 1)
-                val = data[prev_idx] + incr
+                if imp.limit === nothing || gap_sz <= imp.limit
+                    diff = data[next_idx] - data[prev_idx]
+                    incr = diff / T(gap_sz + 1)
+                    val = data[prev_idx] + incr
 
-                # Iteratively fill in the values
-                for j in i:(next_idx - 1)
-                    data[j] = val
-                    val += incr
+                    # Iteratively fill in the values
+                    for j in i:(next_idx - 1)
+                        data[j] = val
+                        val += incr
+                    end
                 end
 
                 i = next_idx

diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl
@@ -1,5 +1,5 @@
 """
-    LOCF()
+    LOCF(; limit=nothing)
 
 Last observation carried forward (LOCF) iterates forwards through the `data` and fills
 missing data with the last existing observation. The current implementation is univariate,
@@ -12,30 +12,51 @@ See also:
 existing observation to carry forward. As a result, this method does not guarantee
 that all missing values will be imputed.
 
+# Keyword Arguments
+* `limit::Union{UInt, Nothing}`: Optionally limits the number of consecutive missing values to replace.
+
 # Example
 ```jldoctest
 julia> using Impute: LOCF, impute
 
-julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
-2×5 Matrix{Union{Missing, Float64}}:
- 1.0  2.0   missing  missing  5.0
- 1.1  2.2  3.3       missing  5.5
+julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0       missing   missing   missing  6.0
+ 1.1   missing  missing  4.4       5.5       6.6
 
 julia> impute(M, LOCF(); dims=:rows)
-2×5 Matrix{Union{Missing, Float64}}:
- 1.0  2.0  2.0  2.0  5.0
- 1.1  2.2  3.3  3.3  5.5
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0  2.0  2.0  2.0  6.0
+ 1.1  1.1  1.1  4.4  5.5  6.6
+
+julia> impute(M, LOCF(; limit=2); dims=:rows)
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0  2.0  2.0   missing  6.0
+ 1.1  1.1  1.1  4.4  5.5       6.6
 ```
 """
-struct LOCF <: Imputor end
+struct LOCF <: Imputor
+    limit::Union{UInt, Nothing}
+end
+
+LOCF(; limit=nothing) = LOCF(limit)
 
 function _impute!(data::AbstractVector{Union{T, Missing}}, imp::LOCF) where T
     @assert !all(ismissing, data)
-    start_idx = findfirst(!ismissing, data) + 1
+    start_idx = findfirst(!ismissing, data)
+    count = 1
 
-    for i in start_idx:lastindex(data)
+    for i in start_idx + 1:lastindex(data)
         if ismissing(data[i])
-            data[i] = data[i-1]
+            if imp.limit === nothing
+                data[i] = data[i-1]
+            elseif count <= imp.limit
+                data[i] = data[start_idx]
+                count += 1
+            end
+        else
+            start_idx = i
+            count = 1
         end
     end
 

diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl
@@ -1,5 +1,5 @@
 """
-    NOCB()
+    NOCB(; limit=nothing)
 
 Next observation carried backward (NOCB) iterates backwards through the `data` and fills
 missing data with the next existing observation.
@@ -12,31 +12,50 @@ existing observation to carry backward. As a result, this method does not guaran
 that all missing values will be imputed.
 
 # Keyword Arguments
+* `limit::Union{UInt, Nothing}`: Optionally limits the number of consecutive missing values to replace.
 
 # Example
 ```jldoctest
 julia> using Impute: NOCB, impute
 
-julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]
-2×5 Matrix{Union{Missing, Float64}}:
- 1.0  2.0   missing  missing  5.0
- 1.1  2.2  3.3       missing  5.5
+julia> M = [1.0 2.0 missing missing missing 6.0; 1.1 missing missing 4.4 5.5 6.6]
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0       missing   missing   missing  6.0
+ 1.1   missing  missing  4.4       5.5       6.6
 
 julia> impute(M, NOCB(); dims=:rows)
-2×5 Matrix{Union{Missing, Float64}}:
- 1.0  2.0  5.0  5.0  5.0
- 1.1  2.2  3.3  5.5  5.5
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0  6.0  6.0  6.0  6.0
+ 1.1  4.4  4.4  4.4  5.5  6.6
+
+julia> impute(M, NOCB(; limit=2); dims=:rows)
+2×6 Matrix{Union{Missing, Float64}}:
+ 1.0  2.0   missing  6.0  6.0  6.0
+ 1.1  4.4  4.4       4.4  5.5  6.6
 ```
 """
-struct NOCB <: Imputor end
+struct NOCB <: Imputor
+    limit::Union{UInt, Nothing}
+end
+
+NOCB(; limit=nothing) = NOCB(limit)
 
 function _impute!(data::AbstractVector{Union{T, Missing}}, imp::NOCB) where T
     @assert !all(ismissing, data)
-    end_idx = findlast(!ismissing, data) - 1
+    end_idx = findlast(!ismissing, data)
+    count = 1
 
-    for i in end_idx:-1:firstindex(data)
+    for i in end_idx - 1:-1:firstindex(data)
         if ismissing(data[i])
-            data[i] = data[i+1]
+            if imp.limit === nothing
+                data[i] = data[i+1]
+            elseif count <= imp.limit
+                data[i] = data[end_idx]
+                count += 1
+            end
+        else
+            end_idx = i
+            count = 1
         end
     end
 

diff --git a/test/imputors/interp.jl b/test/imputors/interp.jl
@@ -50,6 +50,18 @@
         result = Impute.interp(b)
         @test ismissing(result[1])
         @test ismissing(result[20])
+
+        # Test limiting
+        c = allowmissing(1.0:1.0:20.0)
+        c[13:15] .= missing
+
+        # Limit too small for gap
+        expected = copy(c)
+        @test isequal(impute(c, Interpolate(; limit=2)), expected)
+
+        # Limit matches gap size
+        expected[13:15] .= [13.0, 14.0, 15.0]
+        @test isequal(impute(c, Interpolate(; limit=3)), expected)
     end
 
     @testset "Ints" begin

diff --git a/test/imputors/locf.jl b/test/imputors/locf.jl
@@ -15,6 +15,8 @@
         test_columntable(tester)
         test_rowtable(tester)
 
+        test_limited(tester)
+
         @testset "Cube" begin
             a = allowmissing(1.0:1.0:60.0)
             a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing
@@ -48,6 +50,19 @@
         result = Impute.locf(b)
         @test ismissing(result[1])
         @test result[20] == 1.0
+
+        # Test limiting
+        a[11:15] .= missing
+
+        expected = copy(a)
+        @test isequal(impute(a, LOCF(; limit=0)), expected)
+
+        expected[2] = 1.0
+        expected[3] = 1.0
+        expected[7] = 6.0
+        expected[11:13] .= 10.0
+
+        @test isequal(impute(a, LOCF(; limit=3)), expected)
     end
 
     @testset "Ints" begin

diff --git a/test/imputors/nocb.jl b/test/imputors/nocb.jl
@@ -15,6 +15,8 @@
         test_columntable(tester)
         test_rowtable(tester)
 
+        test_limited(tester)
+
         @testset "Cube" begin
             a = allowmissing(1.0:1.0:60.0)
             a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing
@@ -48,6 +50,19 @@
         result = Impute.nocb(b)
         @test result[1] == 1.0
         @test ismissing(result[20])
+
+        # Test limiting
+        a[11:15] .= missing
+
+        expected = copy(a)
+        @test isequal(impute(a, NOCB(; limit=0)), expected)
+
+        expected[2] = 4.0
+        expected[3] = 4.0
+        expected[7] = 8.0
+        expected[13:15] .= 16.0
+
+        @test isequal(impute(a, NOCB(; limit=3)), expected)
     end
 
     @testset "Ints" begin

diff --git a/test/testutils.jl b/test/testutils.jl
@@ -76,6 +76,46 @@ function test_equality(tester::ImputorTester)
     end
 end
 
+function test_limited(tester::ImputorTester)
+    @testset "Limited" begin
+        a = allowmissing(1.0:1.0:20.0)
+        a[[2, 3, 7, 11:15...]] .= missing
+
+        all_imputed = impute(a, tester.imp(; tester.kwargs...))
+
+        @testset "Limit equals missings" begin
+            result = impute(a, tester.imp(; limit=5, tester.kwargs...))
+
+            @test count(ismissing, result) < count(ismissing, a)
+
+            @test isequal(result, all_imputed)
+            @test isequal(result, tester.f(a; limit=5, tester.kwargs...))
+        end
+
+        @testset "Limit less than missings" begin
+            result = impute(a, tester.imp(; limit=2, tester.kwargs...))
+
+            @test count(ismissing, result) < count(ismissing, a)
+            @test count(ismissing, all_imputed) < count(ismissing, result)
+
+            @test isequal(result, tester.f(a; limit=2, tester.kwargs...))
+        end
+
+        @testset "In-place" begin
+            # Test that the in-place function returns the new results, and warn if it
+            # did not actually mutate the input in place
+            a2 = deepcopy(a)
+            result = tester.f(a2; limit=1, tester.kwargs...)
+
+            a2_ = tester.f!(a2; limit=1, tester.kwargs...)
+            @test isequal(a2_, result)
+            if !isequal(a2, result)
+                @warn "$(tester.f!) did not mutate input data when limited"
+            end
+        end
+    end
+end
+
 function test_vector(tester::ImputorTester)
     @testset "Vector" begin
         if tester.imp != DropVars