Add apply_append methods

invenia · Mar 31, 2021 · 0a5252d · 0a5252d
1 parent 178a442
commit 0a5252d
Show file tree

Hide file tree

Showing 15 changed files with 433 additions and 18 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,10 +1,11 @@
 name = "FeatureTransforms"
 uuid = "8fd68953-04b8-4117-ac19-158bf6de9782"
 authors = ["Invenia Technical Computing Corporation"]
-version = "0.3.0"
+version = "0.3.1"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
@@ -13,6 +14,7 @@ AxisArrays = "0.4"
 AxisKeys = "0.1"
 DataFrames = "0.22"
 Documenter = "0.26"
+NamedDims = "0.2.32"
 Tables = "1.3"
 julia = "1.5"
 

diff --git a/README.md b/README.md
@@ -32,10 +32,15 @@ julia> df = DataFrame(:a=>[1, 2, 3, 4, 5], :b=>[5, 4, 3, 2, 1], :c=>[2, 1, 3, 1,
    5 │     5      1      3
 ```
 
-Next, we construct the `Transform` that we want to `apply` to the data, which can either be non-mutating (`apply`) or mutating (`apply!`).
-All `Transforms` support the non-mutating `apply` method but any `Transform` that changes the type or dimension of the input does not support mutation.
+Next, we construct the `Transform` that we want to perform on the data.
+The `Transform` can then be applied in one of three ways:
+1. `apply`, which does not mutate the underlying data,
+1. `apply!`, which _does_ mutate the underlying data,
+1. `apply_append`, which will `apply` the transform and then append the result to a copy of the input.
 
-In either case, the return will be the same type as the input, so if you provide an `Array` you get back an `Array`, and if you provide a `Table` you get back a `Table`.
+All `Transform`s support the non-mutating `apply` and `apply_append` methods, but any `Transform` that changes the type or dimension of the input does not support the mutating `apply!`.
+
+In all three cases, the returned value will have the same type as the input, so if you provide an `Array` you get back an `Array`, and if you provide a `Table` you get back a `Table`.
 Here we are working with a `DataFrame`, so the return will always be a `DataFrame`:
 ```julia
 julia> p = Power(3);
@@ -61,11 +66,21 @@ julia> FeatureTransforms.apply!(df, p; cols=[:a])
    3 │    27      3      3
    4 │    64      2      1
    5 │   125      1      3
-```
 
+julia> FeatureTransforms.apply_append(df, p; cols=[:a], header=[:a3])
+5×4 DataFrame
+ Row │ a      b      c      a3    
+     │ Int64  Int64  Int64  Int64 
+─────┼────────────────────────────
+   1 │     1      5      2      1
+   2 │     2      4      1      8
+   3 │     3      3      3     27
+   4 │     4      2      1     64
+   5 │     5      1      3    125
+
+```
 
-`Transform`s that don't support mutation must be called using `apply` and appended.
-To help with this, you can call the `Transform` type directly:
+As an extra convenience, you can call the `Transform` type directly, which emulates calling `apply`:
 ```julia
 julia> ohe = OneHotEncoding(1:3);
 

diff --git a/docs/src/api.md b/docs/src/api.md
@@ -19,6 +19,7 @@ OneHotEncoding
 ```@docs
 FeatureTransforms.apply
 FeatureTransforms.apply!
+FeatureTransforms.apply_append
 FeatureTransforms.is_transformable
 FeatureTransforms.transform!
 FeatureTransforms.transform

diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -41,9 +41,7 @@ One way to do this is with the `Periodic` transform, specifying a period of 1 da
 ```jldoctest example
 julia> periodic = Periodic(sin, Day(1));
 
-julia> df.hour_of_day_sin = FeatureTransforms.apply(df.time, periodic);
-
-julia> feature_df = df
+julia> feature_df = FeatureTransforms.apply_append(df, periodic, cols=:time, header=[:hour_of_day_sin])
 24×4 DataFrame
  Row │ time                 temperature  humidity  hour_of_day_sin
      │ DateTime             Float64      Float64   Float64
@@ -127,8 +125,8 @@ julia> FeatureTransforms.apply!(test_df, hum_scaling; cols=:humidity)
    2 │ 2018-09-10T23:00:00    -0.403818  0.579814        -0.258819
 ```
 
-Suppose we then train our model, and get a prediction for the test points as a matrix: `[-0.36 0.61; -0.45 0.68]`.
-We can scale this back to the original units of temperature and humidity by converting to a [`Table`](https://github.com/JuliaData/Tables.jl) type (to label the columns) and using inverse scaling:
+Suppose we then train our model, and get a prediction for the test points.
+We can scale this back to the original units of temperature and humidity by using the inverse scaling:
 
 ```jldoctest example
 julia> predictions = DataFrame([-0.36 0.61; -0.45 0.68], output_cols);

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -6,8 +6,8 @@ FeatureTransforms supports operations on `AbstractArray`s and [`Table`](https://
 There are three key parts of the FeatureTransforms.jl API:
 
 * Subtypes of [`Transform`](@ref about-transforms) define transformations of data, for example normalization or a periodic function.
-* The `apply` and `apply!` methods transform data according to the given [`Transform`](@ref about-transforms), in a manner determined by the data type and specified dimensions, column names, indices, and other `Transform`-specific parameters.
-* The `transform`(@ref transform-interface) method should be overloaded to define feature engineering pipelines that include [`Transform`](@ref about-transforms)s.
+* The [`apply`](@ref), [`apply!`](@ref) and [`apply_append`](@ref) methods transform data according to the given [`Transform`](@ref about-transforms), in a manner determined by the data type and specified dimensions, column names, indices, and other `Transform`-specific parameters.
+* The [`transform`](@ref transform-interface) method should be overloaded to define feature engineering pipelines that include [`Transform`](@ref about-transforms)s.
 
 ## Getting Started
 

diff --git a/docs/src/transforms.md b/docs/src/transforms.md
@@ -53,7 +53,7 @@ julia> p(x)
  9.0
 ```
 
-Alternatively, the data can be mutated using the `apply!` method.
+Secondly, the data can be mutated using the `apply!` method.
 
 !!! note
 
@@ -73,6 +73,18 @@ julia> x
  9.0
 ```
 
+Finally, the result can be appended to the input using the `apply_append` method.
+
+```jldoctest transforms
+julia> x = [1.0, 2.0, 3.0];
+
+julia> FeatureTransforms.apply_append(x, p, append_dim=2)
+3×2 Matrix{Float64}:
+ 1.0  1.0
+ 2.0  4.0
+ 3.0  9.0
+```
+
 A single `Transform` instance can be applied to different data types, with support for `AbstractArray`s and [`Table`s](https://github.com/JuliaData/Tables.jl).
 
 !!! note

diff --git a/src/FeatureTransforms.jl b/src/FeatureTransforms.jl
@@ -1,6 +1,7 @@
 module FeatureTransforms
 
 using Dates: TimeType, Period, Day, hour
+using NamedDims: dim
 using Statistics: mean, std
 using Tables
 

diff --git a/src/apply.jl b/src/apply.jl
@@ -101,3 +101,27 @@ function apply!(table::T, t::Transform; cols=_get_cols(table), kwargs...)::T whe
 
     return table
 end
+
+"""
+    apply_append(A::AbstractArray, ::Transform; append_dim, kwargs...)
+
+Apply the [`Transform`](@ref) to `A` and return a new array in which the transformed output
+is concatenated onto `A` along the `append_dim` dimension. All remaining `kwargs` are
+forwarded to the `apply` call for the given [`Transform`](@ref).
+"""
+function apply_append(A::AbstractArray, t; append_dim, kwargs...)::AbstractArray
+    transformed = apply(A, t; kwargs...)
+    return cat(A, transformed; dims=append_dim)
+end
+
+"""
+    apply_append(table, ::Transform; [header], kwargs...)
+
+Apply the [`Transform`](@ref) to `table` and return a new table that combines the columns
+of `table` with the transformed columns. The optional `header` names the new columns; if it
+is omitted the default in `Tables.table` is used. All `kwargs` are forwarded to the `apply`
+call for the given [`Transform`](@ref).
+"""
+function apply_append(table, t; kwargs...)
+    # Sink function that rebuilds a table of the same kind as the input.
+    materialize = Tables.materializer(table)
+    appended = Tables.columntable(apply(table, t; kwargs...))
+    # NOTE(review): `merge` presumably lets an appended column replace an existing column
+    # with the same name — confirm that this is the intended behaviour.
+    combined = merge(Tables.columntable(table), appended)
+    return materialize(combined)
+end
diff --git a/src/linear_combination.jl b/src/linear_combination.jl
@@ -28,7 +28,6 @@ function apply(
     return _sum_terms(eachslice(selectdim(A, dims, inds); dims=dims), LC.coefficients)
 end
 
-
 """
     apply(table, LC::LinearCombination; [cols], [header]) -> Table
 
@@ -50,6 +49,15 @@ function apply(table, LC::LinearCombination; cols=_get_cols(table), header=nothi
     return Tables.materializer(table)(_to_table(result, header))
 end
 
+# `apply_append` specialised for `LinearCombination` on real-valued arrays: concatenates the
+# combination onto `A` along `append_dim`, keeping the number of dimensions `N` of `A`.
+function apply_append(
+    A::AbstractArray{<:Real, N}, LC::LinearCombination; append_dim, kwargs...
+)::AbstractArray{<:Real, N} where N
+    # `apply` returns the combination without the dimension it reduced over, so reshape it
+    # to the size of `A` with a singleton along `append_dim` before concatenating.
+    # NOTE(review): this presumes the `dims` passed through `kwargs` refers to the same
+    # dimension as `append_dim` — confirm with callers.
+    shape = collect(size(A))
+    shape[dim(A, append_dim)] = 1  # `dim` maps `append_dim` (name or number) to an index
+    reduced = reshape(apply(A, LC; kwargs...), shape...)
+    return cat(A, reduced; dims=append_dim)
+end
+
 function _sum_terms(terms, coeffs)
     # Need this check because map will work even if there are more/less terms than coeffs
     if length(terms) != length(coeffs)

diff --git a/test/linear_combination.jl b/test/linear_combination.jl
@@ -37,6 +37,12 @@
             @test FeatureTransforms.apply(x, lc) == fill(-.1)
             @test lc(x) == fill(-.1)
         end
+
+        @testset "apply_append" begin
+            x = [1, 2]
+            lc = LinearCombination([1, -1])
+            @test FeatureTransforms.apply_append(x, lc; append_dim=1) == [1, 2, -1]
+        end
     end
 
     @testset "Matrix" begin
@@ -81,6 +87,17 @@
             @test FeatureTransforms.apply(M, lc; inds=[2, 3]) == [3, -2]
             @test lc(M; inds=[2, 3]) == [3, -2]
         end
+
+        @testset "apply_append" begin
+            M = [1 1 1; 2 2 2; 3 3 3]
+            lc = LinearCombination([1, 1, 1])
+
+            expected1 = [1 1 1; 2 2 2; 3 3 3; 6 6 6]
+            @test FeatureTransforms.apply_append(M, lc; dims=1, append_dim=1) == expected1
+
+            expected2 = [1 1 1 3; 2 2 2 6; 3 3 3 9]
+            @test FeatureTransforms.apply_append(M, lc; dims=2, append_dim=2) == expected2
+        end
     end
 
     @testset "N-dim Array" begin
@@ -125,6 +142,16 @@
             @test FeatureTransforms.apply(A, lc; inds=[1, 2]) == [-3, -3, -2]
             @test lc(A; inds=[1, 2]) == [-3, -3, -2]
         end
+
+        @testset "apply_append" begin
+            A = AxisArray([1 2; 4 5], foo=["a", "b"], bar=["x", "y"])
+
+            expected1 = [1 2; 4 5; -3 -3]
+            @test FeatureTransforms.apply_append(A, lc; dims=1, append_dim=1) == expected1
+
+            expected2 = [1 2 -1; 4 5 -1]
+            @test FeatureTransforms.apply_append(A, lc; dims=2, append_dim=2) == expected2
+        end
     end
 
     @testset "AxisKey" begin
@@ -164,6 +191,16 @@
             @test FeatureTransforms.apply(A, lc; inds=[1, 2]) == [-3, -3, -2]
             @test lc(A; inds=[1, 2]) == [-3, -3, -2]
         end
+
+        @testset "apply_append" begin
+            A = KeyedArray([1 2; 4 5], foo=["a", "b"], bar=["x", "y"])
+
+            expected1 = [1 2; 4 5; -3 -3]
+            @test FeatureTransforms.apply_append(A, lc; dims=:foo, append_dim=:foo) == expected1
+
+            expected2 = [1 2 -1; 4 5 -1]
+            @test FeatureTransforms.apply_append(A, lc; dims=:bar, append_dim=:bar) == expected2
+        end
     end
 
     @testset "NamedTuple" begin
@@ -205,6 +242,13 @@
             @test FeatureTransforms.apply(nt, lc_single; cols=[:a]) == expected
             @test lc_single(nt; cols=:a) == expected
         end
+
+        @testset "apply_append" begin
+            nt = (a = [1, 2, 3], b = [4, 5, 6])
+            lc = LinearCombination([1, -1])
+            expected = (a = [1, 2, 3], b = [4, 5, 6], Column1 = [-3, -3, -3])
+            @test FeatureTransforms.apply_append(nt, lc) == expected
+        end
     end
 
     @testset "DataFrame" begin
@@ -247,5 +291,12 @@
             @test FeatureTransforms.apply(df, lc_single; cols=[:a]) == expected
             @test lc_single(df; cols=:a) == expected
         end
+
+        @testset "apply_append" begin
+            df = DataFrame(:a => [1, 2, 3], :b => [4, 5, 6])
+            lc = LinearCombination([1, -1])
+            expected = DataFrame(:a => [1, 2, 3], :b => [4, 5, 6], :Column1 => [-3, -3, -3])
+            @test FeatureTransforms.apply_append(df, lc) == expected
+        end
     end
 end
diff --git a/test/one_hot_encoding.jl b/test/one_hot_encoding.jl
@@ -50,6 +50,12 @@
 
             @test_throws BoundsError FeatureTransforms.apply(x, ohe; dims=2)
         end
+
+        @testset "apply_append" begin
+            x = ["foo", "baz", "bar", "baz"]
+            expected = ["foo" 1 0 0; "baz" 0 0 1; "bar" 0 1 0; "baz" 0 0 1]
+            @test FeatureTransforms.apply_append(x, ohe; append_dim=2) == expected
+        end
     end
 
 
@@ -70,6 +76,12 @@
             @test FeatureTransforms.apply(M, ohe; inds=[2, 3]) == [0 0 0 1 0; 0 1 0 0 0]
             @test FeatureTransforms.apply(M, ohe; dims=:, inds=[2, 3]) == [0 0 0 1 0; 0 1 0 0 0]
         end
+
+        @testset "apply_append" begin
+            M = ["foo" "bar"; "foo2" "bar2"]
+            @test_throws DimensionMismatch FeatureTransforms.apply_append(M, ohe; append_dim=1)
+            @test_throws DimensionMismatch FeatureTransforms.apply_append(M, ohe; append_dim=2)
+        end
     end
 
     @testset "AxisArray" begin
@@ -91,6 +103,13 @@
             @test FeatureTransforms.apply(A, ohe; inds=[2, 3]) == [0 0 0 1 0; 0 1 0 0 0]
             @test FeatureTransforms.apply(A, ohe; dims=:, inds=[2, 3]) == [0 0 0 1 0; 0 1 0 0 0]
         end
+
+        @testset "apply_append" begin
+            M = ["foo" "bar"; "foo2" "bar2"]
+            A = AxisArray(M, foo=["a", "b"], bar=["x", "y"])
+            @test_throws DimensionMismatch FeatureTransforms.apply_append(A, ohe; append_dim=1)
+            @test_throws DimensionMismatch FeatureTransforms.apply_append(A, ohe; append_dim=2)
+        end
     end
 
     @testset "AxisKey" begin
@@ -112,12 +131,19 @@
             @test FeatureTransforms.apply(A, ohe; inds=[2, 3]) == [0 0 0 1 0; 0 1 0 0 0]
             @test FeatureTransforms.apply(A, ohe; dims=:, inds=[2, 3]) == [0 0 0 1 0; 0 1 0 0 0]
         end
+
+        @testset "apply_append" begin
+            M = ["foo" "bar"; "foo2" "bar2"]
+            A = KeyedArray(M, foo=["a", "b"], bar=["x", "y"])
+            @test_throws DimensionMismatch FeatureTransforms.apply_append(A, ohe; append_dim=:foo)
+            @test_throws DimensionMismatch FeatureTransforms.apply_append(A, ohe; append_dim=:bar)
+        end
     end
 
     @testset "NamedTuple" begin
         categories = ["foo", "bar", "baz", "foo2", "bar2"]
         ohe = OneHotEncoding(categories)
-        nt = (a = ["foo" "bar"], b = ["foo2" "bar2"])
+        nt = (a = ["foo", "bar"], b = ["foo2", "bar2"])
 
         @testset "all cols" begin
             expected = NamedTuple{Tuple(Symbol.(:Column, x) for x in 1:10)}(
@@ -135,6 +161,11 @@
             @test FeatureTransforms.apply(nt, ohe; cols=:a) == expected
             @test ohe(nt; cols=:a) == expected
         end
+
+        @testset "apply_append" begin
+            nt = (a = ["foo", "bar"], b = ["foo2", "bar2"])
+            @test FeatureTransforms.apply_append(nt, ohe) == merge(nt, ohe(nt))
+        end
     end
 
     @testset "DataFrame" begin
@@ -153,9 +184,13 @@
         @test FeatureTransforms.apply(df, ohe; cols=:a) == expected[:, 1:5]
 
         expected = DataFrame(
-            [[0, 0], [0, 0], [0, 0], [1, 0], [0, 1]],
+            [[false, false], [false, false], [false, false], [true, false], [false, true]],
             [Symbol.(:Column, x) for x in 1:5],
         )
         @test FeatureTransforms.apply(df, ohe; cols=[:b]) == expected
+
+        @testset "apply_append" begin
+            @test FeatureTransforms.apply_append(df, ohe) == hcat(df, ohe(df))
+        end
     end
 end