Refactor apply methods to use Traits (#80)

invenia · Apr 16, 2021 · e95ea9c · e95ea9c
1 parent f66d6a4
commit e95ea9c
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 83 deletions.
diff --git a/src/apply.jl b/src/apply.jl
@@ -18,7 +18,6 @@ If [`Transform`](@ref) does not support mutation, this method will error.
 """
 function apply! end
 
-
 """
     apply(A::AbstractArray, ::Transform; dims=:, inds=:, kwargs...)
 
@@ -33,21 +32,30 @@ Provide the `inds` keyword to apply the [`Transform`](@ref) to certain indices a
 
 Note: if `dims === :` (all dimensions), then `inds` will be the global indices of the array,
 instead of being relative to a certain dimension.
-
-This method does not guarantee the data type of what is returned. It will try to conserve
-type but the returned type depends on what the original `A` was, and the `dims` and `inds`
-specified.
 """
 function apply(A::AbstractArray, t::Transform; dims=:, inds=:, kwargs...)
+
+    # TODO: remove this https://github.com/invenia/FeatureTransforms.jl/issues/82
+    if t isa LinearCombination && dims === Colon()
+        Base.depwarn(
+           "The default `dims=1` for `LinearCombination` is deprecated and will be removed " *
+           "in a future release. Please set `dims` explicitly.",
+           :apply,
+        )
+        dims=1
+    end
+
+    c = cardinality(t)
     if dims === Colon()
         if inds === Colon()
-            return _apply(A, t; kwargs...)
+            return _apply(_preformat(c, A, :), t; dims=:, kwargs...)
         else
-            return @views _apply(A[:][inds], t; kwargs...)
+            return _apply(_preformat(c, A[:][inds], :), t; dims=:, kwargs...)
         end
     end
 
-    return _apply(selectdim(A, dims, inds), t; kwargs...)
+    input = _preformat(c, selectdim(A, dims, inds), dims)
+    return _apply(input, t; dims=dims, kwargs...)
 end
 
 """
@@ -72,12 +80,12 @@ Optionally provide a `header` for the output table. If none is provided the defa
 function apply(table, t::Transform; cols=_get_cols(table), header=nothing, kwargs...)
     Tables.istable(table) || throw(MethodError(apply, (table, t)))
 
-    # Extract a columns iterator that we should be able to use to mutate the data.
-    # NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data
-    coltable = Tables.columntable(table)
-    cols = _to_vec(cols)
+    indices = Tables.columnindex.(Ref(table), _to_vec(cols))
+    components = Tables.matrix(table)[:, indices]
 
-    result = reduce(hcat, [_apply(getproperty(coltable, col), t; kwargs...) for col in cols])
+    # Passing dims=2 only matters for ManyToOne transforms - otherwise it has no effect.
+    input = _preformat(cardinality(t), components, 2)
+    result = hcat(_apply(input, t; dims=2, kwargs...))
     return Tables.materializer(table)(_to_table(result, header))
 end
 
@@ -93,9 +101,7 @@ function apply!(table::T, t::Transform; cols=_get_cols(table), kwargs...)::T whe
     # Extract a columns iterator that we should be able to use to mutate the data.
     # NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data
     coltable = Tables.columntable(table)
-    cols = _to_vec(cols)  # handle single column name
-
-    for cname in cols
+    for cname in _to_vec(cols)
         apply!(getproperty(coltable, cname), t; kwargs...)
     end
 
@@ -110,7 +116,9 @@ is appended to `A` along the `append_dim` dimension. The remaining `kwargs` corr
 the usual [`Transform`](@ref) being invoked.
 """
 function apply_append(A::AbstractArray, t; append_dim, kwargs...)::AbstractArray
-    return cat(A, apply(A, t; kwargs...); dims=append_dim)
+    result = apply(A, t; kwargs...)
+    result = _postformat(cardinality(t), result, A, append_dim)
+    return cat(A, result; dims=append_dim)
 end
 
 """
@@ -125,3 +133,25 @@ function apply_append(table, t; kwargs...)
     result = Tables.columntable(apply(table, t; kwargs...))
     return T(merge(Tables.columntable(table), result))
 end
+
+# These methods format data according to the cardinality of the Transform.
+# Most Transforms don't require any formatting, only those that are ManyToOne do.
+# Note: we don't yet have a ManyToMany transform, so those might need separate treatment.
+
+# _preformat formats the data before calling _apply. Needed for all apply methods.
+# Before applying a ManyToOne Transform we must first slice up the data along the dimension
+# we are reducing over.
+_preformat(::Cardinality, A, d) = A
+_preformat(::ManyToOne, A, d) = eachslice(A; dims=d)
+
+# _postformat formats the data after calling _apply so it will work with apply_append.
+# Basically, when we call LinearCombination it always returns a column vector. But if we
+# want to append the result as a row we have to reshape to get it to fit.
+# In general, after applying a ManyToOne Transform, we have to reshape the reduced dimension
+# to 1 if we want to cat the result.
+_postformat(::Cardinality, result, A, append_dim) = result
+function _postformat(::ManyToOne, result, A, append_dim)
+    new_size = collect(size(A))
+    setindex!(new_size, 1, dim(A, append_dim))
+    return reshape(result, new_size...)
+end
diff --git a/src/linear_combination.jl b/src/linear_combination.jl
@@ -1,72 +1,32 @@
 """
     LinearCombination(coefficients) <: Transform
 
-Calculate the linear combination using the vector coefficients passed in.
-"""
-struct LinearCombination <: Transform
-    coefficients::Vector{Real}
-end
-
-cardinality(::LinearCombination) = ManyToOne()
+Calculates the linear combination of a collection of terms weighted by some `coefficients`.
 
-"""
-    apply(
-        ::AbstractArray{<:Real, N}, ::LinearCombination; dims=1, inds=:
-    ) -> AbstractArray{<:Real, N-1}
-
-Applies the [`LinearCombination`](@ref) to each of the specified indices in the N-dimensional
-array `A`, reducing along the `dim` provided. The result is an (N-1)-dimensional array.
-
-The default behaviour reduces along the column dimension.
+When applied to an N-dimensional array, `LinearCombination` reduces along the `dim` provided
+and returns an (N-1)-dimensional array.
 
 If no `inds` are specified, then the transform is applied to all elements.
-"""
-function apply(
-    A::AbstractArray{<:Real, N}, LC::LinearCombination; dims=1, inds=:
-)::AbstractArray{<:Real, N-1} where N
-
-    dims === Colon() && throw(ArgumentError("dims=: not supported, choose dims ∈ [1, $N]"))
 
-    return _sum_terms(eachslice(selectdim(A, dims, inds); dims=dims), LC.coefficients)
-end
-
-"""
-    apply(table, LC::LinearCombination; [cols], [header]) -> Table
-
-Applies the [`LinearCombination`](@ref) across the specified cols in `table`. If no `cols`
-are specified, then the [`LinearCombination`](@ref) is applied to all columns.
-
-Optionally provide a `header` for the output table. If none is provided the default in
-`Tables.table` is used.
+!!!note
+    The current default is that `dims=1` but this behaviour will be deprecated in a future
+    release and the `dims` keyword argument will have to be specified explicitly.
+    https://github.com/invenia/FeatureTransforms.jl/issues/82
 """
-function apply(table, LC::LinearCombination; cols=_get_cols(table), header=nothing, kwargs...)
-    Tables.istable(table) || throw(MethodError(apply, (table, LC)))
-
-    # Extract a columns iterator that we should be able to use to mutate the data.
-    # NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data
-    coltable = Tables.columntable(table)
-    cols = _to_vec(cols)
-
-    result = hcat(_sum_terms([getproperty(coltable, col) for col in cols], LC.coefficients))
-    return Tables.materializer(table)(_to_table(result, header))
+struct LinearCombination <: Transform
+    coefficients::Vector{Real}
 end
 
-function apply_append(
-    A::AbstractArray{<:Real, N}, LC::LinearCombination; append_dim, kwargs...
-)::AbstractArray{<:Real, N} where N
-    # A was reduced along the append_dim so we must reshape the result setting that dim to 1
-    new_size = collect(size(A))
-    setindex!(new_size, 1, dim(A, append_dim))
-    return cat(A, reshape(apply(A, LC; kwargs...), new_size...); dims=append_dim)
-end
+cardinality(::LinearCombination) = ManyToOne()
 
-function _sum_terms(terms, coeffs)
+function _apply(terms, LC::LinearCombination; kwargs...)
     # Need this check because map will work even if there are more/less terms than coeffs
-    if length(terms) != length(coeffs)
+    if length(terms) != length(LC.coefficients)
         throw(DimensionMismatch(
             "Number of terms $(length(terms)) does not match "*
-            "number of coefficients $(length(coeffs))."
+            "number of coefficients $(length(LC.coefficients))."
         ))
     end
-   return sum(map(*, terms, coeffs))
+
+   return sum(map(*, terms, LC.coefficients))
 end
diff --git a/src/utils.jl b/src/utils.jl
@@ -1,5 +1,5 @@
 _to_vec(x::AbstractArray) = x
-_to_vec(x::Tuple) = x
+_to_vec(x::Tuple) = collect(x)
 _to_vec(x::Nothing) = x
 _to_vec(x) = [x]
 

diff --git a/test/linear_combination.jl b/test/linear_combination.jl
@@ -60,7 +60,7 @@
             @testset "dims = :" begin
                 M = [1 1; 2 2; 3 5]
                 lc = LinearCombination([1, -1, 1])
-                @test_throws ArgumentError FeatureTransforms.apply(M, lc; dims=:)
+                @test_deprecated FeatureTransforms.apply(M, lc; dims=:)
             end
 
             @testset "dims = 1" begin
@@ -120,7 +120,7 @@
 
         @testset "dims" begin
             @testset "dims = :" begin
-                @test_throws ArgumentError FeatureTransforms.apply(A, lc; dims=:)
+                @test_deprecated FeatureTransforms.apply(A, lc; dims=:)
             end
 
             @testset "dims = 1" begin
@@ -167,7 +167,7 @@
 
         @testset "dims" begin
             @testset "dims = :" begin
-                @test_throws ArgumentError FeatureTransforms.apply(A, lc; dims=:)
+                @test_deprecated FeatureTransforms.apply(A, lc; dims=:)
             end
 
             @testset "dims = 1" begin

diff --git a/test/one_hot_encoding.jl b/test/one_hot_encoding.jl
@@ -147,8 +147,8 @@
         nt = (a = ["foo", "bar"], b = ["foo2", "bar2"])
 
         @testset "all cols" begin
-            expected = NamedTuple{Tuple(Symbol.(:Column, x) for x in 1:10)}(
-               ([1, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 1])
+            expected = NamedTuple{Tuple(Symbol.(:Column, x) for x in 1:5)}(
+               eachcol(Bool[1 0 0 0 0; 0 1 0 0 0; 0 0 0 1 0; 0 0 0 0 1])
             )
             @test FeatureTransforms.apply(nt, ohe) == expected
             @test ohe(nt) == expected
@@ -175,14 +175,14 @@
 
         df = DataFrame(:a => ["foo", "bar"], :b => ["foo2", "bar2"])
         expected = DataFrame(
-            [[1, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 1]],
-            [Symbol.(:Column, x) for x in 1:10],
+            Bool[1 0 0 0 0; 0 1 0 0 0; 0 0 0 1 0; 0 0 0 0 1],
+            [Symbol.(:Column, x) for x in 1:5],
         )
 
         @test FeatureTransforms.apply(df, ohe) == expected
 
-        @test FeatureTransforms.apply(df, ohe; cols=[:a]) == expected[:, 1:5]
-        @test FeatureTransforms.apply(df, ohe; cols=:a) == expected[:, 1:5]
+        @test FeatureTransforms.apply(df, ohe; cols=[:a]) == expected[1:2, :]
+        @test FeatureTransforms.apply(df, ohe; cols=:a) == expected[1:2, :]
 
         expected = DataFrame(
             [[false, false], [false, false], [false, false], [true, false], [false, true]],
@@ -191,7 +191,7 @@
         @test FeatureTransforms.apply(df, ohe; cols=[:b]) == expected
 
         @testset "apply_append" begin
-            @test FeatureTransforms.apply_append(df, ohe) == hcat(df, ohe(df))
+            @test_throws DimensionMismatch FeatureTransforms.apply_append(df, ohe)
         end
     end
 end