Skip to content

Commit

Permalink
Refactor apply methods to use Traits (#80)
Browse files Browse the repository at this point in the history
  • Loading branch information
Glenn Moynihan authored Apr 16, 2021
1 parent f66d6a4 commit e95ea9c
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 83 deletions.
64 changes: 47 additions & 17 deletions src/apply.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ If [`Transform`](@ref) does not support mutation, this method will error.
"""
function apply! end


"""
apply(A::AbstractArray, ::Transform; dims=:, inds=:, kwargs...)
Expand All @@ -33,21 +32,30 @@ Provide the `inds` keyword to apply the [`Transform`](@ref) to certain indices a
Note: if `dims === :` (all dimensions), then `inds` will be the global indices of the array,
instead of being relative to a certain dimension.
This method does not guarantee the data type of what is returned. It will try to conserve
type but the returned type depends on what the original `A` was, and the `dims` and `inds`
specified.
"""
function apply(A::AbstractArray, t::Transform; dims=:, inds=:, kwargs...)

# TODO: remove this https://github.com/invenia/FeatureTransforms.jl/issues/82
if t isa LinearCombination && dims === Colon()
Base.depwarn(
"The default `dims=1` for `LinearCombination` is deprecated and will be removed " *
"in a future release. Please set `dims` explicitly.",
:apply,
)
dims=1
end

c = cardinality(t)
if dims === Colon()
if inds === Colon()
return _apply(A, t; kwargs...)
return _apply(_preformat(c, A, :), t; dims=:, kwargs...)
else
return @views _apply(A[:][inds], t; kwargs...)
return _apply(_preformat(c, A[:][inds], :), t; dims=:, kwargs...)
end
end

return _apply(selectdim(A, dims, inds), t; kwargs...)
input = _preformat(c, selectdim(A, dims, inds), dims)
return _apply(input, t; dims=dims, kwargs...)
end

"""
Expand All @@ -72,12 +80,12 @@ Optionally provide a `header` for the output table. If none is provided the defa
function apply(table, t::Transform; cols=_get_cols(table), header=nothing, kwargs...)
Tables.istable(table) || throw(MethodError(apply, (table, t)))

# Extract a columns iterator that we should be able to use to mutate the data.
# NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data
coltable = Tables.columntable(table)
cols = _to_vec(cols)
indices = Tables.columnindex.(Ref(table), _to_vec(cols))
components = Tables.matrix(table)[:, indices]

result = reduce(hcat, [_apply(getproperty(coltable, col), t; kwargs...) for col in cols])
# Passing dims=2 only matters for ManyToOne transforms - otherwise it has no effect.
input = _preformat(cardinality(t), components, 2)
result = hcat(_apply(input, t; dims=2, kwargs...))
return Tables.materializer(table)(_to_table(result, header))
end

Expand All @@ -93,9 +101,7 @@ function apply!(table::T, t::Transform; cols=_get_cols(table), kwargs...)::T whe
# Extract a columns iterator that we should be able to use to mutate the data.
# NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data
coltable = Tables.columntable(table)
cols = _to_vec(cols) # handle single column name

for cname in cols
for cname in _to_vec(cols)
apply!(getproperty(coltable, cname), t; kwargs...)
end

Expand All @@ -110,7 +116,9 @@ is appended to `A` along the `append_dim` dimension. The remaining `kwargs` corr
the usual [`Transform`](@ref) being invoked.
"""
function apply_append(A::AbstractArray, t; append_dim, kwargs...)::AbstractArray
return cat(A, apply(A, t; kwargs...); dims=append_dim)
result = apply(A, t; kwargs...)
result = _postformat(cardinality(t), result, A, append_dim)
return cat(A, result; dims=append_dim)
end

"""
Expand All @@ -125,3 +133,25 @@ function apply_append(table, t; kwargs...)
result = Tables.columntable(apply(table, t; kwargs...))
return T(merge(Tables.columntable(table), result))
end

# These methods format data according to the cardinality of the Transform.
# Most Transforms don't require any formatting, only those that are ManyToOne do.
# Note: we don't yet have a ManyToMany transform, so those might need separate treatment.

# _preformat formats the data before calling _apply. Needed for all apply methods.
# Before applying a ManyToOne Transform we must first slice up the data along the dimension
# we are reducing over.
_preformat(::Cardinality, A, d) = A
_preformat(::ManyToOne, A, d) = eachslice(A; dims=d)

# _postformat formats the data after calling _apply so it will work with apply_append.
# Basically, when we call LinearCombination it always returns a column vector. But if we
# want to append the result as a row we have to reshape to get it to fit.
# In general, after applying a ManyToOne Transform, we have to reshape the reduced dimension
# to 1 if we want to cat the result.
_postformat(::Cardinality, result, A, append_dim) = result
function _postformat(::ManyToOne, result, A, append_dim)
new_size = collect(size(A))
setindex!(new_size, 1, dim(A, append_dim))
return reshape(result, new_size...)
end
70 changes: 15 additions & 55 deletions src/linear_combination.jl
Original file line number Diff line number Diff line change
@@ -1,72 +1,32 @@
"""
LinearCombination(coefficients) <: Transform
Calculate the linear combination using the vector coefficients passed in.
"""
struct LinearCombination <: Transform
coefficients::Vector{Real}
end

cardinality(::LinearCombination) = ManyToOne()
Calculates the linear combination of a collection of terms weighted by some `coefficients`.
"""
apply(
::AbstractArray{<:Real, N}, ::LinearCombination; dims=1, inds=:
) -> AbstractArray{<:Real, N-1}
Applies the [`LinearCombination`](@ref) to each of the specified indices in the N-dimensional
array `A`, reducing along the `dim` provided. The result is an (N-1)-dimensional array.
The default behaviour reduces along the column dimension.
When applied to an N-dimensional array, `LinearCombination` reduces along the `dim` provided
and returns an (N-1)-dimensional array.
If no `inds` are specified, then the transform is applied to all elements.
"""
function apply(
A::AbstractArray{<:Real, N}, LC::LinearCombination; dims=1, inds=:
)::AbstractArray{<:Real, N-1} where N

dims === Colon() && throw(ArgumentError("dims=: not supported, choose dims ∈ [1, $N]"))
return _sum_terms(eachslice(selectdim(A, dims, inds); dims=dims), LC.coefficients)
end

"""
apply(table, LC::LinearCombination; [cols], [header]) -> Table
Applies the [`LinearCombination`](@ref) across the specified cols in `table`. If no `cols`
are specified, then the [`LinearCombination`](@ref) is applied to all columns.
Optionally provide a `header` for the output table. If none is provided the default in
`Tables.table` is used.
!!!note
The current default is that `dims=1` but this behaviour will be deprecated in a future
release and the `dims` keyword argument will have to be specified explicitly.
https://github.com/invenia/FeatureTransforms.jl/issues/82
"""
function apply(table, LC::LinearCombination; cols=_get_cols(table), header=nothing, kwargs...)
Tables.istable(table) || throw(MethodError(apply, (table, LC)))

# Extract a columns iterator that we should be able to use to mutate the data.
# NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data
coltable = Tables.columntable(table)
cols = _to_vec(cols)

result = hcat(_sum_terms([getproperty(coltable, col) for col in cols], LC.coefficients))
return Tables.materializer(table)(_to_table(result, header))
struct LinearCombination <: Transform
coefficients::Vector{Real}
end

function apply_append(
A::AbstractArray{<:Real, N}, LC::LinearCombination; append_dim, kwargs...
)::AbstractArray{<:Real, N} where N
# A was reduced along the append_dim so we must reshape the result setting that dim to 1
new_size = collect(size(A))
setindex!(new_size, 1, dim(A, append_dim))
return cat(A, reshape(apply(A, LC; kwargs...), new_size...); dims=append_dim)
end
cardinality(::LinearCombination) = ManyToOne()

function _sum_terms(terms, coeffs)
function _apply(terms, LC::LinearCombination; kwargs...)
# Need this check because map will work even if there are more/less terms than coeffs
if length(terms) != length(coeffs)
if length(terms) != length(LC.coefficients)
throw(DimensionMismatch(
"Number of terms $(length(terms)) does not match "*
"number of coefficients $(length(coeffs))."
"number of coefficients $(length(LC.coefficients))."
))
end
return sum(map(*, terms, coeffs))

return sum(map(*, terms, LC.coefficients))
end
2 changes: 1 addition & 1 deletion src/utils.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
_to_vec(x::AbstractArray) = x
_to_vec(x::Tuple) = x
_to_vec(x::Tuple) = collect(x)
_to_vec(x::Nothing) = x
_to_vec(x) = [x]

Expand Down
6 changes: 3 additions & 3 deletions test/linear_combination.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
@testset "dims = :" begin
M = [1 1; 2 2; 3 5]
lc = LinearCombination([1, -1, 1])
@test_throws ArgumentError FeatureTransforms.apply(M, lc; dims=:)
@test_deprecated FeatureTransforms.apply(M, lc; dims=:)
end

@testset "dims = 1" begin
Expand Down Expand Up @@ -120,7 +120,7 @@

@testset "dims" begin
@testset "dims = :" begin
@test_throws ArgumentError FeatureTransforms.apply(A, lc; dims=:)
@test_deprecated FeatureTransforms.apply(A, lc; dims=:)
end

@testset "dims = 1" begin
Expand Down Expand Up @@ -167,7 +167,7 @@

@testset "dims" begin
@testset "dims = :" begin
@test_throws ArgumentError FeatureTransforms.apply(A, lc; dims=:)
@test_deprecated FeatureTransforms.apply(A, lc; dims=:)
end

@testset "dims = 1" begin
Expand Down
14 changes: 7 additions & 7 deletions test/one_hot_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,8 @@
nt = (a = ["foo", "bar"], b = ["foo2", "bar2"])

@testset "all cols" begin
expected = NamedTuple{Tuple(Symbol.(:Column, x) for x in 1:10)}(
([1, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 1])
expected = NamedTuple{Tuple(Symbol.(:Column, x) for x in 1:5)}(
eachcol(Bool[1 0 0 0 0; 0 1 0 0 0; 0 0 0 1 0; 0 0 0 0 1])
)
@test FeatureTransforms.apply(nt, ohe) == expected
@test ohe(nt) == expected
Expand All @@ -175,14 +175,14 @@

df = DataFrame(:a => ["foo", "bar"], :b => ["foo2", "bar2"])
expected = DataFrame(
[[1, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 1]],
[Symbol.(:Column, x) for x in 1:10],
Bool[1 0 0 0 0; 0 1 0 0 0; 0 0 0 1 0; 0 0 0 0 1],
[Symbol.(:Column, x) for x in 1:5],
)

@test FeatureTransforms.apply(df, ohe) == expected

@test FeatureTransforms.apply(df, ohe; cols=[:a]) == expected[:, 1:5]
@test FeatureTransforms.apply(df, ohe; cols=:a) == expected[:, 1:5]
@test FeatureTransforms.apply(df, ohe; cols=[:a]) == expected[1:2, :]
@test FeatureTransforms.apply(df, ohe; cols=:a) == expected[1:2, :]

expected = DataFrame(
[[false, false], [false, false], [false, false], [true, false], [false, true]],
Expand All @@ -191,7 +191,7 @@
@test FeatureTransforms.apply(df, ohe; cols=[:b]) == expected

@testset "apply_append" begin
@test FeatureTransforms.apply_append(df, ohe) == hcat(df, ohe(df))
@test_throws DimensionMismatch FeatureTransforms.apply_append(df, ohe)
end
end
end

0 comments on commit e95ea9c

Please sign in to comment.