Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up apply! code #25

Merged
merged 7 commits into from
Feb 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Transforms"
uuid = "8fd68953-04b8-4117-ac19-158bf6de9782"
authors = ["Invenia Technical Computing Corporation"]
version = "0.1.0"
version = "0.1.1"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Expand Down
2 changes: 1 addition & 1 deletion src/one_hot_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ function _apply(x, encoding::OneHotEncoding; kwargs...)

results = zeros(Int, length(x), n_categories)

for (i, value) in enumerate(x)
@views for (i, value) in enumerate(x)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why @views at this level? Is it because it should be done whenever iterating the data, and this is the only internal _apply that does so?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not necessarily every time, but since we are indexing into `results` and assigning new values, I figured using `@views` might help here. I could be wrong though.

col_pos = encoding.categories[value]
results[i, col_pos] = 1
end
Expand Down
5 changes: 0 additions & 5 deletions src/periodic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,6 @@ function _apply(x, P::Periodic{T}; kwargs...) where T <: Period
map(xi -> _periodic(P.f, xi, P.period, P.phase_shift), x)
end

function _apply!(x::AbstractArray{T}, P::Periodic; kwargs...) where T <: Real
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this is informative benchmarking, but I was curious about the effect.

This branch:

julia> using BenchmarkTools, Transforms

julia> x = collect(1.:1_000_000.);

julia> p = Periodic(sin, 7);

julia> @benchmark Transforms.apply!(x, p)
BenchmarkTools.Trial: 
  memory estimate:  22.89 MiB
  allocs estimate:  9
  --------------
  minimum time:     6.602 ms (0.00% GC)
  median time:      10.553 ms (19.06% GC)
  mean time:        10.830 ms (33.86% GC)
  maximum time:     20.315 ms (63.28% GC)
  --------------
  samples:          462
  evals/sample:     1

main branch:

julia> @benchmark Transforms.apply!(x, p)
BenchmarkTools.Trial: 
  memory estimate:  22.89 MiB
  allocs estimate:  7
  --------------
  minimum time:     6.271 ms (0.00% GC)
  median time:      10.960 ms (17.62% GC)
  mean time:        10.793 ms (35.26% GC)
  maximum time:     21.596 ms (59.10% GC)
  --------------
  samples:          464
  evals/sample:     1

Why are there more allocations? What are approaches to reducing allocations? I'm not familiar with it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are there more allocations? What are approaches to reducing allocations? I'm not familiar with it.

I'm not sure either; I'll have to look into it.

One thing to note, however, is that when using BenchmarkTools I think you have to interpolate the function args with `$` (e.g. `@benchmark Transforms.apply!($x, $p)`), as leaving them as non-interpolated globals can affect the result: https://github.com/JuliaCI/BenchmarkTools.jl#quick-start (note: this doesn't apply to `@time`).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to note that with `@views` the allocation estimate goes back to 7:

julia> @benchmark Transforms.apply!($x, $p)
BenchmarkTools.Trial: 
  memory estimate:  22.89 MiB
  allocs estimate:  7
  --------------
  minimum time:     5.029 ms (0.00% GC)
  median time:      6.940 ms (16.92% GC)
  mean time:        6.427 ms (19.47% GC)
  maximum time:     9.906 ms (32.37% GC)
  --------------
  samples:          778
  evals/sample:     1

x[:] = _apply(x, P; kwargs...)
return x
end

"""
_periodic(f, instant, period, phase_shift=Day(0))

Expand Down
5 changes: 0 additions & 5 deletions src/power.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,3 @@ end
function _apply(x::AbstractArray{T}, P::Power; kwargs...) where T <: Real
return x .^ P.exponent
end

function _apply!(x::AbstractArray{T}, P::Power; kwargs...) where T <: Real
x[:] = _apply(x, P; kwargs...)
return x
end
7 changes: 3 additions & 4 deletions src/scaling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,21 +74,20 @@ function compute_stats(table; cols=nothing)
return (; μ_pairs...), (; σ_pairs...)
end

function _apply!(
function _apply(
A::AbstractArray, scaling::MeanStdScaling;
name=nothing, inverse=false, eps=1e-3, kwargs...
)
name = name === nothing ? :all : name
μ = scaling.mean[name]
σ = scaling.std[name]
if inverse
A[:] = μ .+ σ .* A
return μ .+ σ .* A
else
# Avoid division by 0
# If std is 0 then data was uniform, so the scaled value would end up ≈ 0
# Therefore the particular `eps` value should not matter much.
σ_safe = σ == 0 ? eps : σ
A[:] = (A .- μ) ./ σ_safe
return (A .- μ) ./ σ_safe
end
return A
end
1 change: 0 additions & 1 deletion src/temporal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,4 @@ Get the hour of day corresponding to the data.
"""
struct HoD <: Transform end


_apply(x, ::HoD; kwargs...) = hour.(x)
50 changes: 19 additions & 31 deletions src/transformers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ function apply end
"""
apply!(data::T, ::Transform; kwargs...) -> T

Applies the [`Transform`](@ref) mutating the input `data`. New transforms should usually
only extend `_apply!` which this method delegates to.
Applies the [`Transform`](@ref) mutating the input `data`. This method delegates to
[`apply`](@ref) under the hood so does not need to be defined separately.

Where necessary, this should be extended for new data types `T`.
If [`Transform`](@ref) does not support mutation, this method will error.
"""
function apply! end

Expand All @@ -52,7 +52,11 @@ function apply! end
apply(A::AbstractArray, ::Transform; dims=:, inds=:, kwargs...)

Applies the [`Transform`](@ref) to the elements of `A`.

Provide the `dims` keyword to apply the [`Transform`](@ref) along a certain dimension.
For example, given a `Matrix`, `dims=1` applies to each column, while `dims=2` applies
to each row.

Provide the `inds` keyword to apply the [`Transform`](@ref) to certain indices along the
`dims` specified.

Expand All @@ -68,17 +72,27 @@ function apply(A::AbstractArray, t::Transform; dims=:, inds=:, kwargs...)
if inds === Colon()
return _apply(A, t; kwargs...)
else
return _apply(A[:][inds], t; kwargs...)
return @views _apply(A[:][inds], t; kwargs...)
end
end

slice_index = 0
return mapslices(A, dims=dims) do x
return @views mapslices(A, dims=dims) do x
slice_index += 1
_apply(x[inds], t; name=Symbol(slice_index), kwargs...)
end
end

"""
apply!(A::AbstractArray, ::Transform; dims=:, kwargs...)

Applies the [`Transform`](@ref) to each element of `A`, mutating the data.
"""
function apply!(A::AbstractArray, t::Transform; kwargs...)
A[:] = apply(A, t; kwargs...)
return A
end

"""
apply(table, ::Transform; cols=nothing, kwargs...) -> Vector

Expand All @@ -102,32 +116,6 @@ function apply(table, t::Transform; cols=nothing, kwargs...)
]
end

_apply(x, t::Transform; kwargs...) = _apply!(_try_copy(x), t; kwargs...)


"""
apply!(A::AbstractArray, ::Transform; dims=:, kwargs...)

Applies the [`Transform`](@ref) to each element of `A`.
Optionally specify the `dims` to apply the [`Transform`](@ref) along certain dimensions.
For example in a `Matrix`, `dims=1` applies to each column, while `dims=2` applies
to each row.

!!! note
For arrays with more than 2 dimensions, single `dims` are not supported.
"""
function apply!(A::AbstractArray, t::Transform; dims=:, kwargs...)
dims == Colon() && return _apply!(A, t; kwargs...)

_dims = invert_dims(A, dims) # opposite convention to iterating `eachslice`
# TODO support multiple _dims https://github.com/invenia/Transforms.jl/issues/21
for (slice_index, slice) in enumerate(eachslice(A; dims=_dims))
_apply!(slice, t; name=Symbol(slice_index), kwargs...)
end

return A
end

"""
apply!(table::T, ::Transform; cols=nothing)::T where T

Expand Down
11 changes: 0 additions & 11 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,3 @@ function _try_copy(data)
deepcopy(data)
end
end

function invert_dims(A::AbstractArray, dims)
ndims(A) == 1 && return dims
# TODO: support named dims https://github.com/invenia/Transforms.jl/issues/20
inverted_dims = setdiff(1:ndims(A), dims)
if length(inverted_dims) == 1
inverted_dims = inverted_dims[1]
end

return inverted_dims
end