Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

StandardScaling with separate constructor and fit methods to replace MeanStdScaling #107

Merged
merged 16 commits into from
May 17, 2022
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
name = "FeatureTransforms"
uuid = "8fd68953-04b8-4117-ac19-158bf6de9782"
authors = ["Invenia Technical Computing Corporation"]
version = "0.3.11"
version = "0.3.12"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Expand All @@ -16,6 +17,7 @@ AxisKeys = "0.1"
DataFrames = "0.22, 1"
Documenter = "0.26"
NamedDims = "0.2.32"
StatsBase = "0.33"
Tables = "1.3"
julia = "1.3"

Expand Down
44 changes: 42 additions & 2 deletions docs/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@ git-tree-sha1 = "f713d583d10fc036252fd826feebc6c173c522a8"
uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597"
version = "0.9.5"

[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "f9982ef575e19b0e5c7a98c6e75ee496c0f73a93"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.12.0"

[[ChangesOfVariables]]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.2"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "ac4132ad78082518ec2037ae5770b6e796f7f956"
Expand Down Expand Up @@ -83,10 +95,10 @@ deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"

[[FeatureTransforms]]
deps = ["Dates", "NamedDims", "Statistics", "Tables"]
deps = ["Dates", "InteractiveUtils", "NamedDims", "Statistics", "StatsBase", "Tables"]
path = ".."
uuid = "8fd68953-04b8-4117-ac19-158bf6de9782"
version = "0.3.3"
version = "0.3.12"

[[Formatting]]
deps = ["Printf"]
Expand All @@ -108,12 +120,23 @@ version = "0.1.1"
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[InverseFunctions]]
deps = ["Test"]
git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65"
uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
version = "0.1.2"

[[InvertedIndices]]
deps = ["Test"]
git-tree-sha1 = "15732c475062348b0165684ffe28e85ea8396afc"
uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
version = "1.0.0"

[[IrrationalConstants]]
git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.1"

[[IteratorInterfaceExtensions]]
git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
uuid = "82899510-4779-5014-852e-03e436cf321d"
Expand Down Expand Up @@ -148,6 +171,12 @@ uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[LogExpFunctions]]
deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.6"

[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

Expand Down Expand Up @@ -257,6 +286,17 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[[StatsAPI]]
git-tree-sha1 = "d88665adc9bcf45903013af0982e2fd05ae3d0a6"
uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
version = "1.2.0"

[[StatsBase]]
deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"]
git-tree-sha1 = "8977b17906b0a1cc74ab2e3a05faa16cf08a8291"
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
version = "0.33.16"

[[StructTypes]]
deps = ["Dates", "UUIDs"]
git-tree-sha1 = "5d8e3d60f17791c4c64baf69a2bc5e7023ee73aa"
Expand Down
1 change: 1 addition & 0 deletions docs/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FeatureTransforms = "8fd68953-04b8-4117-ac19-158bf6de9782"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
mzgubic marked this conversation as resolved.
Show resolved Hide resolved

[compat]
DataFrames = "0.22"
Expand Down
7 changes: 6 additions & 1 deletion docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ AbstractScaling
HoD
Power
Periodic
MeanStdScaling
StandardScaling
IdentityScaling
InverseHyperbolicSine
LinearCombination
Expand All @@ -35,3 +35,8 @@ FeatureTransforms.is_transformable
FeatureTransforms.transform!
FeatureTransforms.transform
```

## Deprecated functionality
```@docs
MeanStdScaling
```
12 changes: 8 additions & 4 deletions docs/src/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ In the following example, we will imagine we are training a model to predict the
First we load some hourly weather data:

```jldoctest example
julia> using DataFrames, Dates, FeatureTransforms
julia> using DataFrames, Dates, FeatureTransforms, StatsBase

julia> df = DataFrame(
:time => DateTime(2018, 9, 10):Hour(1):DateTime(2018, 9, 10, 23),
Expand Down Expand Up @@ -77,13 +77,17 @@ julia> output_cols = [:temperature, :humidity];
```

For many models it is helpful to normalize the training data.
mzgubic marked this conversation as resolved.
Show resolved Hide resolved
We can use `MeanStdScaling` for that purpose.
We can use `StandardScaling` for that purpose.
Note that we are mutating the data frame in-place using `apply!` one column at a time.

```jldoctest example
julia> temp_scaling = MeanStdScaling(train_df; cols=:temperature);
julia> temp_scaling = StandardScaling();

julia> hum_scaling = MeanStdScaling(train_df; cols=:humidity);
julia> fit!(temp_scaling, train_df; cols=:temperature);

julia> hum_scaling = StandardScaling();

julia> fit!(hum_scaling, train_df; cols=:humidity);

julia> FeatureTransforms.apply!(train_df, temp_scaling; cols=:temperature);

Expand Down
27 changes: 18 additions & 9 deletions docs/src/transforms.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ DocTestSetup = quote
using DataFrames
using Dates
using FeatureTransforms
using StatsBase
end
```

Expand Down Expand Up @@ -90,8 +91,8 @@ A single `Transform` instance can be applied to different data types, with suppo
!!! note

Some `Transform` subtypes have restrictions on how they can be applied once constructed.
For instance, `MeanStdScaling` stores the mean and standard deviation of some data, potentially specified via some dimension and column names.
So `MeanStdScaling` should only be applied to the same data, and for the same dimension and subset of column names, as those used in construction.
For instance, `StandardScaling` stores the mean and standard deviation of some data, potentially specified via some dimension and column names.
So `StandardScaling` should only be applied to the same data, and for the same dimension and subset of column names, as those passed to `fit!`.

## Applying to `AbstractArray`

Expand Down Expand Up @@ -144,15 +145,19 @@ julia> M
1.0 5.0
3.0 6.0

julia> normalize_row = MeanStdScaling(M; dims=1, inds=[2])
MeanStdScaling(3.0, 2.8284271247461903)
julia> normalize_row = StandardScaling();

julia> fit!(normalize_row, M; dims=1, inds=[2])
StandardScaling(3.0, 2.8284271247461903, true)

julia> normalize_row(M; dims=1, inds=[2])
1×2 Matrix{Float64}:
-0.707107 0.707107

julia> normalize_col = MeanStdScaling(M; dims=2, inds=[2])
MeanStdScaling(5.0, 1.0)
julia> normalize_col = StandardScaling();

julia> fit!(normalize_col, M; dims=2, inds=[2])
StandardScaling(5.0, 1.0, true)

julia> normalize_col(M; dims=2, inds=[2])
3×1 Matrix{Float64}:
Expand All @@ -172,7 +177,9 @@ If no `header` is given, the default from [`Tables.table`](https://tables.juliad
```jldoctest transforms
julia> nt = (a = [2.0, 1.0, 3.0], b = [4.0, 5.0, 6.0]);

julia> scaling = MeanStdScaling(nt); # compute statistics using all data
julia> scaling = StandardScaling();

julia> fit!(scaling, nt); # compute statistics using all data

julia> FeatureTransforms.apply(nt, scaling; header=[:a_norm, :b_norm])
(a_norm = [-0.8017837257372732, -1.3363062095621219, -0.2672612419124244], b_norm = [0.2672612419124244, 0.8017837257372732, 1.3363062095621219])
Expand Down Expand Up @@ -219,12 +226,14 @@ julia> feature_df = hcat(hod_df, lc_df)
## Transform-specific keyword arguments

Some transforms have specific keyword arguments that can be passed to `apply`/`apply!`.
For example, `MeanStdScaling` can invert the original scaling using the `inverse` argument:
For example, `StandardScaling` can invert the original scaling using the `inverse` argument:

```jldoctest transforms
julia> nt = (a = [2.0, 1.0, 3.0], b = [4.0, 5.0, 6.0]);

julia> scaling = MeanStdScaling(nt);
julia> scaling = StandardScaling();

julia> fit!(scaling, nt);

julia> FeatureTransforms.apply!(nt, scaling);

Expand Down
6 changes: 5 additions & 1 deletion src/FeatureTransforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@ module FeatureTransforms
using Dates: TimeType, Period, Day, hour
using NamedDims: dim
using Statistics: mean, std
using StatsBase
using Tables

export Transform, transform, transform!
export HoD, LinearCombination, OneHotEncoding, Periodic, Power
export AbstractScaling, IdentityScaling, MeanStdScaling
export AbstractScaling, IdentityScaling, MeanStdScaling, StandardScaling
export LogTransform, InverseHyperbolicSine

include("utils.jl")
include("traits.jl")
include("transform.jl")
include("apply.jl")
include("fit.jl")

# Transform implementations
include("linear_combination.jl")
Expand All @@ -26,6 +28,8 @@ include("temporal.jl")

include("test_utils.jl")

include("deprecated.jl")

# TODO: remove in v0.4 https://github.com/invenia/FeatureTransforms.jl/issues/82
Base.@deprecate_binding is_transformable TestUtils.is_transformable
mzgubic marked this conversation as resolved.
Show resolved Hide resolved

Expand Down
73 changes: 73 additions & 0 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""
    MeanStdScaling(μ, σ) <: AbstractScaling

Linearly scale the data by the statistical mean `μ` and standard deviation `σ`.
This is also known as standardization, or the Z score transform.

!!! warning
    `MeanStdScaling` is deprecated and its constructors emit a deprecation warning.
    Use `StandardScaling` together with `fit!` instead.

# Keyword arguments to `apply`
* `inverse=true`: inverts the scaling (e.g. to reconstruct the unscaled data).
* `eps=1e-3`: lower bound applied to `σ` before scaling (if `inverse=false`), which
  avoids dividing by 0 when the data was uniform.
"""
struct MeanStdScaling <: AbstractScaling
    μ::Real
    σ::Real

    """
        MeanStdScaling(A::AbstractArray; dims=:, inds=:) -> MeanStdScaling
        MeanStdScaling(table; cols=...) -> MeanStdScaling

    Construct a [`MeanStdScaling`](@ref) transform from the statistics of the given data.
    By default _all the data_ is considered when computing the mean and standard deviation.
    This can be restricted to certain slices via the keyword arguments (see below).

    Since `MeanStdScaling` is a stateful transform, i.e. the parameters depend on the data
    it's given, you should define it independently before applying it so you can keep the
    information for later use. For instance, if you want to invert the transform or apply it
    to a test set.

    # `AbstractArray` keyword arguments
    * `dims=:`: the dimension along which to take the `inds` slices. Default uses all dims,
      in which case `inds` is ignored.
    * `inds=:`: the indices to use in computing the statistics. Default uses all indices.

    # `Table` keyword arguments
    * `cols`: the columns to use in computing the statistics. Default uses all columns.

    # Throws
    * `MethodError`: if the non-array argument does not satisfy `Tables.istable`.

    !!! note
        If you want the `MeanStdScaling` to transform your data consistently you should use
        the same `inds`, `dims`, or `cols` keywords when calling `apply`. Otherwise, `apply`
        might rescale the wrong data or throw an error.
    """
    function MeanStdScaling(A::AbstractArray; dims=:, inds=:)
        _depwarn()
        # With no dimension selected the statistics cover the whole array.
        dims == Colon() && return new(compute_stats(A)...)
        return new(compute_stats(selectdim(A, dims, inds))...)
    end

    function MeanStdScaling(table; cols=_get_cols(table))
        _depwarn()
        # `MethodError` takes the call arguments as a tuple; passing the bare table would
        # make the error report the wrong argument types.
        Tables.istable(table) || throw(MethodError(MeanStdScaling, (table,)))
        columntable = Tables.columns(table)
        # Pool the selected columns into one vector so a single μ/σ pair is computed.
        data = reduce(vcat, [getproperty(columntable, c) for c in _to_vec(cols)])
        return new(compute_stats(data)...)
    end
end

# Emit the deprecation warning shared by the `MeanStdScaling` constructors.
# Always returns `nothing`.
function _depwarn()
    Base.depwarn(
        "`MeanStdScaling(args...; kwargs...)` is deprecated. Use " *
        # The suggested snippet must use one variable name throughout so it can be copied.
        "`scaling = StandardScaling(); fit!(scaling, args...; kwargs...)` instead",
        :MeanStdScaling,
    )
    return nothing
end

# Return the `(mean, standard deviation)` pair of `x` as a 2-tuple.
function compute_stats(x)
    centre = mean(x)
    spread = std(x)
    return (centre, spread)
end

# Scale `A` with the stored statistics, or undo the scaling when `inverse=true`.
# `eps` is only used in the forward direction; extra keyword arguments are ignored.
function _apply(
    A::AbstractArray, scaling::MeanStdScaling; inverse=false, eps=1e-3, kwargs...
)
    μ, σ = scaling.μ, scaling.σ

    if inverse
        return μ .+ σ .* A
    end

    # Guard against dividing by 0: a zero std means the data was uniform, so the scaled
    # values end up ≈ 0 and the exact `eps` chosen should not matter much.
    σ_safe = max(σ, eps)
    return (A .- μ) ./ σ_safe
end
1 change: 1 addition & 0 deletions src/fit.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Fallback `fit!` for any `Transform`: there is nothing to fit here, so the positional
# and keyword arguments are ignored and the transform is handed back unchanged.
function StatsBase.fit!(t::Transform, args...; kwargs...)
    return t
end
mzgubic marked this conversation as resolved.
Show resolved Hide resolved
Loading