invenia · mzgubic · May 17, 2022 · Feb 28, 2022 · Feb 28, 2022 · Feb 28, 2022
diff --git a/Project.toml b/Project.toml
@@ -1,13 +1,14 @@
 name = "FeatureTransforms"
 uuid = "8fd68953-04b8-4117-ac19-158bf6de9782"
 authors = ["Invenia Technical Computing Corporation"]
-version = "0.3.11"
+version = "0.3.12"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
@@ -16,6 +17,7 @@ AxisKeys = "0.1"
 DataFrames = "0.22, 1"
 Documenter = "0.26"
 NamedDims = "0.2.32"
+StatsBase = "0.33"
 Tables = "1.3"
 julia = "1.3"
 

diff --git a/docs/Manifest.toml b/docs/Manifest.toml
@@ -21,6 +21,18 @@ git-tree-sha1 = "f713d583d10fc036252fd826feebc6c173c522a8"
 uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 version = "0.9.5"
 
+[[ChainRulesCore]]
+deps = ["Compat", "LinearAlgebra", "SparseArrays"]
+git-tree-sha1 = "f9982ef575e19b0e5c7a98c6e75ee496c0f73a93"
+uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+version = "1.12.0"
+
+[[ChangesOfVariables]]
+deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
+git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
+uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
+version = "0.1.2"
+
 [[Compat]]
 deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
 git-tree-sha1 = "ac4132ad78082518ec2037ae5770b6e796f7f956"
@@ -83,10 +95,10 @@ deps = ["ArgTools", "LibCURL", "NetworkOptions"]
 uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 
 [[FeatureTransforms]]
-deps = ["Dates", "NamedDims", "Statistics", "Tables"]
+deps = ["Dates", "InteractiveUtils", "NamedDims", "Statistics", "StatsBase", "Tables"]
 path = ".."
 uuid = "8fd68953-04b8-4117-ac19-158bf6de9782"
-version = "0.3.3"
+version = "0.3.12"
 
 [[Formatting]]
 deps = ["Printf"]
@@ -108,12 +120,23 @@ version = "0.1.1"
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
+[[InverseFunctions]]
+deps = ["Test"]
+git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65"
+uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
+version = "0.1.2"
+
 [[InvertedIndices]]
 deps = ["Test"]
 git-tree-sha1 = "15732c475062348b0165684ffe28e85ea8396afc"
 uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
 version = "1.0.0"
 
+[[IrrationalConstants]]
+git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
+uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
+version = "0.1.1"
+
 [[IteratorInterfaceExtensions]]
 git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
 uuid = "82899510-4779-5014-852e-03e436cf321d"
@@ -148,6 +171,12 @@ uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 deps = ["Libdl"]
 uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
+[[LogExpFunctions]]
+deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
+git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1"
+uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
+version = "0.3.6"
+
 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
@@ -257,6 +286,17 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
+[[StatsAPI]]
+git-tree-sha1 = "d88665adc9bcf45903013af0982e2fd05ae3d0a6"
+uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
+version = "1.2.0"
+
+[[StatsBase]]
+deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"]
+git-tree-sha1 = "8977b17906b0a1cc74ab2e3a05faa16cf08a8291"
+uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+version = "0.33.16"
+
 [[StructTypes]]
 deps = ["Dates", "UUIDs"]
 git-tree-sha1 = "5d8e3d60f17791c4c64baf69a2bc5e7023ee73aa"

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -2,6 +2,7 @@
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 FeatureTransforms = "8fd68953-04b8-4117-ac19-158bf6de9782"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 DataFrames = "0.22"

diff --git a/docs/src/api.md b/docs/src/api.md
@@ -13,7 +13,7 @@ AbstractScaling
 HoD
 Power
 Periodic
-MeanStdScaling
+StandardScaling
 IdentityScaling
 InverseHyperbolicSine
 LinearCombination
@@ -35,3 +35,8 @@ FeatureTransforms.is_transformable
 FeatureTransforms.transform!
 FeatureTransforms.transform
 ```
+
+## Deprecated functionality
+```@docs
+MeanStdScaling
+```
diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -5,7 +5,7 @@ In the following example, we will imagine we are training a model to predict the
 First we load some hourly weather data:
 
 ```jldoctest example
-julia> using DataFrames, Dates, FeatureTransforms
+julia> using DataFrames, Dates, FeatureTransforms, StatsBase
 
 julia> df = DataFrame(
             :time => DateTime(2018, 9, 10):Hour(1):DateTime(2018, 9, 10, 23),
@@ -77,13 +77,17 @@ julia> output_cols = [:temperature, :humidity];
 ```
 
 For many models it is helpful to normalize the training data.
-We can use `MeanStdScaling` for that purpose.
+We can use `StandardScaling` for that purpose.
 Note that we are mutating the data frame in-place using `apply!` one column at a time.
 
 ```jldoctest example
-julia> temp_scaling = MeanStdScaling(train_df; cols=:temperature);
+julia> temp_scaling = StandardScaling();
 
-julia> hum_scaling = MeanStdScaling(train_df; cols=:humidity);
+julia> fit!(temp_scaling, train_df; cols=:temperature);
+
+julia> hum_scaling = StandardScaling();
+
+julia> fit!(hum_scaling, train_df; cols=:humidity);
 
 julia> FeatureTransforms.apply!(train_df, temp_scaling; cols=:temperature);
 

diff --git a/docs/src/transforms.md b/docs/src/transforms.md
@@ -8,6 +8,7 @@ DocTestSetup = quote
     using DataFrames
     using Dates
     using FeatureTransforms
+    using StatsBase
 end
 ```
 
@@ -90,8 +91,8 @@ A single `Transform` instance can be applied to different data types, with suppo
 !!! note
 
     Some `Transform` subtypes have restrictions on how they can be applied once constructed.
-    For instance, `MeanStdScaling` stores the mean and standard deviation of some data, potentially specified via some dimension and column names.
-    So `MeanStdScaling` should only be applied to the same data, and for the same dimension and subset of column names, as those used in construction.
+    For instance, `StandardScaling` stores the mean and standard deviation of some data, potentially specified via some dimension and column names.
+    So `StandardScaling` should only be applied to the same data, and for the same dimension and subset of column names, as those used when fitting it with `fit!`.
 
 ## Applying to `AbstractArray`
 
@@ -144,15 +145,19 @@ julia> M
  1.0  5.0
  3.0  6.0
 
-julia> normalize_row = MeanStdScaling(M; dims=1, inds=[2])
-MeanStdScaling(3.0, 2.8284271247461903)
+julia> normalize_row = StandardScaling();
+
+julia> fit!(normalize_row, M; dims=1, inds=[2])
+StandardScaling(3.0, 2.8284271247461903, true)
 
 julia> normalize_row(M; dims=1, inds=[2])
 1×2 Matrix{Float64}:
  -0.707107  0.707107
 
-julia> normalize_col = MeanStdScaling(M; dims=2, inds=[2])
-MeanStdScaling(5.0, 1.0)
+julia> normalize_col = StandardScaling();
+
+julia> fit!(normalize_col, M; dims=2, inds=[2])
+StandardScaling(5.0, 1.0, true)
 
 julia> normalize_col(M; dims=2, inds=[2])
 3×1 Matrix{Float64}:
@@ -172,7 +177,9 @@ If no `header` is given, the default from [`Tables.table`](https://tables.juliad
 ```jldoctest transforms
 julia> nt = (a = [2.0, 1.0, 3.0], b = [4.0, 5.0, 6.0]);
 
-julia> scaling = MeanStdScaling(nt);  # compute statistics using all data
+julia> scaling = StandardScaling();
+
+julia> fit!(scaling, nt);  # compute statistics using all data
 
 julia> FeatureTransforms.apply(nt, scaling; header=[:a_norm, :b_norm])
 (a_norm = [-0.8017837257372732, -1.3363062095621219, -0.2672612419124244], b_norm = [0.2672612419124244, 0.8017837257372732, 1.3363062095621219])
@@ -219,12 +226,14 @@ julia> feature_df = hcat(hod_df, lc_df)
 ## Transform-specific keyword arguments
 
 Some transforms have specific keyword arguments that can be passed to `apply`/`apply!`.
-For example, `MeanStdScaling` can invert the original scaling using the `inverse` argument:
+For example, `StandardScaling` can invert the original scaling using the `inverse` argument:
 
 ```jldoctest transforms
 julia> nt = (a = [2.0, 1.0, 3.0], b = [4.0, 5.0, 6.0]);
 
-julia> scaling = MeanStdScaling(nt);
+julia> scaling = StandardScaling();
+
+julia> fit!(scaling, nt);
 
 julia> FeatureTransforms.apply!(nt, scaling);
 

diff --git a/src/FeatureTransforms.jl b/src/FeatureTransforms.jl
@@ -3,17 +3,19 @@ module FeatureTransforms
 using Dates: TimeType, Period, Day, hour
 using NamedDims: dim
 using Statistics: mean, std
+using StatsBase
 using Tables
 
 export Transform, transform, transform!
 export HoD, LinearCombination, OneHotEncoding, Periodic, Power
-export AbstractScaling, IdentityScaling, MeanStdScaling
+export AbstractScaling, IdentityScaling, MeanStdScaling, StandardScaling
 export LogTransform, InverseHyperbolicSine
 
 include("utils.jl")
 include("traits.jl")
 include("transform.jl")
 include("apply.jl")
+include("fit.jl")
 
 # Transform implementations
 include("linear_combination.jl")
@@ -26,6 +28,8 @@ include("temporal.jl")
 
 include("test_utils.jl")
 
+include("deprecated.jl")
+
 # TODO: remove in v0.4 https://github.com/invenia/FeatureTransforms.jl/issues/82
 Base.@deprecate_binding is_transformable TestUtils.is_transformable
 

diff --git a/src/deprecated.jl b/src/deprecated.jl
@@ -0,0 +1,73 @@
+"""
+    MeanStdScaling(μ, σ) <: AbstractScaling
+
+Linearly scale the data by the statistical mean `μ` and standard deviation `σ`.
+This is also known as standardization, or the Z score transform.
+
+Constructing a `MeanStdScaling` from data is deprecated and emits a deprecation warning.
+
+# Keyword arguments to `apply`
+* `inverse=true`: inverts the scaling (e.g. to reconstruct the unscaled data).
+* `eps=1e-3`: lower bound applied to `σ` before scaling (if `inverse=false`), so that a
+  zero standard deviation does not cause a division by zero.
+"""
+struct MeanStdScaling <: AbstractScaling
+    μ::Real
+    σ::Real
+
+    """
+        MeanStdScaling(A::AbstractArray; dims=:, inds=:) -> MeanStdScaling
+        MeanStdScaling(table; cols) -> MeanStdScaling
+
+    Construct a [`MeanStdScaling`](@ref) transform from the statistics of the given data.
+    By default _all the data_ is considered when computing the mean and standard deviation.
+    This can be restricted to certain slices via the keyword arguments (see below).
+
+    Since `MeanStdScaling` is a stateful transform, i.e. the parameters depend on the data
+    it's given, you should define it independently before applying it so you can keep the
+    information for later use. For instance, if you want to invert the transform or apply it
+    to a test set.
+
+    # `AbstractArray` keyword arguments
+    * `dims=:`: the dimension along which to take the `inds` slices. Default uses all dims.
+    * `inds=:`: the indices to use in computing the statistics. Default uses all indices.
+
+    # `Table` keyword arguments
+    * `cols`: the columns to use in computing the statistics. Default uses all columns.
+
+    !!! note
+        If you want the `MeanStdScaling` to transform your data consistently you should use
+        the same `inds`, `dims`, or `cols` keywords when calling `apply`. Otherwise, `apply`
+        might rescale the wrong data or throw an error.
+    """
+    function MeanStdScaling(A::AbstractArray; dims=:, inds=:)
+        _depwarn()
+        # With `dims=:` the statistics cover the whole array and `inds` is not consulted.
+        dims == Colon() && return new(compute_stats(A)...)
+        return new(compute_stats(selectdim(A, dims, inds))...)
+    end
+
+    function MeanStdScaling(table; cols=_get_cols(table))
+        _depwarn()
+        # `MethodError` expects the call arguments as a tuple, so wrap the single argument.
+        Tables.istable(table) || throw(MethodError(MeanStdScaling, (table,)))
+        columntable = Tables.columns(table)
+        # Pool the selected columns into one vector so a single (μ, σ) pair is computed.
+        data = reduce(vcat, [getproperty(columntable, c) for c in _to_vec(cols)])
+        return new(compute_stats(data)...)
+    end
+end
+
+# Emit the deprecation warning for the data-driven `MeanStdScaling` constructors.
+# The suggested replacement uses one variable name throughout so the snippet can be
+# copied as-is. Always returns `nothing`.
+function _depwarn()
+    Base.depwarn(
+        "`MeanStdScaling(args...; kwargs...)` is deprecated. Use " *
+        "`scaling = StandardScaling(); fit!(scaling, args...; kwargs...)` instead",
+        :MeanStdScaling
+    )
+    return nothing
+end
+
+# Return the `(mean, std)` pair of `x`; the constructors splat it into `(μ, σ)`.
+function compute_stats(x)
+    return mean(x), std(x)
+end
+
+# Scale `A` with the stored statistics, or undo the scaling when `inverse=true`.
+# Extra keyword arguments are accepted and ignored.
+function _apply(A::AbstractArray, scaling::MeanStdScaling; inverse=false, eps=1e-3, kwargs...)
+    if inverse
+        # The inverse uses the stored σ directly; `eps` plays no role here.
+        return scaling.μ .+ scaling.σ .* A
+    end
+    # Guard against division by 0: a zero std means the data was uniform, so the scaled
+    # values end up ≈ 0 regardless of the particular `eps` chosen.
+    denominator = max(scaling.σ, eps)
+    return (A .- scaling.μ) ./ denominator
+end
diff --git a/src/fit.jl b/src/fit.jl
@@ -0,0 +1 @@
+# Fallback `fit!` for any `Transform`: the data arguments are ignored and the
+# transform is returned unchanged.
+function StatsBase.fit!(t::Transform, args...; kwargs...)
+    return t
+end
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		StatsBase.fit!(t::Transform, args...; kwargs...) = return t
mzgubic marked this conversation as resolved. Show resolved Hide resolved