From 55e3c1af2329512f1989b97c58db99dceeff83d7 Mon Sep 17 00:00:00 2001 From: Glenn Moynihan Date: Thu, 1 Apr 2021 14:01:04 +0100 Subject: [PATCH] Update documentation --- README.md | 12 +++- docs/Manifest.toml | 110 ++++++++++++++++++++++++++------ docs/make.jl | 1 + docs/src/api.md | 11 +++- docs/src/index.md | 21 +++--- docs/src/transform interface.md | 11 ---- docs/src/transform_interface.md | 59 +++++++++++++++++ src/one_hot_encoding.jl | 4 ++ src/scaling.jl | 16 +++-- 9 files changed, 192 insertions(+), 53 deletions(-) delete mode 100644 docs/src/transform interface.md create mode 100644 docs/src/transform_interface.md diff --git a/README.md b/README.md index 0c38f06..e9433fb 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,16 @@ [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle) [![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor's%20Guide-blueviolet)](https://github.com/SciML/ColPrac) -FeatureTransforms.jl provides utilities for performing feature engineering in machine learning pipelines. -FeatureTransforms supports operations on `AbstractArrays` and [Tables](https://github.com/JuliaData/Tables.jl). +FeatureTransforms.jl provides utilities for performing feature engineering in machine learning pipelines with support for `AbstractArray`s and [`Table`](https://github.com/JuliaData/Tables.jl)s. + +## Getting Started + +There are a few key parts to the FeatureTransforms.jl API; refer to the documentation for each to learn more. + +1. `Transform`s are callable types that define certain operations to be performed on data, for example, normalizing or computing a linear combination. Refer to the [Guide to Transforms](https://invenia.github.io/FeatureTransforms.jl/stable/transforms) to learn how they are defined and used on various types of input. +1. 
The `apply`, `apply!` and `apply_append` methods are used to implement `Transform`s in various ways. Consult the [Examples Section](https://invenia.github.io/FeatureTransforms.jl/stable/examples) for a guide to some typical use cases. See also the example below. +1. The [Transform Interface](https://invenia.github.io/FeatureTransforms.jl/stable/transform_interface) is used when you want to encapsulate sequences of `Transform`s in an end-to-end feature engineering pipeline. +1. For a full list of currently implemented `Transform`s, consult the [API](https://invenia.github.io/FeatureTransforms.jl/stable/api). ## Installation ```julia diff --git a/docs/Manifest.toml b/docs/Manifest.toml index d5aa082..53ff6df 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -1,13 +1,25 @@ # This file is machine-generated - editing it directly is not advised +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "0.5.0" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" [[CategoricalArrays]] deps = ["DataAPI", "Future", "JSON", "Missings", "Printf", "Statistics", "StructTypes", "Unicode"] -git-tree-sha1 = "dbfddfafb75fae5356e00529ce67454125935945" +git-tree-sha1 = "9f6101597998e8d8cc8c99b85e4aca144354403b" uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.9.3" +version = "0.9.4" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] @@ -27,9 +39,9 @@ version = "1.6.0" [[DataFrames]] deps = ["CategoricalArrays", "Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", 
"LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] -git-tree-sha1 = "b0db5579803eabb33f1274ca7ca2f472fdfb7f2a" +git-tree-sha1 = "d50972453ef464ddcebdf489d11885468b7b83a3" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "0.22.5" +version = "0.22.7" [[DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] @@ -56,21 +68,25 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[DocStringExtensions]] deps = ["LibGit2", "Markdown", "Pkg", "Test"] -git-tree-sha1 = "50ddf44c53698f5e784bbebb3f4b21c5807401b1" +git-tree-sha1 = "9d4f64f79012636741cf01133158a54b24924c32" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.3" +version = "0.8.4" [[Documenter]] deps = ["Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "b7715ae18be02110a8cf9cc8ed2ccdb1e3e3aba2" +git-tree-sha1 = "3ebb967819b284dc1e3c0422229b58a40a255649" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.26.1" +version = "0.26.3" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [[FeatureTransforms]] -deps = ["Dates", "Statistics", "Tables"] +deps = ["Dates", "NamedDims", "Statistics", "Tables"] path = ".." 
uuid = "8fd68953-04b8-4117-ac19-158bf6de9782" -version = "0.2.3" +version = "0.3.1" [[Formatting]] deps = ["Printf"] @@ -109,10 +125,22 @@ git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.21.1" +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + [[LibGit2]] -deps = ["Printf"] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -127,6 +155,10 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + [[Missings]] deps = ["DataAPI"] git-tree-sha1 = "f8c673ccc215eb50fcadb285f522420e29e69e1c" @@ -136,6 +168,18 @@ version = "0.4.5" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NamedDims]] +deps = ["AbstractFFTs", "LinearAlgebra", "Pkg", "Requires", "Statistics"] +git-tree-sha1 = "0838a2ee62194d1a4dbf3904dca75cf62374b701" +uuid = "356022a1-0364-5f58-8944-0da4b18d706f" +version = "0.2.32" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + [[OrderedCollections]] git-tree-sha1 = "4fa2ba51070ec13fcc7517db714445b4ab986bdf" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" @@ -143,12 +187,12 @@ version = "1.4.0" [[Parsers]] deps = ["Dates"] -git-tree-sha1 = "50c9a9ed8c714945e01cd53a21007ed3865ed714" +git-tree-sha1 = "c8abc88faa3f7a3950832ac5d6e690881590d6dc" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "1.0.15" +version = "1.1.0" 
[[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[PooledArrays]] @@ -168,7 +212,7 @@ deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" [[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] @@ -180,6 +224,12 @@ git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.0.0" +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -209,9 +259,13 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StructTypes]] deps = ["Dates", "UUIDs"] -git-tree-sha1 = "d7f4287dbc1e590265f50ceda1b40ed2bb31bbbb" +git-tree-sha1 = "89b390141d2fb2ef3ac2dc32e336f7a5c4810751" uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" -version = "1.4.0" +version = "1.5.0" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [[TableTraits]] deps = ["IteratorInterfaceExtensions"] @@ -221,12 +275,16 @@ version = "1.0.0" [[Tables]] deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "TableTraits", "Test"] -git-tree-sha1 = "a716dde43d57fa537a19058d044b495301ba6565" +git-tree-sha1 = "a9ff3dfec713c6677af435d6a6d65f9744feef67" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.3.2" +version = "1.4.1" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" [[Test]] -deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +deps = ["InteractiveUtils", "Logging", "Random", 
"Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[UUIDs]] @@ -235,3 +293,15 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/docs/make.jl b/docs/make.jl index 0c16615..569f14f 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -16,6 +16,7 @@ makedocs(; pages=[ "Introduction" => "index.md", "Guide to Transforms" => "transforms.md", + "Transform Interface" => "transform_interface.md", "Examples" => "examples.md", "API" => "api.md", ], diff --git a/docs/src/api.md b/docs/src/api.md index b2ba5a4..9f96ecf 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -2,12 +2,17 @@ ## Transforms +### Abstract Transform Types ```@docs Transform +AbstractScaling +``` + +### Implemented Transforms +```@docs HoD Power Periodic -AbstractScaling MeanStdScaling IdentityScaling LinearCombination @@ -20,6 +25,10 @@ OneHotEncoding FeatureTransforms.apply FeatureTransforms.apply! FeatureTransforms.apply_append +``` + +## Transform Interface +```@docs FeatureTransforms.is_transformable FeatureTransforms.transform! FeatureTransforms.transform diff --git a/docs/src/index.md b/docs/src/index.md index 9908d36..6f12ec2 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,18 +1,15 @@ # FeatureTransforms -FeatureTransforms.jl provides utilities for performing feature engineering in machine learning pipelines. -FeatureTransforms supports operations on `AbstractArray`s and [`Table`](https://github.com/JuliaData/Tables.jl)s. +FeatureTransforms.jl provides utilities for performing feature engineering in machine learning pipelines with support for `AbstractArray`s and [`Table`](https://github.com/JuliaData/Tables.jl)s. 
-There are three key parts of the Transforms.jl API: +## Why does this package exist? -* Subtypes of [`Transform`](@ref about-transforms) define transformations of data, for example normalization or a periodic function. -* The [`FeatureTransforms.apply`](@ref), [`FeatureTransforms.apply!`](@ref) and [`FeatureTransforms.apply_append`](@ref) methods transform data according to the given [`Transform`](@ref about-transforms), in a manner determined by the data type and specified dimensions, column names, indices, and other `Transform`-specific parameters. -* The [`transform`](@ref transform-interface) method should be overloaded to define feature engineering pipelines that include [`Transform`](@ref about-transforms)s. +FeatureTransforms.jl aims to provide common feature engineering transforms that are composable, reusable, and performant. -## Getting Started +FeatureTransforms.jl is conceptually different from other widely-known packages that provide similar utilities for manipulating data, such as [DataFramesMeta.jl](https://github.com/JuliaData/DataFramesMeta.jl), [DataKnots.jl](https://github.com/rbt-lang/DataKnots.jl), and [Query.jl](https://github.com/queryverse/Query.jl). +These packages provide methods for composing relational operations to filter, join, or combine structured data. +However, a query-based syntax or an API that only supports one type is not the most suitable for composing the kinds of mathematical operations, such as one-hot-encoding, that underpin most (non-trivial) feature engineering pipelines. -Here are some resources for getting started with FeatureTransforms.jl: - -* Refer to the page on [Transforms](@ref about-transforms) to learn how they are defined and used. -* Consult the [examples](@ref) section for a quick guide to some typical use cases. -* The [API](@ref) page has the list of all currently supported `Transform`s. 
+The composability of transforms reflects the practice of piping the output of one operation to the input of another, as well as combining the pipelines of multiple features. +Reusability is achieved by having native support for the `Tables` and `AbstractArray` types, which includes [DataFrames](https://github.com/JuliaData/DataFrames.jl/), [TypedTables](https://github.com/JuliaData/TypedTables.jl), [LibPQ.Result](https://github.com/invenia/LibPQ.jl), etc, as well as [AxisArrays](https://github.com/JuliaArrays/AxisArrays.jl), [KeyedArrays](https://github.com/mcabbott/AxisKeys.jl), and [NamedDimsArrays](https://github.com/invenia/NamedDims.jl). +This flexible design allows for performant code that should satisfy the needs of most users while not being restricted to (or by) any one data type. diff --git a/docs/src/transform interface.md b/docs/src/transform interface.md deleted file mode 100644 index 6c529d5..0000000 --- a/docs/src/transform interface.md +++ /dev/null @@ -1,11 +0,0 @@ -# [Transform Interface](@id transform-interface) - -The idea around a "transform interface” is to make feature transformations composable, i.e. the output of one `Transform` should be valid input to another. - -Feature engineering pipelines, which comprise a sequence of multiple `Transform`s and other steps, should obey the same principle and one should be able to add/remove subsequent `Transform`s without the pipeline breaking. -So the output of an end-to-end transform pipeline should itself be "transformable". - -We have enforced this in Transforms.jl by only supporting certain input types, i.e. AbstractArrays and Tables, which produce other AbstractArrays and Tables. -We also have specified this in the `transform` function API, which is expected to be overloaded for implementing pipelines (the exact method is an implementation detail for the user). -Our only requirement is that the return of the implemented `transform` is itself "transformable", i.e. an AbstractArray or Table. 
-This can be checked by calling `is_transformable` on the output. diff --git a/docs/src/transform_interface.md b/docs/src/transform_interface.md new file mode 100644 index 0000000..970f79e --- /dev/null +++ b/docs/src/transform_interface.md @@ -0,0 +1,59 @@ +# [Transform Interface](@id transform-interface) + +The "transform interface" is a mechanism that allows sequences of `Transform`s to be combined (with other steps) into end-to-end feature engineering pipelines. + +This is supported by the return of a `Transform` having the same type as its input. +This type consistency helps to make `Transform`s _composable_, i.e., the output of one is always a valid input to another, which allows users to "stack" sequences of `Transform`s together with minimal glue code needed to keep it working. + +Moreover, the end-to-end pipelines themselves should obey the same principle: you should be able to add or remove `Transform`s (or another pipeline) to the output without breaking your code. +That is, the output should also be a valid "transformable" type: either an `AbstractArray`, a `Table`, or any other type for which the user has extended [`FeatureTransforms.apply`](@ref) to support. +Valid types can be checked by calling `is_transformable`, which is the first part of the transform interface. + +The second part is the `transform` method stub, which users should overload when they want to "encapsulate" an end-to-end pipeline. +The exact method for doing so is an implementation detail for the user, but refer to the code below as an example. +The only requirement of the transform API is that the return of the implemented `transform` method is itself "transformable", i.e. satisfies `is_transformable`. + +## Example + +This is a trivial example of a feature engineering pipeline. +In practice, there may be other steps involved, such as checking for missing data or logging, which are omitted for clarity. 
+An advantage of the transform API is that the output can be readily integrated into another transform pipeline downstream. +For example, the output of this pipeline could be stacked with the result of a previous model. + + +```@meta +DocTestSetup = quote + using FeatureTransforms +end +``` + +```jldoctest transform + +function FeatureTransforms.transform(data) + # Define the Transforms we will apply + p = Power(0.123) + lc = LinearCombination([0.1, 0.9]) + ohe = OneHotEncoding(["type1", "type2", "type3"]) + + features = deepcopy(data) + FeatureTransforms.apply!(features, p; cols=[:a], header=[:a]) + features = FeatureTransforms.apply_append(features, lc; cols=[:a, :b], header=[:ab]) + features = FeatureTransforms.apply_append(features, ohe; cols=:types, header=[:type1, :type2, :type3]) +end + +# this could be any table-type, including a DataFrame +input = (a=rand(5), b=rand(5), types=["type1", "type2", "type1", "type1", "type1"]); + +output = FeatureTransforms.transform(input); + +# verify the output is transformable +is_transformable(output) && print("output is transformable") + +# output + +output is transformable +``` + +```@meta +DocTestSetup = Nothing +``` diff --git a/src/one_hot_encoding.jl b/src/one_hot_encoding.jl index 14cdd51..6864b63 100644 --- a/src/one_hot_encoding.jl +++ b/src/one_hot_encoding.jl @@ -12,6 +12,10 @@ of results. It defaults to a `Matrix` of `Bool`s. Note that this Transform does not support specifying dims other than `:` (all dims) because it is a one-to-many transform (for example a `Vector` input produces a `Matrix` output). + +Note that `OneHotEncoding` needs to be constructed with the expected categories before it +can be used. This is because the data might be missing certain categories, which would lead to +an incomplete encoding. 
""" struct OneHotEncoding{R<:Real} <: Transform categories::Dict diff --git a/src/scaling.jl b/src/scaling.jl index 278d048..9976f0d 100644 --- a/src/scaling.jl +++ b/src/scaling.jl @@ -30,18 +30,23 @@ struct MeanStdScaling <: AbstractScaling """ MeanStdScaling(A::AbstractArray; dims=:, inds=:) -> MeanStdScaling - MeanStdScaling(table, cols=nothing) -> MeanStdScaling + MeanStdScaling(table, [cols]) -> MeanStdScaling Construct a [`MeanStdScaling`](@ref) transform from the statistics of the given data. By default _all the data_ is considered when computing the mean and standard deviation. This can be restricted to certain slices via the keyword arguments (see below). + Since `MeanStdScaling` is a stateful transform, i.e. the parameters depend on the data + it's given, you should define it independently before applying it so you can keep the + information for later use. For instance, if you want to invert the transform or apply it + to a test set. + # `AbstractArray` keyword arguments * `dims=:`: the dimension along which to take the `inds` slices. Default uses all dims. * `inds=:`: the indices to use in computing the statistics. Default uses all indices. # `Table` keyword arguments - * `cols=nothing`: the columns to use in computing the statistics. Default uses all columns. + * `cols`: the columns to use in computing the statistics. Default uses all columns. !!! note If you want the `MeanStdScaling` to transform your data consistently you should use @@ -53,13 +58,10 @@ struct MeanStdScaling <: AbstractScaling return new(compute_stats(selectdim(A, dims, inds))...) end - function MeanStdScaling(table; cols=nothing) + function MeanStdScaling(table; cols=_get_cols(table)) Tables.istable(table) || throw(MethodError(MeanStdScaling, table)) columntable = Tables.columns(table) - - cols = _to_vec(cols) # handle single column name - cnames = cols === nothing ? 
propertynames(columntable) : cols - data = reduce(vcat, [getproperty(columntable, c) for c in cnames]) + data = reduce(vcat, [getproperty(columntable, c) for c in _to_vec(cols)]) return new(compute_stats(data)...) end end