From c024ecac4a542320129b45f75520a726b09739f9 Mon Sep 17 00:00:00 2001 From: Nicole Epp Date: Wed, 24 Feb 2021 11:14:31 -0600 Subject: [PATCH 1/3] Switch to Bool for OneHotEncoding Transform --- src/one_hot_encoding.jl | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/one_hot_encoding.jl b/src/one_hot_encoding.jl index 2d43adc..6568aea 100644 --- a/src/one_hot_encoding.jl +++ b/src/one_hot_encoding.jl @@ -6,16 +6,16 @@ One-hot encode the categorical value for each target element. Construct a n-by-p binary matrix, given a `Vector` of target data `x` (of length n) and a `Vector` of all unique possible values in x (of length p). -The element [i, j] is `1` if the i^th target in `x` corresponds to the j^th possible -value and `0` otherwise. +The element [i, j] is `true` if the i^th target in `x` corresponds to the j^th possible +value and `false` otherwise. Note that this Transform does not support specifying dims other than `:` (all dims) because it is a one-to-many transform (for example a `Vector` input produces a `Matrix` output). """ -struct OneHotEncoding <: Transform - categories::Dict{Any, Int} +struct OneHotEncoding{T} <: Transform + categories::Dict{T, Int} - function OneHotEncoding(possible_values::AbstractVector) + function OneHotEncoding(possible_values::AbstractVector{T}) where T if length(unique(possible_values)) < length(possible_values) throw(ArgumentError("Expected a list of all unique possible values")) end @@ -23,18 +23,17 @@ struct OneHotEncoding <: Transform # Create a dictionary that maps unique values in the input array to column positions # in the sparse matrix that results from applying the OneHotEncoding transform categories = Dict(value => i for (i, value) in enumerate(possible_values)) - return new(categories) + return new{T}(categories) end end function _apply(x, encoding::OneHotEncoding; kwargs...) 
n_categories = length(encoding.categories) + results = zeros(Bool, length(x), n_categories) - results = zeros(Int, length(x), n_categories) - - @views for (i, value) in enumerate(x) + for (i, value) in enumerate(x) col_pos = encoding.categories[value] - results[i, col_pos] = 1 + results[i, col_pos] = true end return results From ac2f78ece3f1073caf447ab0724d86e1b2c0f496 Mon Sep 17 00:00:00 2001 From: Nicole Epp Date: Wed, 24 Feb 2021 14:32:36 -0600 Subject: [PATCH 2/3] Make OneHotEncoding return type flexible --- src/one_hot_encoding.jl | 19 ++++++++++++------- test/one_hot_encoding.jl | 9 ++++++++- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/one_hot_encoding.jl b/src/one_hot_encoding.jl index 6568aea..b3cb4df 100644 --- a/src/one_hot_encoding.jl +++ b/src/one_hot_encoding.jl @@ -1,5 +1,5 @@ """ - OneHotEncoding <: Transform + OneHotEncoding{R<:Real} <: Transform One-hot encode the categorical value for each target element. @@ -7,15 +7,16 @@ Construct a n-by-p binary matrix, given a `Vector` of target data `x` (of length `Vector` of all unique possible values in x (of length p). The element [i, j] is `true` if the i^th target in `x` corresponds to the j^th possible -value and `false` otherwise. +value and `false` otherwise. Note that `R` can be specified to determine the return type +of results. It defaults to a `Matrix` of `Bool`s. Note that this Transform does not support specifying dims other than `:` (all dims) because it is a one-to-many transform (for example a `Vector` input produces a `Matrix` output). 
""" -struct OneHotEncoding{T} <: Transform +struct OneHotEncoding{R<:Real, T} <: Transform categories::Dict{T, Int} - function OneHotEncoding(possible_values::AbstractVector{T}) where T + function OneHotEncoding{R}(possible_values::AbstractVector{T}) where {R<:Real, T} if length(unique(possible_values)) < length(possible_values) throw(ArgumentError("Expected a list of all unique possible values")) end @@ -23,13 +24,17 @@ struct OneHotEncoding{T} <: Transform # Create a dictionary that maps unique values in the input array to column positions # in the sparse matrix that results from applying the OneHotEncoding transform categories = Dict(value => i for (i, value) in enumerate(possible_values)) - return new{T}(categories) + return new{R, T}(categories) end end -function _apply(x, encoding::OneHotEncoding; kwargs...) +function OneHotEncoding(possible_values::AbstractVector{T}) where T + return OneHotEncoding{Bool}(possible_values) +end + +function _apply(x, encoding::OneHotEncoding{R}; kwargs...) 
where R <: Real n_categories = length(encoding.categories) - results = zeros(Bool, length(x), n_categories) + results = zeros(R, length(x), n_categories) for (i, value) in enumerate(x) col_pos = encoding.categories[value] diff --git a/test/one_hot_encoding.jl b/test/one_hot_encoding.jl index 9b2c717..19a4b3c 100644 --- a/test/one_hot_encoding.jl +++ b/test/one_hot_encoding.jl @@ -8,9 +8,16 @@ x = ["foo", "bar", "baz"] expected = [1 0 0; 0 1 0; 0 0 1] - @test Transforms.apply(x, ohe) == expected + transformed = Transforms.apply(x, ohe) + @test transformed == expected + @test transformed isa AbstractMatrix{Bool} @test ohe(x) == expected + # Test specifying return type + transformed = Transforms.apply(x, OneHotEncoding{AbstractFloat}(categories)) + @test transformed == expected + @test transformed isa AbstractMatrix{AbstractFloat} + # Test duplicate values x = ["foo", "baz", "bar", "baz"] expected = [1 0 0; 0 0 1; 0 1 0; 0 0 1] From 1f663951b82e812f43ea7d39278dba76cf504caa Mon Sep 17 00:00:00 2001 From: Nicole Epp Date: Thu, 25 Feb 2021 12:08:17 -0600 Subject: [PATCH 3/3] Fixup --- src/one_hot_encoding.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/one_hot_encoding.jl b/src/one_hot_encoding.jl index b3cb4df..5fcb277 100644 --- a/src/one_hot_encoding.jl +++ b/src/one_hot_encoding.jl @@ -13,10 +13,10 @@ of results. It defaults to a `Matrix` of `Bool`s. Note that this Transform does not support specifying dims other than `:` (all dims) because it is a one-to-many transform (for example a `Vector` input produces a `Matrix` output). 
""" -struct OneHotEncoding{R<:Real, T} <: Transform - categories::Dict{T, Int} +struct OneHotEncoding{R<:Real} <: Transform + categories::Dict - function OneHotEncoding{R}(possible_values::AbstractVector{T}) where {R<:Real, T} + function OneHotEncoding{R}(possible_values::AbstractVector) where {R<:Real} if length(unique(possible_values)) < length(possible_values) throw(ArgumentError("Expected a list of all unique possible values")) end @@ -24,7 +24,7 @@ struct OneHotEncoding{R<:Real, T} <: Transform # Create a dictionary that maps unique values in the input array to column positions # in the sparse matrix that results from applying the OneHotEncoding transform categories = Dict(value => i for (i, value) in enumerate(possible_values)) - return new{R, T}(categories) + return new{R}(categories) end end @@ -36,7 +36,7 @@ function _apply(x, encoding::OneHotEncoding{R}; kwargs...) where R <: Real n_categories = length(encoding.categories) results = zeros(R, length(x), n_categories) - for (i, value) in enumerate(x) + @views for (i, value) in enumerate(x) col_pos = encoding.categories[value] results[i, col_pos] = true end