diff --git a/Project.toml b/Project.toml index b1bca87..ad65c7d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DataConvenience" uuid = "3b531cbf-ee43-4e67-8118-dca2c9372f86" authors = ["Dai ZJ "] -version = "0.3.1" +version = "0.3.2" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" diff --git a/README.jmd b/README.jmd index cce96a1..5207ee4 100644 --- a/README.jmd +++ b/README.jmd @@ -60,6 +60,21 @@ bar(["DataFrames.sort 1 col","DataFrames.sort 2 col2", "DataCon.sort 1 col","Dat ### Clean column names with `cleannames!` Somewhat similiar to R's `janitor::clean_names` so that `cleannames!(df)` cleans the names of a `DataFrame`. +### One hot encoding + +``` +a = DataFrame( + player1 = ["a", "b", "c"], + player2 = ["d", "c", "a"] +) + +# does not modify a +onehot(a, :player1) + +# modifies a +onehot!(a, :player1) +``` + ### CSV Chunk Reader diff --git a/README.md b/README.md index fbfbd19..cb20c7f 100644 --- a/README.md +++ b/README.md @@ -42,26 +42,26 @@ fsort!(df, [:col1, :col2]) # sort in-place by `:col1` and `:col2` ``` 1000000×3 DataFrame - Row │ col col1 col2 - │ Float64 Float64 Float64 -─────────┼─────────────────────────────────── - 1 │ 0.3708 7.98914e-7 0.0982182 - 2 │ 0.743345 8.62962e-7 0.609425 - 3 │ 0.379679 1.0321e-6 0.353734 - 4 │ 0.0357946 4.01304e-6 0.632459 - 5 │ 0.588126 4.32507e-6 0.439859 - 6 │ 0.706394 4.54834e-6 0.811462 - 7 │ 0.228183 4.76902e-6 0.0418427 - 8 │ 0.3761 5.15514e-6 0.163736 - ⋮ │ ⋮ ⋮ ⋮ - 999994 │ 0.469715 0.999991 0.442478 - 999995 │ 0.971895 0.999992 0.637568 - 999996 │ 0.891238 0.999993 0.72935 - 999997 │ 0.404767 0.999993 0.905502 - 999998 │ 0.249169 0.999996 0.584482 - 999999 │ 0.784547 0.999997 0.362961 - 1000000 │ 0.705492 1.0 0.296773 - 999985 rows omitted + Row │ col col1 col2 + │ Float64 Float64 Float64 +─────────┼───────────────────────────────── + 1 │ 0.95298 2.40468e-8 0.274197 + 2 │ 0.100822 5.43567e-7 0.764761 + 3 │ 0.445557 9.55935e-7 0.263688 + 4 │ 0.136386 2.60883e-6 0.659607 + 5 │ 0.568898 
2.66762e-6 0.77789 + 6 │ 0.666694 2.71565e-6 0.838029 + 7 │ 0.311161 3.46322e-6 0.774721 + 8 │ 0.743033 3.56981e-6 0.979397 + ⋮ │ ⋮ ⋮ ⋮ + 999994 │ 0.33961 0.999993 0.503673 + 999995 │ 0.907282 0.999996 0.264907 + 999996 │ 0.69553 0.999996 0.295978 + 999997 │ 0.419872 0.999996 0.48607 + 999998 │ 0.154967 0.999998 0.0984277 + 999999 │ 0.536315 0.999999 0.217873 + 1000000 │ 0.859866 0.999999 0.117873 + 999985 rows omitted ``` @@ -90,6 +90,21 @@ bar(["DataFrames.sort 1 col","DataFrames.sort 2 col2", "DataCon.sort 1 col","Dat ### Clean column names with `cleannames!` Somewhat similiar to R's `janitor::clean_names` so that `cleannames!(df)` cleans the names of a `DataFrame`. +### One hot encoding + +``` +a = DataFrame( + player1 = ["a", "b", "c"], + player2 = ["d", "c", "a"] +) + +# does not modify a +onehot(a, :player1) + +# modifies a +onehot!(a, :player1) +``` + ### CSV Chunk Reader @@ -117,11 +132,11 @@ end DataType ─────┼───────────────────────────────────────────────────────────────────── ────────── - 1 │ a 0.500112 9.77158e-7 0.500207 0.999999 0 + 1 │ a 0.500244 1.37953e-6 0.500304 0.999999 0 Float64 - 2 │ b -0.446016 -128 0.0 127 0 + 2 │ b -0.509408 -128 -1.0 127 0 Int64 - 3 │ c -0.667185 -128 -1.0 127 0 + 3 │ c -0.587495 -128 -1.0 127 0 Int64 ``` @@ -140,18 +155,18 @@ end ``` 3×7 DataFrame - Row │ variable mean min median max - nmissing eltype - │ Symbol Nothing String Nothing String - Int64 DataType + Row │ variable mean min median max + nmissing eltype + │ Symbol Nothing String Nothing String + Int64 DataType ─────┼───────────────────────────────────────────────────────────────────── -───────────────────────── - 1 │ a 0.00010057134141727708 9.77544678875119 -6e-5 0 String - 2 │ b -1 99 - 0 String - 3 │ c -1 99 - 0 String +──────────────────────── + 1 │ a 0.0001000242096453885 9.918498010730303 +e-5 0 String + 2 │ b -1 99 + 0 String + 3 │ c -1 99 + 0 String ``` @@ -165,18 +180,18 @@ end ``` 3×7 DataFrame - Row │ variable mean min median max - nmissing eltype - │ 
export onehot, onehot!

# Import every DataFrames name this file uses, so it does not depend on what the
# including module happens to have brought into scope.
using DataFrames: AbstractDataFrame, ByRow, transform, transform!

# Build one `col => ByRow(isequal(c)) => name` transformation per category.
# Broadcasting (rather than `zip`) is deliberate: scalar `cate`/`outnames` keep
# working, and mismatched lengths raise instead of being silently truncated.
_onehot_pairs(col, cate, outnames) = col .=> ByRow.(isequal.(cate)) .=> outnames

"""
    onehot!(df, col, cate = sort(unique(df[!, col])); outnames = Symbol.(:ohe_, cate))

One-hot encode column `col` of `df` in place via `transform!`.

For every category `c` in `cate`, a column is created whose rows hold
`isequal(value, c)` for the corresponding `value` of `df[!, col]`. Because the
comparison uses `isequal`, a `missing` category matches `missing` entries.

An existing column whose name appears in `outnames` is overwritten WITHOUT warning.

# Arguments
- `df`: the data frame to modify.
- `col`: the column to one-hot encode.
- `cate`: the categories to encode; defaults to the sorted unique values of `col`.

# Keywords
- `outnames`: names of the generated columns, one per category; defaults to each
  category prefixed with `ohe_`.

# Examples
```julia
a = DataFrame(player1 = ["a", "b", "c"], player2 = ["d", "c", "a"])
onehot!(a, :player1)  # `a` gains the columns :ohe_a, :ohe_b and :ohe_c
```
"""
function onehot!(
    df::AbstractDataFrame,
    col,
    cate = sort(unique(df[!, col]));
    outnames = Symbol.(:ohe_, cate),
)
    return transform!(df, _onehot_pairs(col, cate, outnames))
end

"""
    onehot(df, col, cate = sort(unique(df[!, col])); outnames = Symbol.(:ohe_, cate))

One-hot encode column `col` of `df` via `transform`, leaving `df` itself unmodified.

For every category `c` in `cate`, the result contains a column whose rows hold
`isequal(value, c)` for the corresponding `value` of `df[!, col]`. In the result, a
column whose name appears in `outnames` is replaced WITHOUT warning.

# Arguments
- `df`: the source data frame.
- `col`: the column to one-hot encode.
- `cate`: the categories to encode; defaults to the sorted unique values of `col`.

# Keywords
- `outnames`: names of the generated columns, one per category; defaults to each
  category prefixed with `ohe_`.

# Examples
```julia
a = DataFrame(player1 = ["a", "b", "c"], player2 = ["d", "c", "a"])
b = onehot(a, :player1)  # `b` has :ohe_a, :ohe_b and :ohe_c; `a` is untouched
```
"""
function onehot(
    df::AbstractDataFrame,
    col,
    cate = sort(unique(df[!, col]));
    outnames = Symbol.(:ohe_, cate),
)
    return transform(df, _onehot_pairs(col, cate, outnames))
end