added one hot encoding

xiaodaigh · Jun 23, 2021 · f007a18 · f007a18 · xiaodaigh · Jun 23, 2021
1 parent 4476660
commit f007a18
Show file tree

Hide file tree

Showing 7 changed files with 102 additions and 65 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "DataConvenience"
 uuid = "3b531cbf-ee43-4e67-8118-dca2c9372f86"
 authors = ["Dai ZJ <[email protected]>"]
-version = "0.3.1"
+version = "0.3.2"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"

diff --git a/README.jmd b/README.jmd
@@ -60,6 +60,21 @@ bar(["DataFrames.sort 1 col","DataFrames.sort 2 col2", "DataCon.sort 1 col","Dat
 ### Clean column names with `cleannames!`
 Somewhat similar to R's `janitor::clean_names` so that `cleannames!(df)` cleans the names of a `DataFrame`.
 
+### One hot encoding
+
+```
+a = DataFrame(
+  player1 = ["a", "b", "c"],
+  player2 = ["d", "c", "a"]
+)
+
+# does not modify a
+onehot(a, :player1)
+
+# modifies a
+onehot!(a, :player1)
+```
+
 
 ### CSV Chunk Reader
 

diff --git a/README.md b/README.md
@@ -42,26 +42,26 @@ fsort!(df, [:col1, :col2]) # sort in-place by `:col1` and `:col2`
 
 ```
 1000000×3 DataFrame
-     Row │ col         col1        col2
-         │ Float64     Float64     Float64
-─────────┼───────────────────────────────────
-       1 │ 0.3708      7.98914e-7  0.0982182
-       2 │ 0.743345    8.62962e-7  0.609425
-       3 │ 0.379679    1.0321e-6   0.353734
-       4 │ 0.0357946   4.01304e-6  0.632459
-       5 │ 0.588126    4.32507e-6  0.439859
-       6 │ 0.706394    4.54834e-6  0.811462
-       7 │ 0.228183    4.76902e-6  0.0418427
-       8 │ 0.3761      5.15514e-6  0.163736
-    ⋮    │     ⋮           ⋮           ⋮
-  999994 │ 0.469715    0.999991    0.442478
-  999995 │ 0.971895    0.999992    0.637568
-  999996 │ 0.891238    0.999993    0.72935
-  999997 │ 0.404767    0.999993    0.905502
-  999998 │ 0.249169    0.999996    0.584482
-  999999 │ 0.784547    0.999997    0.362961
- 1000000 │ 0.705492    1.0         0.296773
-                          999985 rows omitted
+     Row │ col       col1        col2
+         │ Float64   Float64     Float64
+─────────┼─────────────────────────────────
+       1 │ 0.95298   2.40468e-8  0.274197
+       2 │ 0.100822  5.43567e-7  0.764761
+       3 │ 0.445557  9.55935e-7  0.263688
+       4 │ 0.136386  2.60883e-6  0.659607
+       5 │ 0.568898  2.66762e-6  0.77789
+       6 │ 0.666694  2.71565e-6  0.838029
+       7 │ 0.311161  3.46322e-6  0.774721
+       8 │ 0.743033  3.56981e-6  0.979397
+    ⋮    │    ⋮          ⋮           ⋮
+  999994 │ 0.33961   0.999993    0.503673
+  999995 │ 0.907282  0.999996    0.264907
+  999996 │ 0.69553   0.999996    0.295978
+  999997 │ 0.419872  0.999996    0.48607
+  999998 │ 0.154967  0.999998    0.0984277
+  999999 │ 0.536315  0.999999    0.217873
+ 1000000 │ 0.859866  0.999999    0.117873
+                        999985 rows omitted
 ```
 
 
@@ -90,6 +90,21 @@ bar(["DataFrames.sort 1 col","DataFrames.sort 2 col2", "DataCon.sort 1 col","Dat
 ### Clean column names with `cleannames!`
 Somewhat similar to R's `janitor::clean_names` so that `cleannames!(df)` cleans the names of a `DataFrame`.
 
+### One hot encoding
+
+```
+a = DataFrame(
+  player1 = ["a", "b", "c"],
+  player2 = ["d", "c", "a"]
+)
+
+# does not modify a
+onehot(a, :player1)
+
+# modifies a
+onehot!(a, :player1)
+```
+
 
 ### CSV Chunk Reader
 
@@ -117,11 +132,11 @@ end
  DataType
 ─────┼─────────────────────────────────────────────────────────────────────
 ──────────
-   1 │ a          0.500112     9.77158e-7   0.500207    0.999999         0 
+   1 │ a          0.500244     1.37953e-6   0.500304    0.999999         0 
  Float64
-   2 │ b         -0.446016  -128            0.0       127                0 
+   2 │ b         -0.509408  -128           -1.0       127                0 
  Int64
-   3 │ c         -0.667185  -128           -1.0       127                0 
+   3 │ c         -0.587495  -128           -1.0       127                0 
  Int64
 ```
 
@@ -140,18 +155,18 @@ end
 
 ```
 3×7 DataFrame
- Row │ variable  mean     min                     median   max             
-      nmissing  eltype
-     │ Symbol    Nothing  String                  Nothing  String          
-      Int64     DataType
+ Row │ variable  mean     min                    median   max              
+     nmissing  eltype
+     │ Symbol    Nothing  String                 Nothing  String           
+     Int64     DataType
 ─────┼─────────────────────────────────────────────────────────────────────
-─────────────────────────
-   1 │ a                  0.00010057134141727708           9.77544678875119
-6e-5         0  String
-   2 │ b                  -1                               99              
-             0  String
-   3 │ c                  -1                               99              
-             0  String
+────────────────────────
+   1 │ a                  0.0001000242096453885           9.918498010730303
+e-5         0  String
+   2 │ b                  -1                              99               
+            0  String
+   3 │ c                  -1                              99               
+            0  String
 ```
 
 
@@ -165,18 +180,18 @@ end
 
 ```
 3×7 DataFrame
- Row │ variable  mean       min                     median  max            
-       nmissing  eltype
-     │ Symbol    Union…     Any                     Union…  Any            
-       Int64     DataType
+ Row │ variable  mean       min                    median  max             
+      nmissing  eltype
+     │ Symbol    Union…     Any                    Union…  Any             
+      Int64     DataType
 ─────┼─────────────────────────────────────────────────────────────────────
-──────────────────────────
-   1 │ a                    0.00010057134141727708          9.7754467887511
-96e-5         0  String
-   2 │ b         -0.446016  -128                    0.0     127            
-              0  Int64
-   3 │ c         -0.667185  -128.0                  -1.0    127.0          
-              0  Float32
+─────────────────────────
+   1 │ a                    0.0001000242096453885          9.91849801073030
+3e-5         0  String
+   2 │ b         -0.509408  -128                   -1.0    127             
+             0  Int64
+   3 │ c         -0.587495  -128.0                 -1.0    127.0           
+             0  Float32
 ```
 
 

diff --git a/figures/README_2_1.png b/figures/README_2_1.png
diff --git a/src/DataConvenience.jl b/src/DataConvenience.jl
@@ -14,7 +14,7 @@ include("cate-arrays.jl")
 include("CCA.jl")
 include("janitor.jl")
 include("dfcor.jl")
-# include("replace_onehot.jl")
+include("onehot.jl")
 include("create-missing.jl")
 include("read-csv-in-chunks.jl")
 include("fsort-dataframes.jl")

diff --git a/src/onehot.jl b/src/onehot.jl
@@ -0,0 +1,25 @@
+export onehot, onehot!
+
+using DataFrames: AbstractDataFrame
+
+"""
+    onehot(df, col, cate = sort(unique(df[!, col])); outnames = Symbol.(:ohe_, cate))
+
+    onehot!(df, col, cate = sort(unique(df[!, col])); outnames = Symbol.(:ohe_, cate))
+
+One-hot encode a column, creating one output column per category in `cate`. Existing columns named in `outnames` will be overwritten WITHOUT warning.
+
+Arguments:
+
+    df   -   The DataFrame
+    col  -   The column to onehot encode
+    cate -  The categories
+
+"""
+function onehot!(df::AbstractDataFrame, col, cate = sort(unique(df[!, col])); outnames = Symbol.(:ohe_, cate))
+    transform!(df, @. col => ByRow(isequal(cate)) .=> outnames)
+end
+
+function onehot(df::AbstractDataFrame, col, cate = sort(unique(df[!, col])); outnames = Symbol.(:ohe_, cate))
+    transform(df, @. col => ByRow(isequal(cate)) .=> outnames)
+end
diff --git a/src/replace_onehot.jl b/src/replace_onehot.jl