
Commit

updated to remove Lazy dependency
xiaodaigh committed Nov 27, 2020
1 parent cd4d7dd commit bfe89e4
Showing 5 changed files with 106 additions and 126 deletions.
4 changes: 1 addition & 3 deletions Project.toml
@@ -1,14 +1,13 @@
name = "DataConvenience"
uuid = "3b531cbf-ee43-4e67-8118-dca2c9372f86"
authors = ["Dai ZJ <[email protected]>"]
version = "0.1.12"
version = "0.2.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Lazy = "50d2b5c4-7a5e-59d5-8109-a42b560f39c0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -22,7 +21,6 @@ WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
CSV = "0.7, 0.8"
CategoricalArrays = "0.7, 0.8, 0.9"
DataFrames = "0.19, 0.20, 0.21, 0.22"
Lazy = "0.14, 0.15"
Missings = "0.4"
SortingLab = "0.2"
StatsBase = "0.32, 0.33"
220 changes: 102 additions & 118 deletions README.md
@@ -6,41 +6,38 @@ An eclectic collection of convenience functions for your data manipulation needs

### Piping Convenience

#### Re-exporting Lazy.jl's `@>` `@>>` `@as` for piping convenience
Lazy.jl has some macros for piping operations. However, it also exports the `groupby` function, which conflicts with `DataFrames.groupby`. I have made it easier here so that `using DataConvenience` will only export the macros `@>`, `@>>`, and `@as`. You can achieve the same with just Lazy.jl by doing `using Lazy: @>, @>>, @as`.

#### Defining `filter(::AbstractDataFrame, arg)`
DataFrames.jl does not define `filter(::AbstractDataFrame, arg)`; it only has `filter(arg, ::AbstractDataFrame)`. This is inconsistent with the other functions, which take the `DataFrame` as the first argument, so I define `filter` with the signature `filter(::AbstractDataFrame, arg)`.

#### Examples
````julia

```julia
using DataConvenience
using DataFrames
using Chain: @chain

df = DataFrame(a=1:8)

@> df begin
@chain df begin
filter(:a => ==(1))
end
```

@as x df begin
filter(x, :a => ==(1))
end
````

```
Error: MethodError: no method matching filter(::DataFrames.DataFrame, ::Pai
r{Symbol,Base.Fix2{typeof(==),Int64}})
Closest candidates are:
filter(::Any, !Matched::Tuple{Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any
,Any,Any,Any,Any,Any,Vararg{Any,N}} where N) at tuple.jl:267
filter(::Any, !Matched::Tuple) at tuple.jl:264
filter(::Any, !Matched::Array{T,N}) where {T, N} at array.jl:2457
...
```

````
1×1 DataFrame
│ Row │ a │
│ │ Int64 │
├─────┼───────┤
│ 1 │ 1 │
````




Note: DataConvenience.jl used to re-export Lazy.jl's `@>`, but it no longer does. Users are encouraged to use [Chain.jl](https://github.com/jkrumbiegel/Chain.jl) instead.
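
For reference, here is a minimal sketch of how the `filter(::AbstractDataFrame, arg)` method pairs with Chain.jl piping, assuming DataFrames.jl and Chain.jl are available:

```julia
using DataConvenience
using DataFrames
using Chain: @chain

df = DataFrame(a = 1:8)

# the two argument orders are equivalent once DataConvenience is loaded
filter(df, :a => ==(1))   # method defined by DataConvenience
filter(:a => ==(1), df)   # method defined by DataFrames

# Chain.jl inserts `df` as the first argument, so this also works
@chain df begin
    filter(:a => ==(1))
end
```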

### Sampling with `sample`

@@ -67,45 +64,43 @@ sample(df, 1//10)

You can sort `DataFrame`s (in ascending order only) faster than with the `sort` function by using the `fsort` function, e.g.

````julia

```julia
using DataFrames
df = DataFrame(col = rand(1_000_000), col1 = rand(1_000_000), col2 = rand(1_000_000))

fsort(df, :col) # sort by `:col`
fsort(df, [:col1, :col2]) # sort by `:col1` and `:col2`
fsort!(df, :col) # sort in-place by `:col`
fsort!(df, [:col1, :col2]) # sort in-place by `:col1` and `:col2`
````

```

````
```
1000000×3 DataFrame
│ Row │ col │ col1 │ col2 │
│ │ Float64 │ Float64 │ Float64 │
├─────────┼───────────┼────────────┼───────────┤
│ 1 │ 0.7632 │ 4.11895e-7 │ 0.676792 │
│ 2 │ 0.832511 │ 9.90985e-7 │ 0.404122 │
│ 3 │ 0.986408 │ 9.91194e-7 │ 0.79919 │
│ 4 │ 0.994865 │ 1.40468e-6 │ 0.0211249 │
│ 5 │ 0.624364 │ 2.25037e-6 │ 0.45385 │
│ 6 │ 0.352928 │ 2.50274e-6 │ 0.856137 │
│ 7 │ 0.483774 │ 3.53432e-6 │ 0.395825 │
│ 999993 │ 0.0910381 │ 0.999992 │ 0.23127 │
│ 999994 │ 0.679958 │ 0.999994 │ 0.88958 │
│ 999995 │ 0.229172 │ 0.999994 │ 0.385509 │
│ 999996 │ 0.612217 │ 0.999994 │ 0.350795 │
│ 999997 │ 0.443417 │ 0.999996 │ 0.0680759 │
│ 999998 │ 0.697342 │ 0.999997 │ 0.888837 │
│ 999999 │ 0.672206 │ 0.999998 │ 0.257211 │
│ 1000000 │ 0.643341 │ 1.0 │ 0.0267822 │
````



````julia
Row │ col col1 col2
│ Float64 Float64 Float64
─────────┼──────────────────────────────────
1 │ 0.811363 7.97813e-8 0.0531397
2 │ 0.82789 1.09458e-6 0.839517
3 │ 0.755735 2.31673e-6 0.73836
4 │ 0.133499 2.53483e-6 0.100679
5 │ 0.131827 3.3895e-6 0.792947
6 │ 0.665072 4.03591e-6 0.290273
7 │ 0.765728 4.26244e-6 0.321963
8 │ 0.70778 4.43784e-6 0.575208
⋮ │ ⋮ ⋮ ⋮
999994 │ 0.0464722 0.999992 0.65355
999995 │ 0.738753 0.999992 0.588945
999996 │ 0.0490022 0.999992 0.976147
999997 │ 0.860753 0.999993 0.626292
999998 │ 0.334551 0.999994 0.357504
999999 │ 0.602822 0.999996 0.222439
1000000 │ 0.501476 0.999999 0.582362
999985 rows omitted
```



```julia
df = DataFrame(col = rand(1_000_000), col1 = rand(1_000_000), col2 = rand(1_000_000))

using BenchmarkTools
@@ -120,8 +115,7 @@ bar(["DataFrames.sort 1 col","DataFrames.sort 2 col2", "DataCon.sort 1 col","Dat
[sort_1col, sort_2col, fsort_1col, fsort_2col],
title="DataFrames sort performance comparison",
label = "seconds")
````

```

![](figures/README_3_1.png)

@@ -135,8 +129,7 @@ Somewhat similar to R's `janitor::clean_names` so that `cleannames!(df)` cleans the names
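
As a rough illustration, here is a hypothetical sketch of how `cleannames!` might be used; the column names and the exact renaming rules below are assumptions, not taken from this diff:

```julia
using DataConvenience
using DataFrames

# hypothetical column names with awkward characters
df = DataFrame("ok name" => 1:3, "has.dot" => 4:6)

cleannames!(df)   # cleans the column names in place, janitor-style (assumed behaviour)
names(df)         # inspect the cleaned names
```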

You can read a CSV in chunks and apply logic to each chunk. The types of each column are inferred by `CSV.read`.

````julia

```julia
using DataFrames
using CSV

@@ -148,82 +141,77 @@ CSV.write(filepath, df)
for chunk in CsvChunkIterator(filepath)
print(describe(chunk))
end
````

```

````
3×8 DataFrame
│ Row │ variable │ mean      │ min        │ median   │ max  │ nunique │ nmissing │ eltype   │
│     │ Symbol   │ Float64   │ Real       │ Float64  │ Real │ Nothing │ Nothing  │ DataType │
├─────┼──────────┼───────────┼────────────┼──────────┼──────┼─────────┼──────────┼──────────┤
│ 1   │ a        │ 0.500052  │ 1.97181e-6 │ 0.500362 │ 1.0  │         │          │ Float64  │
│ 2   │ b        │ -0.373433 │ -128       │ 0.0      │ 127  │         │          │ Int64    │
│ 3   │ c        │ -0.433914 │ -128       │ 0.0      │ 127  │         │          │ Int64    │
````
```
3×7 DataFrame
 Row │ variable  mean       min         median    max       nmissing  eltype
     │ Symbol    Float64    Real        Float64   Real      Int64     DataType
─────┼──────────────────────────────────────────────────────────────────────────
   1 │ a          0.500324  4.12341e-8  0.500358  0.999999         0  Float64
   2 │ b         -0.445843  -128        0.0       127              0  Int64
   3 │ c         -0.569673  -128        -1.0      127              0  Int64
```





The chunk iterator accepts `CSV.read` keyword arguments, so the user can pass in `type` and `types` to dictate the types of each column, e.g.

````julia

```julia
# read all column as String
for chunk in CsvChunkIterator(filepath, type=String)
print(describe(chunk))
end
````


````
3×8 DataFrame
│ Row │ variable │ mean    │ min                    │ median  │ max                  │ nunique │ nmissing │ eltype   │
│     │ Symbol   │ Nothing │ String                 │ Nothing │ String               │ Int64   │ Nothing  │ DataType │
├─────┼──────────┼─────────┼────────────────────────┼─────────┼──────────────────────┼─────────┼──────────┼──────────┤
│ 1   │ a        │         │ 0.00010059504823334287 │         │ 9.99002335182464e-5  │ 1000000 │          │ String   │
│ 2   │ b        │         │ -1                     │         │ 99                   │ 256     │          │ String   │
│ 3   │ c        │         │ -1                     │         │ 99                   │ 256     │          │ String   │
````
```

```
3×7 DataFrame
 Row │ variable  mean     min                     median   max                    nmissing  eltype
     │ Symbol    Nothing  String                  Nothing  String                 Int64     DataType
─────┼──────────────────────────────────────────────────────────────────────────────────────────────
   1 │ a                  0.00010042823574352155           9.6882850640467311e-5         0  String
   2 │ b                  -1                               99                            0  String
   3 │ c                  -1                               99                            0  String
```


````julia

```julia
# read a three-column CSV where the column types are String, Int, Float32
for chunk in CsvChunkIterator(filepath, types=[String, Int, Float32])
print(describe(chunk))
end
````

```

````
3×8 DataFrame
│ Row │ variable │ mean      │ min                    │ median │ max                  │ nunique │ nmissing │ eltype   │
│     │ Symbol   │ Any       │ Any                    │ Union… │ Any                  │ Union…  │ Nothing  │ DataType │
├─────┼──────────┼───────────┼────────────────────────┼────────┼──────────────────────┼─────────┼──────────┼──────────┤
│ 1   │ a        │           │ 0.00010059504823334287 │        │ 9.99002335182464e-5  │ 1000000 │          │ String   │
│ 2   │ b        │ -0.373433 │ -128                   │ 0.0    │ 127                  │         │          │ Int64    │
│ 3   │ c        │ -0.433914 │ -128.0                 │ 0.0    │ 127.0                │         │          │ Float32  │
````
```
3×7 DataFrame
 Row │ variable  mean       min                     median  max                    nmissing  eltype
     │ Symbol    Any        Any                     Union…  Any                    Int64     DataType
─────┼──────────────────────────────────────────────────────────────────────────────────────────────
   1 │ a                    0.00010042823574352155          9.6882850640467311e-5         0  String
   2 │ b         -0.445843  -128                    0.0     127                           0  Int64
   3 │ c         -0.569673  -128.0                  -1.0    127.0                         0  Float32
```



@@ -258,13 +246,11 @@ will be computed
### `@replicate`
`@replicate times code` will run `code` the given number of times, e.g.

````julia

```julia
@replicate 10 8
````

```

````
```
10-element Array{Int64,1}:
8
8
@@ -276,7 +262,7 @@ will be computed
8
8
8
````
```



@@ -289,8 +275,7 @@ will be computed

There is a `count_missing` function

````julia

```julia
x = Vector{Union{Missing, Int}}(undef, 10_000_000)

cmx = count_missing(x) # this is faster
@@ -301,12 +286,11 @@ cimx = count(ismissing, x) # the way available at base


cmx == cimx # true
````

```

````
```
true
````
```



2 changes: 2 additions & 0 deletions build-readme.jl
@@ -2,6 +2,8 @@
using Pkg
cd("c:/git/DataConvenience/")
Pkg.activate("c:/git/DataConvenience/readme-env")
Pkg.update()
upcheck()

using Weave

1 change: 0 additions & 1 deletion src/DataConvenience.jl
@@ -17,7 +17,6 @@ include("janitor.jl")
include("create-missing.jl")
include("read-csv-in-chunks.jl")
include("fsort-dataframes.jl")
include("pipe.jl")
include("fast-missing-count.jl")
include("sample.jl")

5 changes: 1 addition & 4 deletions src/pipe.jl
@@ -1,14 +1,11 @@
export filter, @>, @as, @>>

import Lazy: @>, @as, @>>
export filter

import DataFrames: filter

function filter(df::AbstractDataFrame, arg; kwargs...)
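    # forward to DataFrames' `filter(arg, df)` so the DataFrame can come first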
filter(arg, df; kwargs...)
end


if false
using Pkg
Pkg.activate("c:/git/DataConvenience")

2 comments on commit bfe89e4

@xiaodaigh
Owner Author


@JuliaRegistrator


Registration pull request created: JuliaRegistries/General/25364

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.2.0 -m "<description of version>" bfe89e43add742ca2b70a3b245e45e11d00321de
git push origin v0.2.0
