added count_missing

xiaodaigh · Sep 10, 2020 · c20ae30 · c20ae30 · xiaodaigh · Sep 10, 2020
1 parent d03efdb
commit c20ae30
Show file tree

Hide file tree

Showing 7 changed files with 62 additions and 7 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "DataConvenience"
 uuid = "3b531cbf-ee43-4e67-8118-dca2c9372f86"
 authors = ["Dai ZJ <[email protected]>"]
-version = "0.1.8"
+version = "0.1.9"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
@@ -19,16 +19,16 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
 
 [compat]
-julia = "1"
-CategoricalArrays = "0.7, 0.8"
 CSV = "0.7"
+CategoricalArrays = "0.7, 0.8"
 DataFrames = "0.19, 0.20, 0.21"
 Lazy = "0.14, 0.15"
 Missings = "0.4"
 SortingLab = "0.2"
 StatsBase = "0.32, 0.33"
 Tables = "1"
 WeakRefStrings = "0.6"
+julia = "1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

diff --git a/README.md b/README.md
@@ -106,3 +106,22 @@ will be computed
 
 ### StringVector
 `StringVector(v::CategoricalVector{String})` - Convert `v::CategoricalVector` efficiently to `WeakRefStrings.StringVector`
+
+### Faster count missing
+
+There is a `count_missing` function that counts the `missing` values in a vector; `countmissing` is its synonym.
+
+```julia
+x = Vector{Union{Missing, Int}}(undef, 10_000_000)
+
+cmx = count_missing(x) # this is faster
+
+cmx2 = countmissing(x) # synonym of count_missing, equally fast
+
+cimx = count(ismissing, x) # the way available at base
+
+
+cmx == cimx # true
+```
+
+There is also the `count_not_missing` function, and `countnotmissing` is its synonym.
diff --git a/src/DataConvenience.jl b/src/DataConvenience.jl
@@ -18,6 +18,7 @@ include("create-missing.jl")
 include("read-csv-in-chunks.jl")
 include("fsort-dataframes.jl")
 include("pipe.jl")
+include("fast-missing-count.jl")
 
 # head(df::AbstractDataFrame) = first(df, 10)
 #

diff --git a/src/create-missing.jl b/src/create-missing.jl
@@ -7,12 +7,12 @@ using Missings: disallowmissing
 
 Create a new column for where `col` is missing
 """
-create_missing!(df, col::Symbol) = begin
-	df[!, Symbol(string(col)*"_missing")] = ismissing.(df[!, col])
+create_missing!(df, col::Symbol; prefix="", suffix = "_missing") = begin
+	df[!, prefix*string(col)*suffix] = ismissing.(df[!, col])
 	if eltype(df[!, col]) <: Union{String, Missing}
-		df[!, col] = disallowmissing(coalesce.(df[!, col], "JULIA.MISSING"))
+		df[!, col] .= disallowmissing.(coalesce.(df[!, col], "JULIA.MISSING"))
 	else
-		df[!, col] = disallowmissing(coalesce.(df[!, col], zero(eltype(df[!, col]))))
+		df[!, col] .= disallowmissing.(coalesce.(df[!, col], zero(eltype(df[!, col]))))
 	end
 	df
 end
diff --git a/src/fast-missing-count.jl b/src/fast-missing-count.jl
@@ -0,0 +1,25 @@
+export count_not_missing, count_missing, countmissing, countnotmissing
+
+"""
+    countmissing(args...)
+
+Synonym for `count_missing`; every argument is forwarded unchanged.
+"""
+function countmissing(args...)
+    return count_missing(args...)
+end
+"""
+    countnotmissing(args...)
+
+Synonym for `count_not_missing`; every argument is forwarded unchanged.
+"""
+function countnotmissing(args...)
+    return count_not_missing(args...)
+end
+
+
+"""
+    count_not_missing(x)
+
+Return the number of elements of `x` that are not `missing`.
+
+Generic fallback for any iterable. The elements are counted directly rather
+than subtracted from `length(x)`, so iterators that do not define `length`
+are supported as well.
+"""
+count_not_missing(x) = count(!ismissing, x)
+
+"""
+    count_missing(x)
+
+Return the number of `missing` elements in `x`.
+
+Generic fallback that defers to `Base.count` with the `ismissing` predicate.
+"""
+function count_missing(x)
+    return count(ismissing, x)
+end
+
+"""
+    count_not_missing(::Type{S}, x::Vector{Union{T, Missing}}) where {S, T}
+
+Return the number of non-`missing` elements of `x`, accumulated in a counter
+of type `S`.
+
+The counter starts at `zero(S)` and is incremented by one `Bool` per element,
+so a narrow integer `S` may overflow on long vectors. Any element type `T` is
+accepted: the loop only asks `ismissing` of each element.
+"""
+function count_not_missing(::Type{S}, x::Vector{Union{T, Missing}}) where {S, T}
+    # No `isbitstype(T)` assertion here: it made the one-argument methods throw
+    # an `AssertionError` for e.g. `Vector{Union{String, Missing}}`, although
+    # the loop below is valid for every `T`.
+    res = zero(S)
+    # `eachindex` yields only valid indices, which is what justifies `@inbounds`.
+    @inbounds for i in eachindex(x)
+        res += !ismissing(x[i])
+    end
+    return res
+end
+
+"""
+    count_not_missing(x::Vector{Union{T, Missing}}) where T
+
+Count the non-`missing` elements of `x` using an `Int` counter.
+"""
+function count_not_missing(x::Vector{Union{T, Missing}}) where {T}
+    return count_not_missing(Int, x)
+end
+
+"""
+    count_missing(::Type{S}, x::Vector{Union{T, Missing}}) where {S, T}
+
+Return the number of `missing` elements of `x`, computed as `length(x)` minus
+the non-`missing` count obtained with a counter of type `S`.
+"""
+function count_missing(::Type{S}, x::Vector{Union{T, Missing}}) where {S, T}
+    n_present = count_not_missing(S, x)
+    return length(x) - n_present
+end
+
+"""
+    count_missing(x::Vector{Union{T, Missing}}) where T
+
+Count the `missing` elements of `x` using an `Int` counter.
+"""
+function count_missing(x::Vector{Union{T, Missing}}) where {T}
+    return count_missing(Int, x)
+end
diff --git a/test/missing.jl b/test/missing.jl
@@ -0,0 +1,9 @@
+using Test
+using DataConvenience
+
+
+# Large uninitialised vector: only checks agreement with `Base.count`, whatever
+# the contents happen to be.
+x = Vector{Union{Missing, Int64}}(undef, 1_000_000)
+
+# Small vector with known contents, so the expected counts can be stated exactly.
+y = Union{Missing, Int64}[1, missing, 3, missing, missing]
+
+@testset "count_missing" begin
+    @test count(ismissing, x) == count_missing(x)
+    @test count_missing(y) == 3
+    @test count_not_missing(y) == 2
+    @test count_missing(y) + count_not_missing(y) == length(y)
+    # The aliases must agree with the functions they forward to.
+    @test countmissing(y) == count_missing(y)
+    @test countnotmissing(y) == count_not_missing(y)
+    # Empty input.
+    @test count_missing(Union{Missing, Int64}[]) == 0
+    @test count_not_missing(Union{Missing, Int64}[]) == 0
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -5,6 +5,7 @@ include("canonicalcor.jl")
 include("janitor.jl")
 include("read-csv-in-chunks.jl")
 include("test-fsort-dataframes.jl")
+include("missing.jl")
 
 @testset "DataConvenience.jl" begin
     # Write your own tests here.