more functionalities for data convenience

xiaodaigh · Nov 19, 2019 · e05fe01 · e05fe01
1 parent f0a3d09
commit e05fe01
Show file tree

Hide file tree

Showing 10 changed files with 185 additions and 114 deletions.
diff --git a/Project.toml b/Project.toml
@@ -6,17 +6,21 @@ version = "0.1.0"
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
-RCall = "6f49c342-dc21-5d91-9882-a32aef131414"
+SortingLab = "562c1548-17b8-5b69-83cf-d8aebec229f5"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
 
 [compat]
 DataFrames = "0.19"
 julia = "1"
 
 [extras]
+RCall = "6f49c342-dc21-5d91-9882-a32aef131414"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
+test = ["Test", "RCall"]
diff --git a/src/CCA.jl b/src/CCA.jl
@@ -1,9 +1,11 @@
 using LinearAlgebra
 
-function CCA(x::AbstractMatrix, y::AbstractMatrix)
-    ma = inv(cov(x))*cov(x, y)*inv(cov(y))*cov(y,x)
-    mb = inv(cov(y))*cov(y, x)*inv(cov(x))*cov(x,y)
-    cor(x*eigvecs(ma)[5], y*eigvecs(mb)[5])
+export canonicalcor
+
+"""
+    canonicalcor(x::AbstractMatrix, y::AbstractMatrix)
+
+Return the largest canonical correlation between the columns of `x` and the
+columns of `y`. Rows are observations, so both matrices must have the same
+number of rows.
+
+The result is a non-negative real number in `[0, 1]`.
+
+# Throws
+- `DimensionMismatch` if `x` and `y` have a different number of rows.
+"""
+function canonicalcor(x::AbstractMatrix, y::AbstractMatrix)
+    size(x, 1) == size(y, 1) || throw(DimensionMismatch(
+        "x has $(size(x, 1)) rows but y has $(size(y, 1)) rows"))
+
+    # Solve the two linear systems instead of forming explicit inverses.
+    ma = (cov(x) \ cov(x, y)) * (cov(y) \ cov(y, x))
+
+    # The eigenvalues of `ma` are the squared canonical correlations, so the
+    # largest one is all that is needed. `real` drops the zero imaginary part
+    # that `eigvals` may attach for a non-symmetric matrix, and `clamp`
+    # absorbs round-off that would push the value slightly outside [0, 1].
+    rho2 = maximum(real, eigvals(ma))
+    return sqrt(clamp(rho2, zero(rho2), one(rho2)))
 end
-
-using RCall
diff --git a/src/DataConvenience.jl b/src/DataConvenience.jl
@@ -5,30 +5,18 @@ using DataFrames: categorical, AbstractDataFrame, DataFrame, names!
 using CategoricalArrays
 using Statistics
 using Missings:nonmissingtype
-using RCall
 
 import Statistics:cor
 export cor, dfcor, @replicate, StringVector
-export cleannames!
 
-"""
-    cleannames!(df::DataFrame)
 
-Uses R's `janitor::clean_names` to clean the names
-"""
-function cleannames!(df::AbstractDataFrame)
-    rdf = DataFrame(df[1, :])
-    @rput rdf
-    R"""
-    new_names = names(janitor::clean_names(rdf))
-    """
-    @rget new_names
-    if new_names isa AbstractVector
-        names!(df, Symbol.(new_names))
-    else # must be singular
-        names!(df, [Symbol(new_names)])
-    end
-end
+include("cate-arrays.jl")
+include("CCA.jl")
+include("janitor.jl")
+
+
+
+
 
 # head(df::AbstractDataFrame) = first(df, 10)
 #
@@ -48,31 +36,6 @@ macro replicate(n, expr)
     :([$(esc(expr)) for i=1:$(esc(n))])
 end
 
-"""
-    StringVector(v::CategoricalVector{String})
-
-Convert `v::CategoricalVector` efficiently to WeakRefStrings.StringVector
-
-## Example
-```julia
-using DataFrames
-a  = categorical(["a","c", "a"])
-a.refs
-a.pool.index
-
-# efficiently convert
-sa = StringVector(a)
-
-sa.buffer
-sa.lengths
-sa.offsets
-```
-"""
-StringVector(v::CategoricalVector{S}) where S<:AbstractString = begin
-    sa = StringVector(v.pool.index)
-    StringVector{S}(sa.buffer, sa.offsets[v.refs], sa.lengths[v.refs])
-end
-
 
 """
     cor(x::AbstractVector{Bool}, y)
@@ -125,68 +88,4 @@ dfcor(df::AbstractDataFrame, cols1 = names(df), cols2 = names(df); verbose=false
     (names1[1:k-1], names2[1:k-1], res[1:k-1])
 end
 
-# support for nanoseconds in dates
-using Dates
-
-struct DateTimeN
-    d::Date
-    t::Time
-end
-
-str = "2019-10-23T12:01:15.123456789"
-
-parseDateTimeN(str)
-parseDateTimeN( "2019-10-23T12:01:15.230")
-
-function parseDateTimeN(str)
-    date, mmn = split(str, '.')
-    date1, time1 = split(date,'T')
-
-    time2 = parse.(Int64, split(time1, ':'))
-
-    mmn1 = mmn * reduce(*, ["0" for i in 1:(9-length(mmn))])
-
-    rd = reverse(digits(parse(Int, mmn1), pad = 9))
-
-    t = reduce(vcat, [
-        time2,
-        parse(Int, reduce(*, string.(rd[1:3]))),
-        parse(Int, reduce(*, string.(rd[4:6]))),
-        parse(Int, reduce(*, string.(rd[7:9])))]
-        )
-
-    DateTimeN(Date(date1), Time(t...))
-end
-
-parseDateTimeN(str)
-
-import Base:show
-
-show(io::IO, dd::DateTimeN) = begin
-    print(io, dd.d)
-    print(io, dd.t)
-end
-
-DateTimeN(str::String) = parseDateTimeN(str)
-
-################################################################################
-# convenient function for CategoricalArrays
-################################################################################
-import SortingLab:sorttwo!
-import StatsBase: rle
-using CategoricalArrays
-
-SortingLab.sorttwo!(x::CategoricalVector, y) = begin
-    SortingLab.sorttwo!(x.refs, y)
-    x, y
-end
-
-pooltype(::CategoricalPool{T,S}) where {T, S} = T,S
-
-rle(x::CategoricalVector) = begin
-   	refrle = rle(x.refs)
-   	T,S = pooltype(x.pool)
-   	(CategoricalArray{T, 1}(S.(refrle[1]), x.pool), refrle[2])
-end
-
 end # module
diff --git a/src/Dates.jl b/src/Dates.jl
@@ -0,0 +1,29 @@
+using Dates
+
+"""
+    DateTimeN(d::Date, t::Time)
+    DateTimeN(str::AbstractString)
+
+A calendar date `d` paired with a time of day `t`. The string form is parsed
+with [`parseDateTimeN`](@ref).
+"""
+struct DateTimeN
+    d::Date
+    t::Time
+end
+
+"""
+    parseDateTimeN(str)
+
+Parse `str`, formatted as `yyyy-mm-ddTHH:MM:SS` with an optional fractional
+second of up to nine digits (e.g. `"2019-10-23T12:01:15.123456789"`), into a
+`DateTimeN`.
+
+The fractional part is right-padded with zeros to nine digits and split into
+milliseconds, microseconds and nanoseconds.
+
+# Throws
+- `ArgumentError` if `str` does not have exactly one `T` separator, if the
+  time is not `HH:MM:SS`, or if the fraction is not 0-9 decimal digits.
+"""
+function parseDateTimeN(str)
+    halves = split(str, 'T')
+    length(halves) == 2 || throw(ArgumentError(
+        "expected `yyyy-mm-ddTHH:MM:SS[.fffffffff]`, got \"$str\""))
+    datestr, timestr = halves
+
+    # Everything after the first '.' is the fractional second; it may be absent.
+    pieces = split(timestr, '.'; limit = 2)
+    hms = parse.(Int, split(pieces[1], ':'))
+    length(hms) == 3 || throw(ArgumentError(
+        "expected the time as `HH:MM:SS`, got \"$(pieces[1])\""))
+
+    frac = length(pieces) == 2 ? pieces[2] : ""
+    (length(frac) <= 9 && all(isdigit, frac)) || throw(ArgumentError(
+        "fractional seconds must be at most 9 digits, got \"$frac\""))
+
+    # "23" means 230 ms, so pad on the right before cutting into 3-digit groups.
+    padded = rpad(frac, 9, '0')
+    ms, us, ns = (parse(Int, padded[i:i+2]) for i in (1, 4, 7))
+
+    return DateTimeN(Date(datestr), Time(hms[1], hms[2], hms[3], ms, us, ns))
+end
+
+# Print as `<date>T<time>`, the same layout `parseDateTimeN` accepts.
+Base.show(io::IO, dd::DateTimeN) = print(io, dd.d, 'T', dd.t)
+
+DateTimeN(str::AbstractString) = parseDateTimeN(str)
diff --git a/src/cate-arrays.jl b/src/cate-arrays.jl
@@ -0,0 +1,45 @@
+################################################################################
+# convenient function for CategoricalArrays
+################################################################################
+import SortingLab:sorttwo!
+using SortingLab
+import StatsBase: rle
+using CategoricalArrays
+
+# `sorttwo!` for a categorical vector: forwards to SortingLab's method on the
+# vector's reference codes (`x.refs`) together with `y`, then hands back the
+# original pair.
+# NOTE(review): the ordering is whatever SortingLab produces for the integer
+# refs; whether that matches the level order of `x` is not checked here.
+function SortingLab.sorttwo!(x::CategoricalVector, y)
+    SortingLab.sorttwo!(x.refs, y)
+    return x, y
+end
+
+# Type parameters `(T, S)` of a `CategoricalPool{T,S}`.
+pooltype(::CategoricalPool{T,S}) where {T,S} = (T, S)
+
+# Run-length encoding of a categorical vector. The encoding is computed on the
+# reference codes, and the run values are wrapped back into a
+# `CategoricalArray` that reuses the pool of `x`; the run lengths are returned
+# unchanged as the second element.
+function rle(x::CategoricalVector)
+    runvals, runlens = rle(x.refs)
+    T, S = pooltype(x.pool)
+    return (CategoricalArray{T,1}(S.(runvals), x.pool), runlens)
+end
+
+"""
+    StringVector(v::CategoricalVector{<:AbstractString})
+
+Build a `WeakRefStrings.StringVector` from a categorical vector of strings.
+
+Only the pool's strings (`v.pool.index`) are converted; the offsets and lengths
+of that pooled `StringVector` are then indexed by `v.refs`, so every element of
+the result points into the same buffer.
+
+## Example
+```julia
+using DataFrames
+a  = categorical(["a","c", "a"])
+a.refs
+a.pool.index
+
+# efficiently convert
+sa = StringVector(a)
+
+sa.buffer
+sa.lengths
+sa.offsets
+```
+"""
+function StringVector(v::CategoricalVector{S}) where {S<:AbstractString}
+    pooled = StringVector(v.pool.index)
+    refs = v.refs
+    return StringVector{S}(pooled.buffer, pooled.offsets[refs], pooled.lengths[refs])
+end
diff --git a/src/janitor.jl b/src/janitor.jl
@@ -0,0 +1,47 @@
+import DataFrames: AbstractDataFrame
+
+using DataFrames: rename!
+
+export cleannames!, cleanname, renamedups!
+
+"""
+    ALLOWED_CHARS
+
+Characters that `cleanname` keeps unchanged: `A`-`Z`, `a`-`z`, `_` and `0`-`9`.
+Every other character is replaced by `_`.
+"""
+const ALLOWED_CHARS = ['A':'Z'; 'a':'z'; '_'; '0':'9']
+
+"""
+    renamedups!(n::AbstractVector{Symbol})
+
+Make the names in `n` unique, in place, and return `n`.
+
+The first occurrence of a name is kept. A later occurrence gets `_1` appended,
+repeatedly if necessary, until it differs from every name already accepted,
+so `[:a, :a, :a]` becomes `[:a, :a_1, :a_1_1]`.
+"""
+function renamedups!(n::AbstractVector{Symbol})
+    seen = Set{Symbol}()
+    for i in eachindex(n)
+        candidate = n[i]
+        # One suffix is not always enough: the suffixed name may itself have
+        # been taken already, so keep going until the name is new.
+        while candidate in seen
+            candidate = Symbol(candidate, "_1")
+        end
+        n[i] = candidate
+        push!(seen, candidate)
+    end
+    return n
+end
+
+"""
+    cleanname(s)
+
+Turn `s` into a `Symbol` made only of the characters in `ALLOWED_CHARS`.
+
+`s` is converted with `string`, every character outside `ALLOWED_CHARS` is
+replaced by `_`, and an `x` is prepended when the result is empty or starts
+with a digit.
+
+## Example
+```julia
+cleanname("ok-2")    # :ok_2
+cleanname("2nd col") # :x2nd_col
+```
+"""
+function cleanname(s)
+    cleaned = join(c in ALLOWED_CHARS ? c : '_' for c in string(s))
+
+    # Guard the empty case first: `first` on an empty string would throw.
+    if isempty(cleaned) || isdigit(first(cleaned))
+        cleaned = "x" * cleaned
+    end
+    return Symbol(cleaned)
+end
+
+"""
+    cleannames!(df::AbstractDataFrame)
+
+Rename the columns of `df` in place and return `df`.
+
+Each column name is passed through `cleanname`, and the cleaned names are then
+made unique with `renamedups!`.
+"""
+function cleannames!(df::AbstractDataFrame)
+    oldnames = names(df)
+    newnames = renamedups!(cleanname.(oldnames))
+
+    # NOTE(review): columns are renamed one pair at a time, so an intermediate
+    # name may clash with a column that has not been renamed yet - confirm how
+    # `rename!` behaves in that case for the supported DataFrames version.
+    for (old, new) in zip(oldnames, newnames)
+        rename!(df, old => new)
+    end
+    return df
+end
diff --git a/test/canonicalcor.jl b/test/canonicalcor.jl
@@ -0,0 +1,17 @@
+using RCall
+
+# Cross-check `canonicalcor` against R's `cancor` on random inputs.
+@testset "canonicalcor" begin
+    for i in 1:100
+        x = rand(100, 5)
+        y = rand(100, 5)
+
+        # Copy both matrices into the R session and take R's first (largest)
+        # canonical correlation as the reference value.
+        @rput x
+        @rput y
+        R"""
+        res = cancor(x,y)$cor[1]
+        """
+        @rget res
+        @test res ≈ canonicalcor(x, y)
+    end
+end
diff --git a/test/dates.jl b/test/dates.jl
@@ -0,0 +1,14 @@
+# support for nanoseconds in dates
+using Dates
+using Test
+
+struct DateTimeN
+    d::Date
+    t::Time
+end
+
+# NOTE(review): `parseDateTimeN` does not appear among the names the package
+# exports, so its source file is loaded directly here - confirm this is the
+# intended way to reach it.
+include(joinpath(@__DIR__, "..", "src", "Dates.jl"))
+
+@testset "parseDateTimeN" begin
+    str = "2019-10-23T12:01:15.123456789"
+
+    parsed = parseDateTimeN(str)
+    @test parsed.d == Date(2019, 10, 23)
+    @test parsed.t == Time(12, 1, 15, 123, 456, 789)
+
+    # a short fraction is padded on the right: ".230" is 230 milliseconds
+    short = parseDateTimeN("2019-10-23T12:01:15.230")
+    @test short.d == Date(2019, 10, 23)
+    @test short.t == Time(12, 1, 15, 230)
+
+    @test DateTimeN(str) == parsed
+end
diff --git a/test/janitor.jl b/test/janitor.jl
@@ -0,0 +1,11 @@
+# Tests for the column-name helpers `cleannames!` and `renamedups!`.
+using DataFrames
+using Test
+
+@testset "clean names " begin
+    df = DataFrame(ok = 2:3, ok2 = 2:3, ok3=2:3)
+    # give the first column a name containing a character that is not allowed
+    rename!(df, :ok => Symbol("ok-2"))
+
+    # the '-' is replaced by '_'; the already-clean names are left as they are
+    @test names(cleannames!(df)) == [:ok_2, :ok2, :ok3]
+
+    # the repeated :ok_1 gets a "_1" suffix
+    @test renamedups!([:ok, :ok_1, :ok_1]) == [:ok, :ok_1, :ok_1_1]
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,6 +1,9 @@
 using DataConvenience
 using Test
 
+include("canonicalcor.jl")
+include("janitor.jl")
+
 @testset "DataConvenience.jl" begin
     # Write your own tests here.
 end