Skip to content

Commit

Permalink
more functionalities for data convenience
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaodaigh committed Nov 19, 2019
1 parent f0a3d09 commit e05fe01
Show file tree
Hide file tree
Showing 10 changed files with 185 additions and 114 deletions.
8 changes: 6 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,21 @@ version = "0.1.0"
[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
RCall = "6f49c342-dc21-5d91-9882-a32aef131414"
SortingLab = "562c1548-17b8-5b69-83cf-d8aebec229f5"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"

[compat]
DataFrames = "0.19"
julia = "1"

[extras]
RCall = "6f49c342-dc21-5d91-9882-a32aef131414"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
test = ["Test", "RCall"]
10 changes: 6 additions & 4 deletions src/CCA.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
using LinearAlgebra

function CCA(x::AbstractMatrix, y::AbstractMatrix)
export canonicalcor
# Return the (real) eigenvector of `m` that belongs to the eigenvalue with the
# largest real part. Selecting by eigenvalue, not by column position, keeps the
# result independent of the order in which `eigen` returns its pairs.
function _dominanteigvec(m::AbstractMatrix)
    decomp = eigen(m)
    k = argmax(real.(decomp.values))
    return real.(decomp.vectors[:, k])
end

"""
    canonicalcor(x::AbstractMatrix, y::AbstractMatrix)

Return the absolute correlation between `x * a` and `y * b`, where `a` and `b` are the
dominant eigenvectors of `cov(x)⁻¹ cov(x, y) cov(y)⁻¹ cov(y, x)` and
`cov(y)⁻¹ cov(y, x) cov(x)⁻¹ cov(x, y)` respectively, i.e. the first canonical
correlation of `x` and `y`.

`x` and `y` must have the same number of rows; `cov(x)` and `cov(y)` must be
non-singular.
"""
function canonicalcor(x::AbstractMatrix, y::AbstractMatrix)
    sxx, syy = cov(x), cov(y)
    sxy, syx = cov(x, y), cov(y, x)

    # Solve the linear systems rather than forming explicit inverses.
    xpart = sxx \ sxy
    ypart = syy \ syx

    a = _dominanteigvec(xpart * ypart)
    b = _dominanteigvec(ypart * xpart)

    # Eigenvectors are only defined up to sign, hence the `abs`.
    return abs(cor(x * a, y * b))
end

using RCall
115 changes: 7 additions & 108 deletions src/DataConvenience.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,18 @@ using DataFrames: categorical, AbstractDataFrame, DataFrame, names!
using CategoricalArrays
using Statistics
using Missings:nonmissingtype
using RCall

import Statistics:cor
export cor, dfcor, @replicate, StringVector
export cleannames!

"""
cleannames!(df::DataFrame)

Uses R's `janitor::clean_names` to clean the names
"""
function cleannames!(df::AbstractDataFrame)
    # The R snippet below reads `rdf` and assigns `new_names`, so the Julia
    # variable names must stay in sync with it.
    rdf = DataFrame(df[1, :])
    @rput rdf
    R"""
    new_names = names(janitor::clean_names(rdf))
    """
    @rget new_names
    # `new_names` is either a vector of names or a single name; wrap the latter.
    cleaned = new_names isa AbstractVector ? Symbol.(new_names) : [Symbol(new_names)]
    return names!(df, cleaned)
end
include("cate-arrays.jl")
include("CCA.jl")
include("janitor.jl")





# head(df::AbstractDataFrame) = first(df, 10)
#
Expand All @@ -48,31 +36,6 @@ macro replicate(n, expr)
:([$(esc(expr)) for i=1:$(esc(n))])
end

"""
StringVector(v::CategoricalVector{String})
Convert `v::CategoricalVector` efficiently to WeakRefStrings.StringVector
## Example
```julia
using DataFrames
a = categorical(["a","c", "a"])
a.refs
a.pool.index
# efficiently convert
sa = StringVector(a)
sa.buffer
sa.lengths
sa.offsets
```
"""
function StringVector(v::CategoricalVector{S}) where {S<:AbstractString}
    # Build the string storage once from the pool's entries ...
    pooled = StringVector(v.pool.index)
    codes = v.refs
    # ... then reuse its buffer and pick offsets/lengths per element via the refs.
    return StringVector{S}(pooled.buffer, pooled.offsets[codes], pooled.lengths[codes])
end


"""
cor(x::AbstractVector{Bool}, y)
Expand Down Expand Up @@ -125,68 +88,4 @@ dfcor(df::AbstractDataFrame, cols1 = names(df), cols2 = names(df); verbose=false
(names1[1:k-1], names2[1:k-1], res[1:k-1])
end

# support for nanoseconds in dates
using Dates

# Pairs a calendar `Date` (`d`) with a `Time` of day (`t`).
struct DateTimeN
d::Date
t::Time
end

# Sample input with nine fractional-second digits.
str = "2019-10-23T12:01:15.123456789"

# NOTE(review): these calls are top-level statements and, in this view, appear
# before the `parseDateTimeN` definition — confirm they are not evaluated before
# the function exists. Their results are discarded.
parseDateTimeN(str)
parseDateTimeN( "2019-10-23T12:01:15.230")

"""
    parseDateTimeN(str)

Parse a timestamp of the form `yyyy-mm-ddTHH:MM:SS[.fffffffff]` into a `DateTimeN`.

The text after the first `.` is read as the fractional second: it is right-padded
with zeros to nine digits and split into milliseconds, microseconds and nanoseconds.
Digits beyond the ninth are ignored, and the fraction may be omitted altogether.

# Throws
- `ArgumentError` if the time part does not consist of exactly three `:`-separated
  fields.
"""
function parseDateTimeN(str)
    pieces = split(str, '.')
    stamp = pieces[1]
    # A timestamp without a fractional part is treated as having a zero fraction.
    fraction = length(pieces) > 1 ? pieces[2] : ""

    date1, time1 = split(stamp, 'T')
    hms = parse.(Int64, split(time1, ':'))
    # With fewer fields the sub-second values would silently shift into the
    # minute/second positions of `Time`.
    length(hms) == 3 ||
        throw(ArgumentError("expected a time of the form HH:MM:SS, got \"$time1\""))

    # Work on the digit text directly so leading zeros keep their position.
    padded = rpad(fraction, 9, '0')
    ms = parse(Int64, padded[1:3])
    us = parse(Int64, padded[4:6])
    ns = parse(Int64, padded[7:9])

    return DateTimeN(Date(date1), Time(hms[1], hms[2], hms[3], ms, us, ns))
end

parseDateTimeN(str)

import Base:show

"""
    show(io::IO, dd::DateTimeN)

Print `dd` as its date, a literal `T`, then its time. This is the `dateTtime` layout
that `parseDateTimeN` splits on, so the printed form can be parsed back.
"""
function Base.show(io::IO, dd::DateTimeN)
    # Without the separator the date and the hour run together (e.g. `2019-10-2312:…`).
    print(io, dd.d, 'T', dd.t)
    return nothing
end

# Construct a `DateTimeN` from its textual form by delegating to `parseDateTimeN`.
# Accepts any `AbstractString` (e.g. a `SubString` produced by `split`), not only `String`.
DateTimeN(str::AbstractString) = parseDateTimeN(str)

################################################################################
# convenient function for CategoricalArrays
################################################################################
import SortingLab:sorttwo!
import StatsBase: rle
using CategoricalArrays

# Sort a categorical vector together with `y` by sorting its `refs` in place,
# then hand back both arguments.
function SortingLab.sorttwo!(x::CategoricalVector, y)
    SortingLab.sorttwo!(x.refs, y)
    return (x, y)
end

# Return the first two type parameters of a `CategoricalPool` as a tuple.
function pooltype(::CategoricalPool{T,S}) where {T,S}
    return (T, S)
end

# Run-length encode a categorical vector: encode its `refs`, then wrap the run
# values in a `CategoricalArray` that reuses the pool of `x`.
function rle(x::CategoricalVector)
    runrefs, runlengths = rle(x.refs)
    T, S = pooltype(x.pool)
    runvalues = CategoricalArray{T, 1}(S.(runrefs), x.pool)
    return (runvalues, runlengths)
end

end # module
29 changes: 29 additions & 0 deletions src/Dates.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
    parseDateTimeN(str)

Parse a timestamp of the form `yyyy-mm-ddTHH:MM:SS[.fffffffff]` into a `DateTimeN`.

The text after the first `.` is read as the fractional second: it is right-padded
with zeros to nine digits and split into milliseconds, microseconds and nanoseconds.
Digits beyond the ninth are ignored, and the fraction may be omitted altogether.

# Throws
- `ArgumentError` if the time part does not consist of exactly three `:`-separated
  fields.
"""
function parseDateTimeN(str)
    pieces = split(str, '.')
    stamp = pieces[1]
    # A timestamp without a fractional part is treated as having a zero fraction.
    fraction = length(pieces) > 1 ? pieces[2] : ""

    date1, time1 = split(stamp, 'T')
    hms = parse.(Int64, split(time1, ':'))
    # With fewer fields the sub-second values would silently shift into the
    # minute/second positions of `Time`.
    length(hms) == 3 ||
        throw(ArgumentError("expected a time of the form HH:MM:SS, got \"$time1\""))

    # Work on the digit text directly so leading zeros keep their position.
    padded = rpad(fraction, 9, '0')
    ms = parse(Int64, padded[1:3])
    us = parse(Int64, padded[4:6])
    ns = parse(Int64, padded[7:9])

    return DateTimeN(Date(date1), Time(hms[1], hms[2], hms[3], ms, us, ns))
end


import Base:show

"""
    show(io::IO, dd::DateTimeN)

Print `dd` as its date, a literal `T`, then its time. This is the `dateTtime` layout
that `parseDateTimeN` splits on, so the printed form can be parsed back.
"""
function Base.show(io::IO, dd::DateTimeN)
    # Without the separator the date and the hour run together (e.g. `2019-10-2312:…`).
    print(io, dd.d, 'T', dd.t)
    return nothing
end

# Construct a `DateTimeN` from its textual form by delegating to `parseDateTimeN`.
# Accepts any `AbstractString` (e.g. a `SubString` produced by `split`), not only `String`.
DateTimeN(str::AbstractString) = parseDateTimeN(str)
45 changes: 45 additions & 0 deletions src/cate-arrays.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
################################################################################
# convenient function for CategoricalArrays
################################################################################
import SortingLab:sorttwo!
using SortingLab
import StatsBase: rle
using CategoricalArrays

# Sort a categorical vector together with `y` by sorting its `refs` in place,
# then hand back both arguments.
function SortingLab.sorttwo!(x::CategoricalVector, y)
    SortingLab.sorttwo!(x.refs, y)
    return (x, y)
end

# Return the first two type parameters of a `CategoricalPool` as a tuple.
function pooltype(::CategoricalPool{T,S}) where {T,S}
    return (T, S)
end

# Run-length encode a categorical vector: encode its `refs`, then wrap the run
# values in a `CategoricalArray` that reuses the pool of `x`.
function rle(x::CategoricalVector)
    runrefs, runlengths = rle(x.refs)
    T, S = pooltype(x.pool)
    runvalues = CategoricalArray{T, 1}(S.(runrefs), x.pool)
    return (runvalues, runlengths)
end

"""
StringVector(v::CategoricalVector{String})
Convert `v::CategoricalVector` efficiently to WeakRefStrings.StringVector
## Example
```julia
using DataFrames
a = categorical(["a","c", "a"])
a.refs
a.pool.index
# efficiently convert
sa = StringVector(a)
sa.buffer
sa.lengths
sa.offsets
```
"""
function StringVector(v::CategoricalVector{S}) where {S<:AbstractString}
    # Build the string storage once from the pool's entries ...
    pooled = StringVector(v.pool.index)
    codes = v.refs
    # ... then reuse its buffer and pick offsets/lengths per element via the refs.
    return StringVector{S}(pooled.buffer, pooled.offsets[codes], pooled.lengths[codes])
end
47 changes: 47 additions & 0 deletions src/janitor.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import DataFrames: AbstractDataFrame

using DataFrames: rename!
using DataFrames: names!

export cleannames!, cleanname, renamedups!

"""
    ALLOWED_CHARS

The characters that `cleanname` keeps as they are: `A`-`Z`, `a`-`z`, `_` and `0`-`9`.
Any other character is replaced by `_`.
"""
const ALLOWED_CHARS = Char[('A':'Z')..., ('a':'z')..., '_', ('0':'9')...]  # A-Z, a-z, _, 0-9

"""
    renamedups!(n::AbstractVector{Symbol})

Make the names in `n` unique in place and return `n`.

Names are processed from first to last. A name that has already been used gets `_1`
appended, repeatedly, until it no longer clashes with any name kept so far. The first
occurrence of a name is never changed.

# Examples
```julia
renamedups!([:ok, :ok_1, :ok_1])  # [:ok, :ok_1, :ok_1_1]
renamedups!([:a, :a, :a])         # [:a, :a_1, :a_1_1]
```
"""
function renamedups!(n::AbstractVector{Symbol})
    seen = Set{Symbol}()
    for i in eachindex(n)
        candidate = n[i]
        # One suffix is not always enough: the suffixed name may itself be taken
        # already (e.g. `[:a, :a_1, :a]`), so keep going until it is free.
        while candidate in seen
            candidate = Symbol(candidate, "_1")
        end
        n[i] = candidate
        push!(seen, candidate)
    end
    return n
end

"""
    cleanname(s)

Return a `Symbol` built from `string(s)` in which every character that is not in
`ALLOWED_CHARS` is replaced by `_`. If the result is empty or starts with a digit,
`x` is prepended.
"""
function cleanname(s)
    cleaned = map(c -> c in ALLOWED_CHARS ? c : '_', string(s))
    # Check for emptiness first: looking at the first character of an empty
    # string throws.
    if isempty(cleaned) || isdigit(first(cleaned))
        cleaned = "x" * cleaned
    end
    return Symbol(cleaned)
end

"""
    cleannames!(df::AbstractDataFrame)

Replace the column names of `df` in place and return `df`.

Every name is passed through `cleanname`; the cleaned names are then passed through
`renamedups!` before being assigned.
"""
function cleannames!(df::AbstractDataFrame)
    cn = renamedups!(cleanname.(names(df)))
    # Assign all names in one step. Renaming column by column can fail partway
    # when a cleaned name equals the current name of a column that has not been
    # renamed yet (e.g. columns `a-b` and `a_b`).
    names!(df, cn)
    return df
end
17 changes: 17 additions & 0 deletions test/canonicalcor.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
using RCall

@testset "DataConvenience.jl" begin
    # Compare `canonicalcor` with the first canonical correlation reported by
    # R's `cancor` on 100 random 100×5 problems.
    for i in 1:100
        x = rand(100, 5)
        y = rand(100, 5)

        @rput x
        @rput y
        R"""
        res = cancor(x,y)$cor[1]
        """
        @rget res
        # Floating-point results from two implementations: compare approximately.
        @test res ≈ canonicalcor(x, y)
    end
end
14 changes: 14 additions & 0 deletions test/dates.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# support for nanoseconds in dates
using Dates

# Pairs a calendar `Date` (`d`) with a `Time` of day (`t`).
struct DateTimeN
d::Date
t::Time
end

# Sample input with nine fractional-second digits.
str = "2019-10-23T12:01:15.123456789"

# Smoke calls only: the results are discarded and nothing is asserted.
# NOTE(review): `parseDateTimeN` is not defined in this file — confirm it is
# loaded before this script runs.
parseDateTimeN(str)
parseDateTimeN( "2019-10-23T12:01:15.230")

parseDateTimeN(str)
11 changes: 11 additions & 0 deletions test/janitor.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
using DataFrames
using Test

@testset "clean names " begin
    # Build a frame with valid names, then give the first column a name with `-` in it.
    cols = 2:3
    df = DataFrame(ok = cols, ok2 = cols, ok3 = cols)
    rename!(df, :ok => Symbol("ok-2"))

    cleaned = names(cleannames!(df))
    @test cleaned == [:ok_2, :ok2, :ok3]

    # A repeated name gets `_1` appended.
    dups = [:ok, :ok_1, :ok_1]
    @test renamedups!(dups) == [:ok, :ok_1, :ok_1_1]
end
3 changes: 3 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
using DataConvenience
using Test

include("canonicalcor.jl")
include("janitor.jl")

@testset "DataConvenience.jl" begin
# Placeholder test set: it contains no assertions.
end

0 comments on commit e05fe01

Please sign in to comment.