Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow stored names/types in Schema for very large schemas #241

Merged
merged 7 commits into from
Jun 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 45 additions & 16 deletions src/Tables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ if !hasmethod(getproperty, Tuple{Tuple, Int})
Base.getproperty(t::Tuple, i::Int) = t[i]
end

import Base: ==

"""
Tables.AbstractColumns
Expand Down Expand Up @@ -187,7 +189,7 @@ Base.isempty(r::RorC) = length(r) == 0

# Convert a row or columns object into a `NamedTuple`: the field names are the
# entries of `columnnames(r)` converted to `Symbol`, and each value is fetched
# by name with `getcolumn`.
function Base.NamedTuple(r::RorC)
    names = columnnames(r)
    # a stale duplicate `return` (unreachable) used to follow this line
    return NamedTuple{Tuple(map(Symbol, names))}(Tuple(getcolumn(r, nm) for nm in names))
end

function Base.show(io::IO, x::T) where {T <: AbstractRow}
Expand Down Expand Up @@ -402,44 +404,71 @@ are unknown (usually not inferrable). This is similar to the `Base.EltypeUnknown
when `Base.IteratorEltype` is called. Users should account for the `Tables.schema(tbl) => nothing` case
by using the properties of the results of `Tables.rows(x)` and `Tables.columns(x)` directly.
To access the names, one can simply call `sch.names` to return the tuple of Symbols.
To access column element types, one can similarly call `sch.types`, which will return a tuple of types (like `(Int64, Float64, String)`).
To access the names, one can simply call `sch.names` to return a collection of Symbols (`Tuple` or `Vector`).
To access column element types, one can similarly call `sch.types`, which will return a collection of types (like `(Int64, Float64, String)`).
The actual type definition is
```julia
struct Schema{names, types} end
struct Schema{names, types}
storednames::Union{Nothing, Vector{Symbol}}
storedtypes::Union{Nothing, Vector{Type}}
end
```
Where `names` is a tuple of Symbols, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`).
Where `names` is a tuple of `Symbol`s or `nothing`, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`) or `nothing`.
Encoding the names & types as type parameters allows convenient use of the type in generated functions
and other optimization use-cases.
"""
struct Schema{names, types} end
Schema(names::Tuple{Vararg{Symbol}}, types::Type{T}) where {T <: Tuple} = Schema{names, T}()
and other optimization use-cases. Note, however, that when `names` and/or `types` are `nothing`, the names and/or types
are instead stored in the `storednames` and `storedtypes` fields. This accommodates extremely wide tables, with tens of thousands of columns,
where encoding the names/types as type parameters becomes prohibitively expensive for the compiler. So while optimizations can be written against the
`names`/`types` type parameters, users should also consider handling extremely wide tables by specializing on `Tables.Schema{nothing, nothing}`.
"""
struct Schema{names, types}
    # Runtime copy of the column names; filled in by the `stored` code path of
    # `Schema(names, types)`, `nothing` when the names live in the type parameter.
    storednames::Union{Nothing, Vector{Symbol}}
    # Runtime copy of the column element types; `nothing` when not stored.
    storedtypes::Union{Nothing, Vector{Type}}
end

# Schema fully described by its type parameters: nothing is stored at runtime.
Schema{names, types}() where {names, types} = Schema{names, types}(nothing, nothing)
Schema(names::Tuple{Vararg{Symbol}}, ::Type{T}) where {T <: Tuple} = Schema{names, T}()
Schema(::Type{NamedTuple{names, types}}) where {names, types} = Schema{names, types}()

# whether names/types are stored or not
stored(::Schema{names, types}) where {names, types} = names === nothing && types === nothing
stored(::Nothing) = false

# pass through Ints to allow Tuples to act as rows
sym(x) = Symbol(x)
sym(x::Int) = x

# Names known, element types unknown.
# (Defined exactly once: a second, identical definition would be a method
# overwrite, which is rejected while precompiling a module.)
Schema(names, ::Nothing) = Schema{Tuple(map(sym, names)), nothing}()

# Above this many columns, names/types are kept in fields instead of type parameters.
const SCHEMA_SPECIALIZATION_THRESHOLD = 67000

# Build a schema from any collections of names and types.
#
# - `names`: iterable of column names; each is passed through `sym`.
# - `types`: iterable of column element types.
# - `stored`: force the runtime-stored representation regardless of width.
#
# Returns `Schema{nothing, nothing}` holding the names/types in its fields when
# `stored` is true or there are more than `SCHEMA_SPECIALIZATION_THRESHOLD`
# names; otherwise a `Schema` that encodes both in its type parameters.
# This is the only `Schema(names, types)` method; the older keyword-less
# definition with the same positional signature has been dropped.
function Schema(names, types; stored::Bool=false)
    if stored || length(names) > SCHEMA_SPECIALIZATION_THRESHOLD
        return Schema{nothing, nothing}([sym(x) for x in names], Type[T for T in types])
    else
        return Schema{Tuple(map(sym, names)), Tuple{types...}}()
    end
end

# Print `sch` as a two-column matrix: column names on the left, element types on
# the right (a column of `nothing` when the types are unknown). The
# "Tables.Schema:" header is printed unless the IO context sets
# `:print_schema_header => false`.
#
# Names and types are read through `sch.names` / `sch.types`, so this single
# method serves both schemas encoded in type parameters and schemas whose
# names/types are stored in fields.
# NOTE(review): a reviewer asked "are both paths tested?" (names held as a
# `Vector` vs. names that need `collect`) — confirm test coverage.
function Base.show(io::IO, sch::Schema)
    get(io, :print_schema_header, true) && println(io, "Tables.Schema:")
    nms = sch.names
    Base.print_matrix(io, hcat(nms isa Vector ? nms : collect(nms), sch.types === nothing ? fill(nothing, length(nms)) : collect(sch.types)))
end

# Property access for `Schema`; only `:names` and `:types` are supported.
#
# - `sch.names`: the `names` type parameter, or the `storednames` field when
#   that parameter is `nothing`.
# - `sch.types`: a tuple of the element types unpacked from the `types` tuple
#   type, or the `storedtypes` field (possibly `nothing`) when the parameter is
#   `nothing`.
#
# Throws `ArgumentError` for any other property name. Fields are read with
# `getfield` because this method replaces the default dot access.
function Base.getproperty(sch::Schema{names, types}, field::Symbol) where {names, types}
    if field === :names
        return names === nothing ? getfield(sch, :storednames) : names
    elseif field === :types
        # `storedtypes` is itself `nothing` when no types were stored, so it can
        # be returned as-is.
        return types === nothing ? getfield(sch, :storedtypes) : Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
    else
        throw(ArgumentError("unsupported property for Tables.Schema"))
    end
end

# The public properties of a `Schema`; the `storednames`/`storedtypes` fields are
# intentionally not listed. Defined once: a second definition with the same
# signature would only overwrite this one.
Base.propertynames(::Schema) = (:names, :types)
# Schemas are equal when both their names and their types compare equal,
# whether those live in type parameters or in the stored fields.
==(a::Schema, b::Schema) = a.names == b.names && a.types == b.types
# Keep `hash` consistent with `==`: hash the same two properties `==` compares,
# so equal schemas land in the same `Dict`/`Set` slot.
Base.hash(sch::Schema, h::UInt) = hash(sch.types, hash(sch.names, h))

# partitions

Expand Down
8 changes: 4 additions & 4 deletions src/fallbacks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,17 @@ allocatecolumn(T, len) = DataAPI.defaultarray(T, 1)(undef, len)
# Allocate one column of length `len` per schema field and return them as a
# `NamedTuple` keyed by the schema names.
# The `@generated` branch builds an expression with one `allocatecolumn` call per
# field; the `else` branch computes the same value at runtime.
# Each branch has exactly one `return` (a stale duplicate after each used to be
# unreachable).
@inline function _allocatecolumns(::Schema{names, types}, len) where {names, types}
    if @generated
        vals = Tuple(:(allocatecolumn($(fieldtype(types, i)), len)) for i = 1:fieldcount(types))
        return :(NamedTuple{$(map(Symbol, names))}(($(vals...),)))
    else
        return NamedTuple{map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
    end
end

# Allocate a `NamedTuple` of columns of length `len` for `sch`.
# Schemas with at most `SPECIALIZATION_THRESHOLD` fields go through
# `_allocatecolumns`; wider schemas build the same `NamedTuple` directly here.
@inline function allocatecolumns(sch::Schema{names, types}, len) where {names, types}
    if fieldcount(types) <= SPECIALIZATION_THRESHOLD
        return _allocatecolumns(sch, len)
    else
        return NamedTuple{map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
    end
end

Expand Down Expand Up @@ -214,7 +214,7 @@ end
len = Base.haslength(T) ? length(rowitr) : 0
sch = Schema(names, nothing)
columns = Tuple(EmptyVector(len) for _ = 1:length(names))
return NamedTuple{Base.map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[])
return NamedTuple{map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[])
end

"""
Expand Down
27 changes: 20 additions & 7 deletions src/namedtuples.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@ end
Pass any table input source and return a `NamedTuple` iterator

See also [`rows`](@ref) and [`rowtable`](@ref).

Not for use with extremely wide tables (more than 67,000 columns); current compiler limits
prevent constructing `NamedTuple`s that large.
"""
function namedtupleiterator(x)
    rowitr = rows(x)
    sch = schema(rowitr)
    if stored(sch)
        # a stored schema signals a table too wide to express as `NamedTuple` rows
        throw(ArgumentError("input table too wide ($(length(sch.names)) columns) to construct `NamedTuple` rows"))
    end
    return NamedTupleIterator{typeof(sch), typeof(rowitr)}(rowitr)
end

Expand All @@ -29,7 +33,7 @@ namedtupleiterator(T, x) = namedtupleiterator(x)

# Iterator traits for `NamedTupleIterator`.
# With a known schema the element type is the `NamedTuple` type built from the
# schema's names and types; with no schema (`Nothing`) it is unknown.
Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = Base.HasEltype()
Base.IteratorEltype(::Type{NamedTupleIterator{Nothing, T}}) where {T} = Base.EltypeUnknown()
# single definition; the duplicate with the same signature has been dropped
Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{map(Symbol, names), types}
# Size information is delegated to the wrapped row iterator `x`.
Base.IteratorSize(::Type{NamedTupleIterator{sch, T}}) where {sch, T} = Base.IteratorSize(T)
Base.length(nt::NamedTupleIterator) = length(nt.x)
Base.size(nt::NamedTupleIterator) = (length(nt.x),)
Expand All @@ -49,7 +53,7 @@ Base.size(nt::NamedTupleIterator) = (length(nt.x),)
x = iterate(rows.x, st...)
x === nothing && return nothing
row, st = x
return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
return NamedTuple{map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
end
end

Expand All @@ -60,7 +64,7 @@ end
x = iterate(rows.x, st...)
x === nothing && return nothing
row, st = x
return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
return NamedTuple{map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
end
end

Expand Down Expand Up @@ -91,6 +95,9 @@ naturally, i.e. a `Vector` naturally iterates its elements, and
indexing value by index, name, and getting all names).

For a lazy iterator over rows see [`rows`](@ref) and [`namedtupleiterator`](@ref).

Not for use with extremely wide tables (more than 67,000 columns); current compiler limits
prevent constructing `NamedTuple`s that large.
"""
function rowtable end

Expand Down Expand Up @@ -130,29 +137,35 @@ Takes any input table source `x` and returns a `NamedTuple` of `Vector`s,
also known as a "column table". A "column table" is a kind of default
table type of sorts, since it satisfies the Tables.jl column interface
naturally.

Not for use with extremely wide tables (more than 67,000 columns); current compiler limits
prevent constructing `NamedTuple`s that large.
"""
function columntable end

# Build a `NamedTuple` of arrays from `cols`, one entry per schema field.
# Each column is fetched with the typed `getcolumn(cols, T, i, name)` form and
# passed through `getarray`. The `@generated` branch emits one such call per
# field; the `else` branch computes the same value at runtime.
# Each branch has exactly one `return` (a stale duplicate after each used to be
# unreachable).
function _columntable(sch::Schema{names, types}, cols) where {names, types}
    # use of @generated justified because it's user-controlled; they explicitly asked for namedtuple of vectors
    if @generated
        vals = Tuple(:(getarray(getcolumn(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types))
        return :(NamedTuple{map(Symbol, names)}(($(vals...),)))
    else
        return NamedTuple{map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
    end
end

# Convert `cols` to a `NamedTuple` of arrays using a known schema.
# Schemas with at most `SPECIALIZATION_THRESHOLD` fields go through
# `_columntable`; wider schemas build the same `NamedTuple` directly here.
function columntable(sch::Schema{names, types}, cols) where {names, types}
    if fieldcount(types) <= SPECIALIZATION_THRESHOLD
        return _columntable(sch, cols)
    else
        return NamedTuple{map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
    end
end

# Extremely wide tables carry a stored schema (`Schema{nothing, nothing}`) and
# cannot be converted to a `NamedTuple` of `Vector`s: always throws.
function columntable(sch::Schema{nothing, nothing}, cols)
    throw(ArgumentError("input table too wide ($(length(sch.names)) columns) to convert to `NamedTuple` of `Vector`s"))
end

# unknown schema case: take the names from the columns object itself and fetch
# each column by name
function columntable(::Nothing, cols)
    # query the names once and reuse them for both the keys and the lookups
    names = columnnames(cols)
    return NamedTuple{Tuple(map(Symbol, names))}(Tuple(getarray(getcolumn(cols, col)) for col in names))
end

function columntable(itr::T) where {T}
cols = columns(itr)
Expand Down
14 changes: 14 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ end
return
end

# Stored-schema variant: walk the runtime name collection with a plain loop and
# call `f(value, index, name)` for every column of `row`. Returns `nothing`.
@inline function eachcolumn(f::F, sch::Schema{nothing, nothing}, row::T) where {F, T}
    idx = 0
    for name in sch.names
        idx += 1
        f(getcolumn(row, name), idx, name)
    end
    return
end

# these are specialized `eachcolumn`s where we also want
# the indexing of `columns` to be constant propagated, so it needs to be returned from the generated function
@inline function eachcolumns(f::F, sch::Schema{names, types}, row::T, columns::S, args...) where {F, names, types, T, S}
Expand Down Expand Up @@ -128,6 +135,13 @@ end
return
end

# Stored-schema variant: for every column name (in order) call
# `f(value, index, name, columns[index], args...)`. Returns `nothing`.
@inline function eachcolumns(f::F, sch::Schema{nothing, nothing}, row::T, columns::S, args...) where {F, T, S}
    idx = 0
    for name in sch.names
        idx += 1
        f(getcolumn(row, name), idx, name, columns[idx], args...)
    end
    return
end

"""
rowmerge(row, other_rows...)
rowmerge(row; fields_to_merge...)
Expand Down
74 changes: 74 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx

@test Tables.getarray([1,2,3]) == [1,2,3]
@test Tables.getarray((1,2,3)) == [1,2,3]

# empty schema
sch = Tables.Schema((), ())
@test sch.names == ()
@test sch.types == ()
end

@testset "namedtuples.jl" begin
Expand Down Expand Up @@ -703,3 +708,72 @@ end
@test isequal(dct.d, [missing, 5, 7, missing, 11])

end

# extremely wide tables
# Column-access test table with one more column than
# `Tables.SCHEMA_SPECIALIZATION_THRESHOLD`; every column lookup returns a fresh
# `rand(100)` vector.
struct WideTable <: Tables.AbstractColumns end

Tables.istable(::Type{WideTable}) = true
Tables.columnaccess(::Type{WideTable}) = true
Tables.columns(tbl::WideTable) = tbl

function Tables.schema(::WideTable)
    ncols = Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1
    return Tables.Schema(Symbol.("x", 1:ncols), fill(Float64, ncols))
end

Tables.getcolumn(::WideTable, ::Symbol) = rand(100)
Tables.getcolumn(::WideTable, ::Int) = rand(100)
Base.getindex(::WideTable, ::Int) = rand(100)
Tables.columnnames(::WideTable) = Symbol.("x", 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1))

# Column-access test table with 1000 columns whose schema is explicitly
# requested as stored (`stored=true`); every column lookup returns a fresh
# `rand(100)` vector.
struct WideTable2 <: Tables.AbstractColumns end

Tables.istable(::Type{WideTable2}) = true
Tables.columnaccess(::Type{WideTable2}) = true
Tables.columns(tbl::WideTable2) = tbl

function Tables.schema(::WideTable2)
    return Tables.Schema(Symbol.("x", 1:1000), fill(Float64, 1000); stored=true)
end

Tables.getcolumn(::WideTable2, ::Symbol) = rand(100)
Base.getindex(::WideTable2, ::Int) = rand(100)
Tables.columnnames(::WideTable2) = Symbol.("x", 1:1000)

# Exercises schemas whose names/types are stored in fields
# (`Tables.Schema{nothing, nothing}`) rather than in type parameters.
@testset "wide tables" begin
# Case 1: column count is SCHEMA_SPECIALIZATION_THRESHOLD + 1, so the schema is
# expected to come back in stored form without asking for it.
x = WideTable();
sch = Tables.schema(x)
@test sch.names == [Symbol("x", i) for i = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)]
@test sch.types == [Float64 for _ = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)]
@test typeof(sch) == Tables.Schema{nothing, nothing}
r = Tables.rows(x)
# the callback receives (value, index, name) for each column of the first row
Tables.eachcolumn(sch, first(r)) do y, i, nm
@test y isa Float64
@test i isa Integer
@test nm isa Symbol
end
# same, plus the matching entry of the extra `columns` argument (here `x` itself)
Tables.eachcolumns(sch, first(r), x) do y, i, nm, col
@test y isa Float64
@test i isa Integer
@test nm isa Symbol
@test col isa Vector{Float64}
end
# NamedTuple-based conversions are expected to refuse a table this wide
@test_throws ArgumentError Tables.columntable(x)
@test_throws ArgumentError Tables.rowtable(x)
# Dict-based conversions are expected to work; 100 is the length of each
# `rand(100)` column returned by the fixture
y = Tables.dictrowtable(x);
@test length(y) == 100
y = Tables.dictcolumntable(x);
@test Tables.schema(y) == Tables.schema(x)
# y = Tables.matrix(x); # works, just takes a really long time and a lot of memory

# Case 2: only 1000 columns, but the fixture builds its schema with `stored=true`.
x = WideTable2();
sch = Tables.schema(x)
@test sch.names == [Symbol("x", i) for i = 1:1000]
@test sch.types == [Float64 for _ = 1:1000]
@test typeof(sch) == Tables.Schema{nothing, nothing}
r = Tables.rows(x)
Tables.eachcolumn(sch, first(r)) do y, i, nm
@test y isa Float64
@test i isa Integer
@test nm isa Symbol
end
Tables.eachcolumns(sch, first(r), x) do y, i, nm, col
@test y isa Float64
@test i isa Integer
@test nm isa Symbol
@test col isa Vector{Float64}
end
end