Skip to content

Commit

Permalink
Allow stored names/types in Schema for very large schemas (#241)
Browse files Browse the repository at this point in the history
* Allow stored names/types in Schema for very large schemas

There have been a few cases of extremely wide tables where users have
run into fundamental compiler limits for lengths of tuples (as discussed
with core devs). One example is
JuliaData/CSV.jl#635. This PR proposes, for
very large schemas (> 67,000 columns), storing names/types in `Vector`s
instead of tuples, with the aim of not breaking the runtime. The aim
here is to be as non-disruptive as possible, hence the very high
threshold for switching over to store names/types. Another goal is that
downstream packages don't break with just these changes in place. I'm
not aware of any packages testing such wide tables, but in my own
testing, I've seen issues where packages are relying on the
`Tables.Schema` type parameters for names/types. There's also an issue
in DataFrames where `Tables.schema` attempts to construct a
`Tables.Schema` directly instead of using the `Tables.Schema(names,
types)` constructor. So while this PR is needed, we'll need to play
whack-a-mole with downstream packages to ensure these really wide tables
can be properly supported end-to-end. Going through those downstream
package changes, we should probably make notes of how we can clarify
Tables.jl interface docs to hopefully help future implementors do so
properly and avoid the same pitfalls.

* Add tests; update eachcolumn/eachcolumns

* Add some more testing for Tables.jl-provided types

* fix

* fix2

* fix corner case

* fix tests
  • Loading branch information
quinnj authored Jun 23, 2021
1 parent d038805 commit aab3e55
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 27 deletions.
61 changes: 45 additions & 16 deletions src/Tables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ if !hasmethod(getproperty, Tuple{Tuple, Int})
Base.getproperty(t::Tuple, i::Int) = t[i]
end

import Base: ==

"""
Tables.AbstractColumns
Expand Down Expand Up @@ -187,7 +189,7 @@ Base.isempty(r::RorC) = length(r) == 0

function Base.NamedTuple(r::RorC)
names = columnnames(r)
return NamedTuple{Tuple(Base.map(Symbol, names))}(Tuple(getcolumn(r, nm) for nm in names))
return NamedTuple{Tuple(map(Symbol, names))}(Tuple(getcolumn(r, nm) for nm in names))
end

function Base.show(io::IO, x::T) where {T <: AbstractRow}
Expand Down Expand Up @@ -402,44 +404,71 @@ are unknown (usually not inferrable). This is similar to the `Base.EltypeUnknown
when `Base.IteratorEltype` is called. Users should account for the `Tables.schema(tbl) => nothing` case
by using the properties of the results of `Tables.rows(x)` and `Tables.columns(x)` directly.
To access the names, one can simply call `sch.names` to return the tuple of Symbols.
To access column element types, one can similarly call `sch.types`, which will return a tuple of types (like `(Int64, Float64, String)`).
To access the names, one can simply call `sch.names` to return a collection of Symbols (`Tuple` or `Vector`).
To access column element types, one can similarly call `sch.types`, which will return a collection of types (like `(Int64, Float64, String)`).
The actual type definition is
```julia
struct Schema{names, types} end
struct Schema{names, types}
storednames::Union{Nothing, Vector{Symbol}}
storedtypes::Union{Nothing, Vector{Type}}
end
```
Where `names` is a tuple of Symbols, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`).
Where `names` is a tuple of `Symbol`s or `nothing`, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`) or `nothing`.
Encoding the names & types as type parameters allows convenient use of the type in generated functions
and other optimization use-cases.
"""
struct Schema{names, types} end
Schema(names::Tuple{Vararg{Symbol}}, types::Type{T}) where {T <: Tuple} = Schema{names, T}()
and other optimization use-cases, but users should note that when `names` and/or `types` are the `nothing` value, the names and/or types
are stored in the `storednames` and `storedtypes` fields. This is to account for extremely wide tables with columns in the 10s of thousands
where encoding the names/types as type parameters becomes prohibitive to the compiler. So while optimizations can be written on the typed
`names`/`types` type parameters, users should also consider handling the extremely wide tables by specializing on `Tables.Schema{nothing, nothing}`.
"""
struct Schema{names, types}
# Runtime storage for the column names; `getproperty(sch, :names)` reads this field
# when the `names` type parameter is `nothing`. The zero-argument constructor
# leaves it as `nothing`.
storednames::Union{Nothing, Vector{Symbol}}
# Runtime storage for the column element types; `getproperty(sch, :types)` reads this
# field when the `types` type parameter is `nothing`. The zero-argument constructor
# leaves it as `nothing`.
storedtypes::Union{Nothing, Vector{Type}}
end

# Zero-argument constructor: nothing is kept in the stored fields, so the schema is
# described entirely by its `names`/`types` type parameters.
function Schema{names, types}() where {names, types}
    return Schema{names, types}(nothing, nothing)
end

# Build a schema from a tuple of `Symbol` names and a `Tuple` type of element types.
function Schema(names::Tuple{Vararg{Symbol}}, ::Type{types}) where {types <: Tuple}
    return Schema{names, types}()
end

# Build a schema from a `NamedTuple` type, reusing its names and types parameters.
function Schema(::Type{NamedTuple{names, types}}) where {names, types}
    return Schema{names, types}()
end

# Whether a schema keeps its names/types in the stored fields rather than in its
# type parameters. Only a schema whose two type parameters are both `nothing`
# counts as stored; every other schema, and an unknown (`nothing`) schema, does not.
stored(::Schema{nothing, nothing}) = true
stored(::Schema) = false
stored(::Nothing) = false

# Normalize a column identifier to a `Symbol`.
# `Int`s are passed through unchanged to allow Tuples to act as rows.
function sym(x)
    return Symbol(x)
end

function sym(x::Int)
    return x
end

Schema(names, ::Nothing) = Schema{Tuple(Base.map(sym, names)), nothing}()
Schema(names, types) = Schema{Tuple(Base.map(sym, names)), Tuple{types...}}()
Schema(names, ::Nothing) = Schema{Tuple(map(sym, names)), nothing}()

const SCHEMA_SPECIALIZATION_THRESHOLD = 67000

# General constructor from collections of names and types.
# When `stored=true` is passed, or when there are more than
# `SCHEMA_SPECIALIZATION_THRESHOLD` names, the names/types are kept in `Vector`s inside a
# `Schema{nothing, nothing}`; otherwise they are encoded as type parameters.
function Schema(names, types; stored::Bool=false)
    # `length(names)` is only consulted when `stored` was not requested explicitly
    typed = !stored && length(names) <= SCHEMA_SPECIALIZATION_THRESHOLD
    if typed
        return Schema{Tuple(map(sym, names)), Tuple{types...}}()
    end
    storednames = [sym(x) for x in names]
    storedtypes = Type[T for T in types]
    return Schema{nothing, nothing}(storednames, storedtypes)
end

function Base.show(io::IO, sch::Schema{names, types}) where {names, types}
function Base.show(io::IO, sch::Schema)
get(io, :print_schema_header, true) && println(io, "Tables.Schema:")
Base.print_matrix(io, hcat(collect(names), types === nothing ? fill(nothing, length(names)) : collect(fieldtype(types, i) for i = 1:fieldcount(types))))
nms = sch.names
Base.print_matrix(io, hcat(nms isa Vector ? nms : collect(nms), sch.types === nothing ? fill(nothing, length(nms)) : collect(sch.types)))
end

function Base.getproperty(sch::Schema{names, types}, field::Symbol) where {names, types}
if field === :names
return names
return names === nothing ? getfield(sch, :storednames) : names
elseif field === :types
return types === nothing ? nothing : Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
T = getfield(sch, :storedtypes)
return types === nothing ? (T !== nothing ? T : nothing) : Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
else
throw(ArgumentError("unsupported property for Tables.Schema"))
end
end

Base.propertynames(sch::Schema) = (:names, :types)
Base.propertynames(::Schema) = (:names, :types)
# Two schemas are equal when their `names` and their `types` (as returned by
# `getproperty`) compare equal.
==(a::Schema, b::Schema) = a.names == b.names && a.types == b.types
# Keep `hash` consistent with the `==` above: schemas that compare equal must hash
# equally, otherwise schemas holding equal stored vectors would land in different
# `Dict`/`Set` slots.
Base.hash(sch::Schema, h::UInt) = hash(sch.types, hash(sch.names, hash(:Schema, h)))

# partitions

Expand Down
8 changes: 4 additions & 4 deletions src/fallbacks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,17 @@ allocatecolumn(T, len) = DataAPI.defaultarray(T, 1)(undef, len)
@inline function _allocatecolumns(::Schema{names, types}, len) where {names, types}
if @generated
vals = Tuple(:(allocatecolumn($(fieldtype(types, i)), len)) for i = 1:fieldcount(types))
return :(NamedTuple{$(Base.map(Symbol, names))}(($(vals...),)))
return :(NamedTuple{$(map(Symbol, names))}(($(vals...),)))
else
return NamedTuple{Base.map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
return NamedTuple{map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
end
end

@inline function allocatecolumns(sch::Schema{names, types}, len) where {names, types}
if fieldcount(types) <= SPECIALIZATION_THRESHOLD
return _allocatecolumns(sch, len)
else
return NamedTuple{Base.map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
return NamedTuple{map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
end
end

Expand Down Expand Up @@ -214,7 +214,7 @@ end
len = Base.haslength(T) ? length(rowitr) : 0
sch = Schema(names, nothing)
columns = Tuple(EmptyVector(len) for _ = 1:length(names))
return NamedTuple{Base.map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[])
return NamedTuple{map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[])
end

"""
Expand Down
27 changes: 20 additions & 7 deletions src/namedtuples.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@ end
Pass any table input source and return a `NamedTuple` iterator
See also [`rows`](@ref) and [`rowtable`](@ref).
Not for use with extremely wide tables with # of columns > 67K; current fundamental compiler limits
prevent constructing `NamedTuple`s that large.
"""
function namedtupleiterator(x)
    rowitr = rows(x)
    sch = schema(rowitr)
    # a stored schema means the names/types are not available as type parameters,
    # so `NamedTuple` rows are refused up front
    if stored(sch)
        throw(ArgumentError("input table too wide ($(length(sch.names)) columns) to construct `NamedTuple` rows"))
    end
    return NamedTupleIterator{typeof(sch), typeof(rowitr)}(rowitr)
end

Expand All @@ -29,7 +33,7 @@ namedtupleiterator(T, x) = namedtupleiterator(x)

Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = Base.HasEltype()
Base.IteratorEltype(::Type{NamedTupleIterator{Nothing, T}}) where {T} = Base.EltypeUnknown()
Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{Base.map(Symbol, names), types}
Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{map(Symbol, names), types}
Base.IteratorSize(::Type{NamedTupleIterator{sch, T}}) where {sch, T} = Base.IteratorSize(T)
Base.length(nt::NamedTupleIterator) = length(nt.x)
Base.size(nt::NamedTupleIterator) = (length(nt.x),)
Expand All @@ -49,7 +53,7 @@ Base.size(nt::NamedTupleIterator) = (length(nt.x),)
x = iterate(rows.x, st...)
x === nothing && return nothing
row, st = x
return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
return NamedTuple{map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
end
end

Expand All @@ -60,7 +64,7 @@ end
x = iterate(rows.x, st...)
x === nothing && return nothing
row, st = x
return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
return NamedTuple{map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
end
end

Expand Down Expand Up @@ -91,6 +95,9 @@ naturally, i.e. a `Vector` naturally iterates its elements, and
indexing value by index, name, and getting all names).
For a lazy iterator over rows see [`rows`](@ref) and [`namedtupleiterator`](@ref).
Not for use with extremely wide tables with # of columns > 67K; current fundamental compiler limits
prevent constructing `NamedTuple`s that large.
"""
function rowtable end

Expand Down Expand Up @@ -130,29 +137,35 @@ Takes any input table source `x` and returns a `NamedTuple` of `Vector`s,
also known as a "column table". A "column table" is a kind of default
table type of sorts, since it satisfies the Tables.jl column interface
naturally.
Not for use with extremely wide tables with # of columns > 67K; current fundamental compiler limits
prevent constructing `NamedTuple`s that large.
"""
function columntable end

function _columntable(sch::Schema{names, types}, cols) where {names, types}
# use of @generated justified because it's user-controlled; they explicitly asked for namedtuple of vectors
if @generated
vals = Tuple(:(getarray(getcolumn(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types))
return :(NamedTuple{Base.map(Symbol, names)}(($(vals...),)))
return :(NamedTuple{map(Symbol, names)}(($(vals...),)))
else
return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
return NamedTuple{map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
end
end

function columntable(sch::Schema{names, types}, cols) where {names, types}
if fieldcount(types) <= SPECIALIZATION_THRESHOLD
return _columntable(sch, cols)
else
return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
return NamedTuple{map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
end
end

# extremely large tables: a stored schema is always rejected with an `ArgumentError`
function columntable(sch::Schema{nothing, nothing}, cols)
    msg = "input table too wide ($(length(sch.names)) columns) to convert to `NamedTuple` of `Vector`s"
    throw(ArgumentError(msg))
end

# unknown schema case
columntable(::Nothing, cols) = NamedTuple{Tuple(Base.map(Symbol, columnnames(cols)))}(Tuple(getarray(getcolumn(cols, col)) for col in columnnames(cols)))
columntable(::Nothing, cols) = NamedTuple{Tuple(map(Symbol, columnnames(cols)))}(Tuple(getarray(getcolumn(cols, col)) for col in columnnames(cols)))

function columntable(itr::T) where {T}
cols = columns(itr)
Expand Down
14 changes: 14 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ end
return
end

# Stored-schema method: the names are read from `sch.names` at runtime and visited
# with a plain loop, calling `f(value, index, name)` for each column of `row`.
@inline function eachcolumn(f::F, sch::Schema{nothing, nothing}, row::T) where {F, T}
    for (idx, colname) in enumerate(sch.names)
        value = getcolumn(row, colname)
        f(value, idx, colname)
    end
    return nothing
end

# these are specialized `eachcolumn`s where we also want
# the indexing of `columns` to be constant propagated, so it needs to be returned from the generated function
@inline function eachcolumns(f::F, sch::Schema{names, types}, row::T, columns::S, args...) where {F, names, types, T, S}
Expand Down Expand Up @@ -128,6 +135,13 @@ end
return
end

# Stored-schema method: the names are read from `sch.names` at runtime and visited
# with a plain loop, calling `f(value, index, name, columns[index], args...)` for
# each column of `row`.
@inline function eachcolumns(f::F, sch::Schema{nothing, nothing}, row::T, columns::S, args...) where {F, T, S}
    for (idx, colname) in enumerate(sch.names)
        value = getcolumn(row, colname)
        column = columns[idx]
        f(value, idx, colname, column, args...)
    end
    return nothing
end

"""
rowmerge(row, other_rows...)
rowmerge(row; fields_to_merge...)
Expand Down
74 changes: 74 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx

@test Tables.getarray([1,2,3]) == [1,2,3]
@test Tables.getarray((1,2,3)) == [1,2,3]

# empty schema
sch = Tables.Schema((), ())
@test sch.names == ()
@test sch.types == ()
end

@testset "namedtuples.jl" begin
Expand Down Expand Up @@ -703,3 +708,72 @@ end
@test isequal(dct.d, [missing, 5, 7, missing, 11])

end

# extremely wide tables
# Column-access table fixture with one more column than
# `Tables.SCHEMA_SPECIALIZATION_THRESHOLD`; every column lookup returns a fresh
# `rand(100)` vector.
struct WideTable <: Tables.AbstractColumns end

Tables.istable(::Type{WideTable}) = true
Tables.columnaccess(::Type{WideTable}) = true
Tables.columns(x::WideTable) = x
function Tables.schema(::WideTable)
    ncols = Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1
    return Tables.Schema(map(i -> Symbol("x", i), 1:ncols), fill(Float64, ncols))
end
Tables.getcolumn(::WideTable, ::Symbol) = rand(100)
Tables.getcolumn(::WideTable, ::Int) = rand(100)
Base.getindex(::WideTable, ::Int) = rand(100)
function Tables.columnnames(::WideTable)
    ncols = Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1
    return map(i -> Symbol("x", i), 1:ncols)
end

# Column-access table fixture with 1000 columns whose schema is built with
# `stored=true`; every column lookup returns a fresh `rand(100)` vector.
struct WideTable2 <: Tables.AbstractColumns end

Tables.istable(::Type{WideTable2}) = true
Tables.columnaccess(::Type{WideTable2}) = true
Tables.columns(x::WideTable2) = x
function Tables.schema(::WideTable2)
    colnames = map(i -> Symbol("x", i), 1:1000)
    coltypes = fill(Float64, 1000)
    return Tables.Schema(colnames, coltypes; stored=true)
end
Tables.getcolumn(::WideTable2, ::Symbol) = rand(100)
Base.getindex(::WideTable2, ::Int) = rand(100)
Tables.columnnames(::WideTable2) = map(i -> Symbol("x", i), 1:1000)

# Exercises schemas whose names/types are stored in vectors (`Schema{nothing, nothing}`):
# first via a table that exceeds `SCHEMA_SPECIALIZATION_THRESHOLD`, then via a
# 1000-column table whose schema is built with `stored=true`.
@testset "wide tables" begin
# --- WideTable: one column more than the specialization threshold ---
x = WideTable();
sch = Tables.schema(x)
# names/types come back as vectors, and the schema type carries no names/types
@test sch.names == [Symbol("x", i) for i = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)]
@test sch.types == [Float64 for _ = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)]
@test typeof(sch) == Tables.Schema{nothing, nothing}
r = Tables.rows(x)
# eachcolumn passes (value, index, name) to the callback
Tables.eachcolumn(sch, first(r)) do y, i, nm
@test y isa Float64
@test i isa Integer
@test nm isa Symbol
end
# eachcolumns additionally passes the matching entry of the `columns` argument (here `x`)
Tables.eachcolumns(sch, first(r), x) do y, i, nm, col
@test y isa Float64
@test i isa Integer
@test nm isa Symbol
@test col isa Vector{Float64}
end
# NamedTuple-based materializations are expected to refuse a table this wide
@test_throws ArgumentError Tables.columntable(x)
@test_throws ArgumentError Tables.rowtable(x)
# Dict-based materializations are expected to work
y = Tables.dictrowtable(x);
@test length(y) == 100
y = Tables.dictcolumntable(x);
@test Tables.schema(y) == Tables.schema(x)
# y = Tables.matrix(x); # works, just takes a really long time and a lot of memory

# --- WideTable2: 1000 columns, schema built with `stored=true` ---
x = WideTable2();
sch = Tables.schema(x)
@test sch.names == [Symbol("x", i) for i = 1:1000]
@test sch.types == [Float64 for _ = 1:1000]
@test typeof(sch) == Tables.Schema{nothing, nothing}
r = Tables.rows(x)
# same callback checks as for WideTable above
Tables.eachcolumn(sch, first(r)) do y, i, nm
@test y isa Float64
@test i isa Integer
@test nm isa Symbol
end
Tables.eachcolumns(sch, first(r), x) do y, i, nm, col
@test y isa Float64
@test i isa Integer
@test nm isa Symbol
@test col isa Vector{Float64}
end
end

0 comments on commit aab3e55

Please sign in to comment.