Allow stored names/types in Schema for very large schemas (#241)

* Allow stored names/types in Schema for very large schemas. There have been a few cases of extremely wide tables where users have run into fundamental compiler limits for lengths of tuples (as discussed with core devs). One example is JuliaData/CSV.jl#635. This PR proposes, for very large schemas (> 67,000 columns), to store names/types in a `Vector` instead of tuples, with the aim of avoiding breaking the runtime. The aim here is to be as non-disruptive as possible, hence the very high threshold for switching over to stored names/types. Another goal is that downstream packages don't break with just these changes in place. I'm not aware of any packages testing such wide tables, but in my own testing, I've seen issues where packages are relying on the `Tables.Schema` type parameters for names/types. There's also an issue in DataFrames where `Tables.schema` attempts to construct a `Tables.Schema` directly instead of using the `Tables.Schema(names, types)` constructor. So while this PR is needed, we'll need to play whack-a-mole with downstream packages to ensure these really wide tables can be properly supported end-to-end. Going through those downstream package changes, we should probably make notes of how we can clarify Tables.jl interface docs to hopefully help future implementors do so properly and avoid the same pitfalls. * Add tests; update eachcolumn/eachcolumns * Add some more testing for Tables.jl-provided types * fix * fix2 * fix corner case * fix tests
JuliaData · Jun 23, 2021 · aab3e55 · aab3e55
1 parent d038805
commit aab3e55
Show file tree

Hide file tree

Showing 5 changed files with 157 additions and 27 deletions.
diff --git a/src/Tables.jl b/src/Tables.jl
@@ -8,6 +8,8 @@ if !hasmethod(getproperty, Tuple{Tuple, Int})
     Base.getproperty(t::Tuple, i::Int) = t[i]
 end
 
+import Base: ==
+
 """
     Tables.AbstractColumns
 
@@ -187,7 +189,7 @@ Base.isempty(r::RorC) = length(r) == 0
 
 function Base.NamedTuple(r::RorC)
     names = columnnames(r)
-    return NamedTuple{Tuple(Base.map(Symbol, names))}(Tuple(getcolumn(r, nm) for nm in names))
+    return NamedTuple{Tuple(map(Symbol, names))}(Tuple(getcolumn(r, nm) for nm in names))
 end
 
 function Base.show(io::IO, x::T) where {T <: AbstractRow}
@@ -402,44 +404,71 @@ are unknown (usually not inferrable). This is similar to the `Base.EltypeUnknown
 when `Base.IteratorEltype` is called. Users should account for the `Tables.schema(tbl) => nothing` case
 by using the properties of the results of `Tables.rows(x)` and `Tables.columns(x)` directly.
 
-To access the names, one can simply call `sch.names` to return the tuple of Symbols.
-To access column element types, one can similarly call `sch.types`, which will return a tuple of types (like `(Int64, Float64, String)`).
+To access the names, one can simply call `sch.names` to return a collection of Symbols (`Tuple` or `Vector`).
+To access column element types, one can similarly call `sch.types`, which will return a collection of types (like `(Int64, Float64, String)`).
 
 The actual type definition is
 ```julia
-struct Schema{names, types} end
+struct Schema{names, types}
+    storednames::Union{Nothing, Vector{Symbol}}
+    storedtypes::Union{Nothing, Vector{Type}}
+end
 ```
-Where `names` is a tuple of Symbols, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`).
+Where `names` is a tuple of `Symbol`s or `nothing`, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`) or `nothing`.
 Encoding the names & types as type parameters allows convenient use of the type in generated functions
-and other optimization use-cases.
-"""
-struct Schema{names, types} end
-Schema(names::Tuple{Vararg{Symbol}}, types::Type{T}) where {T <: Tuple} = Schema{names, T}()
+and other optimization use-cases, but users should note that when `names` and/or `types` are the `nothing` value, the names and/or types
+are stored in the `storednames` and `storedtypes` fields. This is to account for extremely wide tables with columns in the 10s of thousands
+where encoding the names/types as type parameters becomes prohibitive to the compiler. So while optimizations can be written on the typed
+`names`/`types` type parameters, users should also consider handling the extremely wide tables by specializing on `Tables.Schema{nothing, nothing}`.
+"""
+struct Schema{names, types}
+    storednames::Union{Nothing, Vector{Symbol}}  # column names when not encoded in the `names` type parameter; otherwise `nothing`
+    storedtypes::Union{Nothing, Vector{Type}}  # column types when not encoded in the `types` type parameter; otherwise `nothing`
+end
+
+Schema{names, types}() where {names, types} = Schema{names, types}(nothing, nothing)  # zero-arg form: nothing is stored, names/types live in the type parameters
+Schema(names::Tuple{Vararg{Symbol}}, ::Type{T}) where {T <: Tuple} = Schema{names, T}()  # tuple of Symbols plus a Tuple type: both become type parameters
 Schema(::Type{NamedTuple{names, types}}) where {names, types} = Schema{names, types}()
 
+# whether names/types are stored or not
+stored(::Schema{names, types}) where {names, types} = names === nothing && types === nothing  # true only when both type parameters are `nothing`
+stored(::Nothing) = false  # a `nothing` argument (no schema) is never "stored"
+
 # pass through Ints to allow Tuples to act as rows
 sym(x) = Symbol(x)
 sym(x::Int) = x
 
-Schema(names, ::Nothing) = Schema{Tuple(Base.map(sym, names)), nothing}()
-Schema(names, types) = Schema{Tuple(Base.map(sym, names)), Tuple{types...}}()
+Schema(names, ::Nothing) = Schema{Tuple(map(sym, names)), nothing}()
+
+const SCHEMA_SPECIALIZATION_THRESHOLD = 67000  # column count above which names/types are kept in Vectors instead of type parameters
+
+function Schema(names, types; stored::Bool=false)  # general constructor; `stored=true` forces Vector-backed storage regardless of width
+    if stored || length(names) > SCHEMA_SPECIALIZATION_THRESHOLD  # strictly more than the threshold, or explicitly requested
+        return Schema{nothing, nothing}([sym(x) for x in names], Type[T for T in types])  # NOTE(review): `sym` passes Ints through, which may not fit the `Vector{Symbol}` field — confirm
+    else
+        return Schema{Tuple(map(sym, names)), Tuple{types...}}()  # names/types encoded as type parameters; nothing stored
+    end
+end
 
-function Base.show(io::IO, sch::Schema{names, types}) where {names, types}
+function Base.show(io::IO, sch::Schema)
     get(io, :print_schema_header, true) && println(io, "Tables.Schema:")
-    Base.print_matrix(io, hcat(collect(names), types === nothing ? fill(nothing, length(names)) : collect(fieldtype(types, i) for i = 1:fieldcount(types))))
+    nms = sch.names
+    Base.print_matrix(io, hcat(nms isa Vector ? nms : collect(nms), sch.types === nothing ? fill(nothing, length(nms)) : collect(sch.types)))
 end
 
 function Base.getproperty(sch::Schema{names, types}, field::Symbol) where {names, types}
     if field === :names
-        return names
+        return names === nothing ? getfield(sch, :storednames) : names
     elseif field === :types
-        return types === nothing ? nothing : Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
+        T = getfield(sch, :storedtypes)
+        return types === nothing ? (T !== nothing ? T : nothing) : Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
     else
         throw(ArgumentError("unsupported property for Tables.Schema"))
     end
 end
 
-Base.propertynames(sch::Schema) = (:names, :types)
+Base.propertynames(::Schema) = (:names, :types)  # only the two virtual properties are advertised
+==(a::Schema, b::Schema) = a.names == b.names && a.types == b.types  # compares via the `names`/`types` properties; NOTE(review): no matching `Base.hash` method is visible in this hunk — confirm
 
 # partitions
 

diff --git a/src/fallbacks.jl b/src/fallbacks.jl
@@ -109,17 +109,17 @@ allocatecolumn(T, len) = DataAPI.defaultarray(T, 1)(undef, len)
 @inline function _allocatecolumns(::Schema{names, types}, len) where {names, types}
     if @generated
         vals = Tuple(:(allocatecolumn($(fieldtype(types, i)), len)) for i = 1:fieldcount(types))
-        return :(NamedTuple{$(Base.map(Symbol, names))}(($(vals...),)))
+        return :(NamedTuple{$(map(Symbol, names))}(($(vals...),)))
     else
-        return NamedTuple{Base.map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
+        return NamedTuple{map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
     end
 end
 
 @inline function allocatecolumns(sch::Schema{names, types}, len) where {names, types}
     if fieldcount(types) <= SPECIALIZATION_THRESHOLD
         return _allocatecolumns(sch, len)
     else
-        return NamedTuple{Base.map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
+        return NamedTuple{map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
     end
 end
 
@@ -214,7 +214,7 @@ end
     len = Base.haslength(T) ? length(rowitr) : 0
     sch = Schema(names, nothing)
     columns = Tuple(EmptyVector(len) for _ = 1:length(names))
-    return NamedTuple{Base.map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[])
+    return NamedTuple{map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[])
 end
 
 """

diff --git a/src/namedtuples.jl b/src/namedtuples.jl
@@ -17,10 +17,14 @@ end
 Pass any table input source and return a `NamedTuple` iterator
 
 See also [`rows`](@ref) and [`rowtable`](@ref).
+
+Not for use with extremely wide tables with # of columns > 67K; current fundamental compiler limits
+prevent constructing `NamedTuple`s that large.
 """
 function namedtupleiterator(x)
     r = rows(x)
     sch = schema(r)
+    stored(sch) && throw(ArgumentError("input table too wide ($(length(sch.names)) columns) to construct `NamedTuple` rows"))
     return NamedTupleIterator{typeof(sch), typeof(r)}(r)
 end
 
@@ -29,7 +33,7 @@ namedtupleiterator(T, x) = namedtupleiterator(x)
 
 Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = Base.HasEltype()
 Base.IteratorEltype(::Type{NamedTupleIterator{Nothing, T}}) where {T} = Base.EltypeUnknown()
-Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{Base.map(Symbol, names), types}
+Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{map(Symbol, names), types}
 Base.IteratorSize(::Type{NamedTupleIterator{sch, T}}) where {sch, T} = Base.IteratorSize(T)
 Base.length(nt::NamedTupleIterator) = length(nt.x)
 Base.size(nt::NamedTupleIterator) = (length(nt.x),)
@@ -49,7 +53,7 @@ Base.size(nt::NamedTupleIterator) = (length(nt.x),)
         x = iterate(rows.x, st...)
         x === nothing && return nothing
         row, st = x
-        return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
+        return NamedTuple{map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
     end
 end
 
@@ -60,7 +64,7 @@ end
         x = iterate(rows.x, st...)
         x === nothing && return nothing
         row, st = x
-        return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
+        return NamedTuple{map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
     end
 end
 
@@ -91,6 +95,9 @@ naturally, i.e. a `Vector` naturally iterates its elements, and
 indexing value by index, name, and getting all names).
 
 For a lazy iterator over rows see [`rows`](@ref) and [`namedtupleiterator`](@ref).
+
+Not for use with extremely wide tables with # of columns > 67K; current fundamental compiler limits
+prevent constructing `NamedTuple`s that large.
 """
 function rowtable end
 
@@ -130,29 +137,35 @@ Takes any input table source `x` and returns a `NamedTuple` of `Vector`s,
 also known as a "column table". A "column table" is a kind of default
 table type of sorts, since it satisfies the Tables.jl column interface
 naturally.
+
+Not for use with extremely wide tables with # of columns > 67K; current fundamental compiler limits
+prevent constructing `NamedTuple`s that large.
 """
 function columntable end
 
 function _columntable(sch::Schema{names, types}, cols) where {names, types}
     # use of @generated justified because it's user-controlled; they explicitly asked for namedtuple of vectors
     if @generated
         vals = Tuple(:(getarray(getcolumn(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types))
-        return :(NamedTuple{Base.map(Symbol, names)}(($(vals...),)))
+        return :(NamedTuple{map(Symbol, names)}(($(vals...),)))
     else
-        return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
+        return NamedTuple{map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
     end
 end
 
 function columntable(sch::Schema{names, types}, cols) where {names, types}
     if fieldcount(types) <= SPECIALIZATION_THRESHOLD
         return _columntable(sch, cols)
     else
-        return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
+        return NamedTuple{map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
     end
 end
 
+# extremely large tables: a schema with stored names/types is rejected with an ArgumentError instead of building a `NamedTuple`
+columntable(sch::Schema{nothing, nothing}, cols) = throw(ArgumentError("input table too wide ($(length(sch.names)) columns) to convert to `NamedTuple` of `Vector`s"))
+
 # unknown schema case
-columntable(::Nothing, cols) = NamedTuple{Tuple(Base.map(Symbol, columnnames(cols)))}(Tuple(getarray(getcolumn(cols, col)) for col in columnnames(cols)))
+columntable(::Nothing, cols) = NamedTuple{Tuple(map(Symbol, columnnames(cols)))}(Tuple(getarray(getcolumn(cols, col)) for col in columnnames(cols)))
 
 function columntable(itr::T) where {T}
     cols = columns(itr)

diff --git a/src/utils.jl b/src/utils.jl
@@ -94,6 +94,13 @@ end
     return
 end
 
+@inline function eachcolumn(f::F, sch::Schema{nothing, nothing}, row::T) where {F, T}  # method for schemas whose names/types are stored
+    for (i, nm) in enumerate(sch.names)  # iterate the stored names at runtime; `i` is the 1-based position
+        f(getcolumn(row, nm), i, nm)  # value is looked up by name; `f` receives (value, index, name)
+    end
+    return  # returns `nothing`; only the calls to `f` matter
+end
+
 # these are specialized `eachcolumn`s where we also want
 # the indexing of `columns` to be constant propagated, so it needs to be returned from the generated function
 @inline function eachcolumns(f::F, sch::Schema{names, types}, row::T, columns::S, args...) where {F, names, types, T, S}
@@ -128,6 +135,13 @@ end
     return
 end
 
+@inline function eachcolumns(f::F, sch::Schema{nothing, nothing}, row::T, columns::S, args...) where {F, T, S}  # method for schemas whose names/types are stored
+    for (i, nm) in enumerate(sch.names)  # iterate the stored names at runtime; `i` is the 1-based position
+        f(getcolumn(row, nm), i, nm, columns[i], args...)  # `columns` is indexed by position; extra `args` are forwarded unchanged
+    end
+    return  # returns `nothing`; only the calls to `f` matter
+end
+
 """
     rowmerge(row, other_rows...)
     rowmerge(row; fields_to_merge...)

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -115,6 +115,11 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx
 
     @test Tables.getarray([1,2,3]) == [1,2,3]
     @test Tables.getarray((1,2,3)) == [1,2,3]
+
+    # empty schema
+    sch = Tables.Schema((), ())
+    @test sch.names == ()
+    @test sch.types == ()
 end
 
 @testset "namedtuples.jl" begin
@@ -703,3 +708,72 @@ end
     @test isequal(dct.d, [missing, 5, 7, missing, 11])
 
 end
+
+# extremely wide tables: fixture with one more column than SCHEMA_SPECIALIZATION_THRESHOLD
+struct WideTable <: Tables.AbstractColumns
+end
+
+Tables.istable(::Type{WideTable}) = true
+Tables.columnaccess(::Type{WideTable}) = true
+Tables.columns(x::WideTable) = x  # the table acts as its own columns object
+Tables.schema(::WideTable) = Tables.Schema([Symbol("x", i) for i = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)], [Float64 for _ = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)])
+Tables.getcolumn(g::WideTable, nm::Symbol) = rand(100)  # every column is a fresh random 100-element vector
+Tables.getcolumn(g::WideTable, i::Int) = rand(100)
+Base.getindex(::WideTable, i::Int) = rand(100)  # positional indexing, as used by `columns[i]` in `eachcolumns`
+Tables.columnnames(::WideTable) = [Symbol("x", i) for i = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)]
+
+struct WideTable2 <: Tables.AbstractColumns  # fixture: only 1000 columns, but its schema is built with `stored=true`
+end
+
+Tables.istable(::Type{WideTable2}) = true
+Tables.columnaccess(::Type{WideTable2}) = true
+Tables.columns(x::WideTable2) = x  # the table acts as its own columns object
+Tables.schema(::WideTable2) = Tables.Schema([Symbol("x", i) for i = 1:1000], [Float64 for _ = 1:1000]; stored=true)
+Tables.getcolumn(g::WideTable2, nm::Symbol) = rand(100)  # NOTE(review): unlike WideTable, no Int-indexed getcolumn method is defined here — confirm a fallback covers it
+Base.getindex(::WideTable2, i::Int) = rand(100)  # positional indexing, as used by `columns[i]` in `eachcolumns`
+Tables.columnnames(::WideTable2) = [Symbol("x", i) for i = 1:1000]
+
+@testset "wide tables" begin
+    x = WideTable();
+    sch = Tables.schema(x)  # threshold + 1 columns, so the constructor should take the stored branch
+    @test sch.names == [Symbol("x", i) for i = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)]
+    @test sch.types == [Float64 for _ = 1:(Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1)]
+    @test typeof(sch) == Tables.Schema{nothing, nothing}
+    r = Tables.rows(x)
+    Tables.eachcolumn(sch, first(r)) do y, i, nm  # exercises the Schema{nothing, nothing} method of eachcolumn
+        @test y isa Float64
+        @test i isa Integer
+        @test nm isa Symbol
+    end
+    Tables.eachcolumns(sch, first(r), x) do y, i, nm, col  # `x` doubles as the `columns` argument
+        @test y isa Float64
+        @test i isa Integer
+        @test nm isa Symbol
+        @test col isa Vector{Float64}
+    end
+    @test_throws ArgumentError Tables.columntable(x)  # NamedTuple-based outputs must reject a stored schema
+    @test_throws ArgumentError Tables.rowtable(x)
+    y = Tables.dictrowtable(x);
+    @test length(y) == 100  # presumably 100 rows because each column is `rand(100)` — confirm
+    y = Tables.dictcolumntable(x);
+    @test Tables.schema(y) == Tables.schema(x)  # relies on the `==` method defined for Schema
+    # y = Tables.matrix(x); # works, just takes a really long time and a lot of memory
+
+    x = WideTable2();  # narrow table whose schema is explicitly requested as stored
+    sch = Tables.schema(x)
+    @test sch.names == [Symbol("x", i) for i = 1:1000]
+    @test sch.types == [Float64 for _ = 1:1000]
+    @test typeof(sch) == Tables.Schema{nothing, nothing}
+    r = Tables.rows(x)
+    Tables.eachcolumn(sch, first(r)) do y, i, nm
+        @test y isa Float64
+        @test i isa Integer
+        @test nm isa Symbol
+    end
+    Tables.eachcolumns(sch, first(r), x) do y, i, nm, col
+        @test y isa Float64
+        @test i isa Integer
+        @test nm isa Symbol
+        @test col isa Vector{Float64}
+    end
+end