JuliaData · quinnj · Jun 23, 2021 · Jun 18, 2021 · Jun 22, 2021 · Jun 22, 2021
diff --git a/src/Tables.jl b/src/Tables.jl
@@ -8,6 +8,8 @@ if !hasmethod(getproperty, Tuple{Tuple, Int})
     Base.getproperty(t::Tuple, i::Int) = t[i]
 end
 
+import Base: ==
+
 """
     Tables.AbstractColumns
 
@@ -187,7 +189,7 @@ Base.isempty(r::RorC) = length(r) == 0
 
 function Base.NamedTuple(r::RorC)
     names = columnnames(r)
-    return NamedTuple{Tuple(Base.map(Symbol, names))}(Tuple(getcolumn(r, nm) for nm in names))
+    return NamedTuple{Tuple(map(Symbol, names))}(Tuple(getcolumn(r, nm) for nm in names))
 end
 
 function Base.show(io::IO, x::T) where {T <: AbstractRow}
@@ -402,44 +404,78 @@ are unknown (usually not inferrable). This is similar to the `Base.EltypeUnknown
 when `Base.IteratorEltype` is called. Users should account for the `Tables.schema(tbl) => nothing` case
 by using the properties of the results of `Tables.rows(x)` and `Tables.columns(x)` directly.
 
-To access the names, one can simply call `sch.names` to return the tuple of Symbols.
-To access column element types, one can similarly call `sch.types`, which will return a tuple of types (like `(Int64, Float64, String)`).
+To access the names, one can simply call `sch.names` to return a collection of Symbols (`Tuple` or `Vector`).
+To access column element types, one can similarly call `sch.types`, which will return a collection of types (like `(Int64, Float64, String)`).
 
 The actual type definition is
 ```julia
-struct Schema{names, types} end
+struct Schema{names, types}
+    storednames::Union{Nothing, Vector{Symbol}}
+    storedtypes::Union{Nothing, Vector{Type}}
+end
 ```
-Where `names` is a tuple of Symbols, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`).
+Where `names` is a tuple of `Symbol`s or `nothing`, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`) or `nothing`.
 Encoding the names & types as type parameters allows convenient use of the type in generated functions
-and other optimization use-cases.
-"""
-struct Schema{names, types} end
-Schema(names::Tuple{Vararg{Symbol}}, types::Type{T}) where {T <: Tuple} = Schema{names, T}()
+and other optimization use-cases, but users should note that when `names` and/or `types` are the `nothing` value, the names and/or types
+are stored in the `storednames` and `storedtypes` fields. This is to account for extremely wide tables with columns in the 10s of thousands
+where encoding the names/types as type parameters becomes prohibitive to the compiler. So while optimizations can be written on the typed
+`names`/`types` type parameters, users should also consider handling the extremely wide tables by specializing on `Tables.Schema{nothing, nothing}`.
+`Tables.Schema(names, types)` switches to the stored form by itself for more than 67,000 columns; pass `stored=true` to request it for any width.
+Schemas compare (`==`) and hash by the values of `sch.names` and `sch.types`.
+"""
+struct Schema{names, types}
+    storednames::Union{Nothing, Vector{Symbol}}
+    storedtypes::Union{Nothing, Vector{Type}}
+end
+
+# zero-argument form: everything is carried by the type parameters, so nothing is stored
+Schema{names, types}() where {names, types} = Schema{names, types}(nothing, nothing)
+Schema(names::Tuple{Vararg{Symbol}}, ::Type{T}) where {T <: Tuple} = Schema{names, T}()
 Schema(::Type{NamedTuple{names, types}}) where {names, types} = Schema{names, types}()
 
+# whether names/types are stored in the fields, i.e. both type parameters are `nothing`
+stored(::Schema{names, types}) where {names, types} = names === nothing && types === nothing
+stored(::Nothing) = false
+
 # pass through Ints to allow Tuples to act as rows
 sym(x) = Symbol(x)
 sym(x::Int) = x
 
-Schema(names, ::Nothing) = Schema{Tuple(Base.map(sym, names)), nothing}()
-Schema(names, types) = Schema{Tuple(Base.map(sym, names)), Tuple{types...}}()
+Schema(names, ::Nothing) = Schema{Tuple(map(sym, names)), nothing}()
+
+# column count above which `Schema(names, types)` stops encoding names/types as type parameters
+const SCHEMA_SPECIALIZATION_THRESHOLD = 67000
+
+# `stored=true` forces the field-stored `Schema{nothing, nothing}` form for any width; otherwise
+# names/types become type parameters unless there are more names than the threshold allows
+function Schema(names, types; stored::Bool=false)
+    if stored || length(names) > SCHEMA_SPECIALIZATION_THRESHOLD
+        return Schema{nothing, nothing}([sym(x) for x in names], Type[T for T in types])
+    else
+        return Schema{Tuple(map(sym, names)), Tuple{types...}}()
+    end
+end
 
-function Base.show(io::IO, sch::Schema{names, types}) where {names, types}
+function Base.show(io::IO, sch::Schema)
     get(io, :print_schema_header, true) && println(io, "Tables.Schema:")
-    Base.print_matrix(io, hcat(collect(names), types === nothing ? fill(nothing, length(names)) : collect(fieldtype(types, i) for i = 1:fieldcount(types))))
+    nms = sch.names
+    Base.print_matrix(io, hcat(nms isa Vector ? nms : collect(nms), sch.types === nothing ? fill(nothing, length(nms)) : collect(sch.types)))
 end
 
 function Base.getproperty(sch::Schema{names, types}, field::Symbol) where {names, types}
     if field === :names
-        return names
+        return names === nothing ? getfield(sch, :storednames) : names
     elseif field === :types
-        return types === nothing ? nothing : Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
+        return types === nothing ? getfield(sch, :storedtypes) : Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
     else
         throw(ArgumentError("unsupported property for Tables.Schema"))
     end
 end
 
-Base.propertynames(sch::Schema) = (:names, :types)
+Base.propertynames(::Schema) = (:names, :types)
+==(a::Schema, b::Schema) = a.names == b.names && a.types == b.types
+# `==` compares by value, so `hash` has to agree or equal schemas misbehave as `Dict`/`Set` keys
+Base.hash(sch::Schema, h::UInt) = hash(sch.types, hash(sch.names, h))
 
 # partitions
 

diff --git a/src/fallbacks.jl b/src/fallbacks.jl
@@ -109,17 +109,17 @@ allocatecolumn(T, len) = DataAPI.defaultarray(T, 1)(undef, len)
 @inline function _allocatecolumns(::Schema{names, types}, len) where {names, types}
     if @generated
         vals = Tuple(:(allocatecolumn($(fieldtype(types, i)), len)) for i = 1:fieldcount(types))
-        return :(NamedTuple{$(Base.map(Symbol, names))}(($(vals...),)))
+        return :(NamedTuple{$(map(Symbol, names))}(($(vals...),)))
     else
-        return NamedTuple{Base.map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
+        return NamedTuple{map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
     end
 end
 
 @inline function allocatecolumns(sch::Schema{names, types}, len) where {names, types}
     if fieldcount(types) <= SPECIALIZATION_THRESHOLD
         return _allocatecolumns(sch, len)
     else
-        return NamedTuple{Base.map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
+        return NamedTuple{map(Symbol, names)}(Tuple(allocatecolumn(fieldtype(types, i), len) for i = 1:fieldcount(types)))
     end
 end
 
@@ -214,7 +214,7 @@ end
     len = Base.haslength(T) ? length(rowitr) : 0
     sch = Schema(names, nothing)
     columns = Tuple(EmptyVector(len) for _ = 1:length(names))
-    return NamedTuple{Base.map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[])
+    return NamedTuple{map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[])
 end
 
 """

diff --git a/src/namedtuples.jl b/src/namedtuples.jl
@@ -17,10 +17,17 @@ end
 Pass any table input source and return a `NamedTuple` iterator
 
 See also [`rows`](@ref) and [`rowtable`](@ref).
+
+Extremely wide tables (more than 67K columns) are not supported: current fundamental compiler
+limits prevent constructing `NamedTuple`s that large, so an `ArgumentError` is thrown instead.
 """
 function namedtupleiterator(x)
     r = rows(x)
     sch = schema(r)
+    # a stored schema has no names/types in its type parameters to build `NamedTuple` rows from
+    if stored(sch)
+        throw(ArgumentError("input table too wide ($(length(sch.names)) columns) to construct `NamedTuple` rows"))
+    end
     return NamedTupleIterator{typeof(sch), typeof(r)}(r)
 end
 
@@ -29,7 +33,7 @@ namedtupleiterator(T, x) = namedtupleiterator(x)
 
 Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = Base.HasEltype()
 Base.IteratorEltype(::Type{NamedTupleIterator{Nothing, T}}) where {T} = Base.EltypeUnknown()
-Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{Base.map(Symbol, names), types}
+Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{map(Symbol, names), types}
 Base.IteratorSize(::Type{NamedTupleIterator{sch, T}}) where {sch, T} = Base.IteratorSize(T)
 Base.length(nt::NamedTupleIterator) = length(nt.x)
 Base.size(nt::NamedTupleIterator) = (length(nt.x),)
@@ -49,7 +53,7 @@ Base.size(nt::NamedTupleIterator) = (length(nt.x),)
         x = iterate(rows.x, st...)
         x === nothing && return nothing
         row, st = x
-        return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
+        return NamedTuple{map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
     end
 end
 
@@ -60,7 +64,7 @@ end
         x = iterate(rows.x, st...)
         x === nothing && return nothing
         row, st = x
-        return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
+        return NamedTuple{map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,)
     end
 end
 
@@ -91,6 +95,9 @@ naturally, i.e. a `Vector` naturally iterates its elements, and
 indexing value by index, name, and getting all names).
 
 For a lazy iterator over rows see [`rows`](@ref) and [`namedtupleiterator`](@ref).
+
+Extremely wide tables (more than 67K columns) are not supported: current fundamental compiler
+limits prevent constructing `NamedTuple`s that large, so an `ArgumentError` is thrown instead.
 """
 function rowtable end
 
@@ -130,29 +137,35 @@ Takes any input table source `x` and returns a `NamedTuple` of `Vector`s,
 also known as a "column table". A "column table" is a kind of default
 table type of sorts, since it satisfies the Tables.jl column interface
 naturally.
+
+Extremely wide tables (more than 67K columns) are not supported: current fundamental compiler
+limits prevent constructing `NamedTuple`s that large, so an `ArgumentError` is thrown instead.
 """
 function columntable end
 
 function _columntable(sch::Schema{names, types}, cols) where {names, types}
     # use of @generated justified because it's user-controlled; they explicitly asked for namedtuple of vectors
     if @generated
         vals = Tuple(:(getarray(getcolumn(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types))
-        return :(NamedTuple{Base.map(Symbol, names)}(($(vals...),)))
+        return :(NamedTuple{map(Symbol, names)}(($(vals...),)))
     else
-        return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
+        return NamedTuple{map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
     end
 end
 
 function columntable(sch::Schema{names, types}, cols) where {names, types}
     if fieldcount(types) <= SPECIALIZATION_THRESHOLD
         return _columntable(sch, cols)
     else
-        return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
+        return NamedTuple{map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types)))
     end
 end
 
+# extremely large tables
+columntable(sch::Schema{nothing, nothing}, cols) = throw(ArgumentError("input table too wide ($(length(sch.names)) columns) to convert to `NamedTuple` of `Vector`s"))
+
 # unknown schema case
-columntable(::Nothing, cols) = NamedTuple{Tuple(Base.map(Symbol, columnnames(cols)))}(Tuple(getarray(getcolumn(cols, col)) for col in columnnames(cols)))
+columntable(::Nothing, cols) = NamedTuple{Tuple(map(Symbol, columnnames(cols)))}(Tuple(getarray(getcolumn(cols, col)) for col in columnnames(cols)))
 
 function columntable(itr::T) where {T}
     cols = columns(itr)

diff --git a/src/utils.jl b/src/utils.jl
@@ -94,6 +94,17 @@ end
     return
 end
 
+# stored-schema method: the names are only known at runtime (from the schema's fields),
+# so visit them with a plain loop
+@inline function eachcolumn(f::F, sch::Schema{nothing, nothing}, row::T) where {F, T}
+    colnames = sch.names
+    for idx in eachindex(colnames)
+        nm = colnames[idx]
+        f(getcolumn(row, nm), idx, nm)
+    end
+    return
+end
+
 # these are specialized `eachcolumn`s where we also want
 # the indexing of `columns` to be constant propagated, so it needs to be returned from the generated function
 @inline function eachcolumns(f::F, sch::Schema{names, types}, row::T, columns::S, args...) where {F, names, types, T, S}
@@ -128,6 +135,17 @@ end
     return
 end
 
+# stored-schema method: a runtime loop over the stored names that also hands `f`
+# the entry of `columns` at the same position
+@inline function eachcolumns(f::F, sch::Schema{nothing, nothing}, row::T, columns::S, args...) where {F, T, S}
+    colnames = sch.names
+    for idx in eachindex(colnames)
+        nm = colnames[idx]
+        f(getcolumn(row, nm), idx, nm, columns[idx], args...)
+    end
+    return
+end
+
 """
     rowmerge(row, other_rows...)
     rowmerge(row; fields_to_merge...)

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -115,6 +115,11 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx
 
     @test Tables.getarray([1,2,3]) == [1,2,3]
     @test Tables.getarray((1,2,3)) == [1,2,3]
+
+    # empty schema
+    sch = Tables.Schema((), ())
+    @test sch.names == ()
+    @test sch.types == ()
 end
 
 @testset "namedtuples.jl" begin
@@ -703,3 +708,66 @@ end
     @test isequal(dct.d, [missing, 5, 7, missing, 11])
 
 end
+
+# extremely wide tables
+# one column past the threshold, so `Tables.Schema` picks the stored representation by itself
+const WIDE_NCOLS = Tables.SCHEMA_SPECIALIZATION_THRESHOLD + 1
+
+struct WideTable <: Tables.AbstractColumns
+end
+
+Tables.istable(::Type{WideTable}) = true
+Tables.columnaccess(::Type{WideTable}) = true
+Tables.columns(x::WideTable) = x
+Tables.schema(::WideTable) = Tables.Schema([Symbol("x", i) for i = 1:WIDE_NCOLS], fill(Float64, WIDE_NCOLS))
+Tables.getcolumn(g::WideTable, nm::Symbol) = rand(100)
+Tables.getcolumn(g::WideTable, i::Int) = rand(100)
+Base.getindex(::WideTable, i::Int) = rand(100)
+Tables.columnnames(::WideTable) = [Symbol("x", i) for i = 1:WIDE_NCOLS]
+
+# narrow table that opts into the stored representation through `stored=true`
+struct WideTable2 <: Tables.AbstractColumns
+end
+
+Tables.istable(::Type{WideTable2}) = true
+Tables.columnaccess(::Type{WideTable2}) = true
+Tables.columns(x::WideTable2) = x
+Tables.schema(::WideTable2) = Tables.Schema([Symbol("x", i) for i = 1:1000], fill(Float64, 1000); stored=true)
+Tables.getcolumn(g::WideTable2, nm::Symbol) = rand(100)
+Base.getindex(::WideTable2, i::Int) = rand(100)
+Tables.columnnames(::WideTable2) = [Symbol("x", i) for i = 1:1000]
+
+# checks shared by both tables: the schema is stored, and `eachcolumn`/`eachcolumns` walk a row by name
+function test_stored_schema(x, ncols)
+    sch = Tables.schema(x)
+    @test sch.names == [Symbol("x", i) for i = 1:ncols]
+    @test sch.types == fill(Float64, ncols)
+    @test typeof(sch) == Tables.Schema{nothing, nothing}
+    row = first(Tables.rows(x))
+    Tables.eachcolumn(sch, row) do y, i, nm
+        @test y isa Float64
+        @test i isa Integer
+        @test nm isa Symbol
+    end
+    Tables.eachcolumns(sch, row, x) do y, i, nm, col
+        @test y isa Float64
+        @test i isa Integer
+        @test nm isa Symbol
+        @test col isa Vector{Float64}
+    end
+    return
+end
+
+@testset "wide tables" begin
+    x = WideTable();
+    test_stored_schema(x, WIDE_NCOLS)
+    @test_throws ArgumentError Tables.columntable(x)
+    @test_throws ArgumentError Tables.rowtable(x)
+    y = Tables.dictrowtable(x);
+    @test length(y) == 100
+    y = Tables.dictcolumntable(x);
+    @test Tables.schema(y) == Tables.schema(x)
+    # y = Tables.matrix(x); # works, just takes a really long time and a lot of memory
+
+    test_stored_schema(WideTable2(), 1000)
+end