From 0e3ea807c256d6e227705462ffb5d8b6a8dea145 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Sat, 1 Feb 2020 10:39:20 -0700 Subject: [PATCH 01/15] Tables API enhancement --- .travis.yml | 16 +++++- appveyor.yml | 31 ----------- src/Tables.jl | 119 ++++++++++++++++++++++++++++++++++++---- src/fallbacks.jl | 79 ++++++++++++++++++++------ src/matrix.jl | 25 ++++----- src/namedtuples.jl | 28 +++++----- src/operations.jl | 49 +++++++++++------ src/tofromdatavalues.jl | 36 +++++------- src/utils.jl | 50 ++++++++--------- test/runtests.jl | 2 - 10 files changed, 279 insertions(+), 156 deletions(-) delete mode 100644 appveyor.yml diff --git a/.travis.yml b/.travis.yml index 604a594..4e969ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,24 @@ +# Documentation: http://docs.travis-ci.com/user/languages/julia/ language: julia os: - linux - osx + - windows +arch: + - x64 + - x86 julia: - 1.0 - - 1.1 + - 1.3 - nightly +matrix: + allow_failures: + - julia: nightly + fast_finish: true + exclude: + - os: osx + arch: x86 notifications: email: false after_success: - - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())' \ No newline at end of file + - julia -e 'ENV["TRAVIS_JULIA_VERSION"] == "1.3" && ENV["TRAVIS_OS_NAME"] != "linux" && exit(); using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())' \ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index c6719b1..0000000 --- a/appveyor.yml +++ /dev/null @@ -1,31 +0,0 @@ -environment: - matrix: - - julia_version: 1.0 - - julia_version: 1.1 - - julia_version: nightly - -platform: - - x86 # 32-bit - - x64 # 64-bit - -branches: - only: - - master - - /release-.*/ - -notifications: - - provider: Email - on_build_success: false - on_build_failure: false - on_build_status_changed: false - -install: - - ps: iex ((new-object 
net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1")) - -build_script: - - echo "%JL_BUILD_SCRIPT%" - - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%" - -test_script: - - echo "%JL_TEST_SCRIPT%" - - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%" \ No newline at end of file diff --git a/src/Tables.jl b/src/Tables.jl index d90e527..e2a3f4d 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -8,21 +8,120 @@ if !hasmethod(getproperty, Tuple{Tuple, Int}) Base.getproperty(t::Tuple, i::Int) = t[i] end -"Abstract row type with a simple required interface: row values are accessible via `getproperty(row, field)`; for example, a NamedTuple like `nt = (a=1, b=2, c=3)` can access its value for `a` like `nt.a` which turns into a call to the function `getproperty(nt, :a)`" -abstract type Row end +""" + Tables.AbstractColumns + +Abstract type provided to allow custom table types to inherit useful and required behavior. + +Interface definition: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(table, i::Int)` | getfield(table, i) | Retrieve a column by index | +| `Tables.getcolumn(table, nm::Symbol)` | getproperty(table, nm) | Retrieve a column by name | +| `Tables.columnnames(table)` | propertynames(table) | Return column names for a table as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(table, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(table, nm) | Given a column eltype `T`, index `i`, and column name `nm`, retrieve the column. Provides a type-stable or even constant-prop-able mechanism for efficiency. + +While custom table types aren't required to subtype `Tables.AbstractColumns`, benefits of doing so include: + * Indexing interface defined (using `getcolumn`); i.e. `tbl[i]` will retrieve the column at index `i` + * Property access interface defined (using `columnnames` and `getcolumn`); i.e. 
`tbl.col1` will retrieve column named `col1` + * Iteration interface defined; i.e. `for col in table` will iterate each column in the table + * A default `show` method +This allows a custom table type to behave as close as possible to a builtin `NamedTuple` of vectors object. +""" +abstract type AbstractColumns end + +""" + Tables.AbstractRow + +Abstract type provided to allow custom row types to inherit useful and required behavior. + +Interface definition: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(row, i::Int)` | getfield(row, i) | Retrieve a column value by index | +| `Tables.getcolumn(row, nm::Symbol)` | getproperty(row, nm) | Retrieve a column value by name | +| `Tables.columnnames(row)` | propertynames(row) | Return column names for a row as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(row, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(row, nm) | Given a column type `T`, index `i`, and column name `nm`, retrieve the column value. Provides a type-stable or even constant-prop-able mechanism for efficiency. + +While custom row types aren't required to subtype `Tables.AbstractRow`, benefits of doing so include: + * Indexing interface defined (using `getcolumn`); i.e. `row[i]` will return the column value at index `i` + * Property access interface defined (using `columnnames` and `getcolumn`); i.e. `row.col1` will retrieve the value for the column named `col1` + * Iteration interface defined; i.e. `for x in row` will iterate each column value in the row + * A default `show` method +This allows the custom row type to behave as close as possible to a builtin `NamedTuple` object. 
+""" +abstract type AbstractRow <: AbstractColumns end + +""" + Tables.getcolumn(::Columns, nm::Symbol) => Indexable collection with known length + Tables.getcolumn(::Columns, i::Int) => Indexable collection with known length + Tables.getcolumn(::Columns, T, i::Int, nm::Symbol) => Indexable collection with known length + + Tables.getcolumn(::Row, nm::Symbol) => Column value + Tables.getcolumn(::Row, i::Int) => Column value + Tables.getcolumn(::Row, T, i::Int, nm::Symbol) => Column value + +Retrieve an entire column (`Columns`) or single row column value (`Row`) by column name (`nm`), index (`i`), +or if desired, by column type (`T`), index (`i`), and name (`nm`). When called on a `Columns` interface object, +a `Column` is returned, which is an indexable collection with known length. When called on a `Row` interface +object, it returns the single column value. The methods taking a single `Symbol` or `Int` are both required +for the `AbstractColumns` and `AbstractRow` interfaces; the third method is optional if type stability is possible. +The default definition of `Tables.getcolumn(x, i::Int)` is `getfield(x, i)`. The default definition of +`Tables.getcolumn(x, nm::Symbol)` is `getproperty(x, nm)`. +""" +function getcolumn end + +getcolumn(x, i::Int) = getfield(x, i) +getcolumn(x, nm::Symbol) = getproperty(x, nm) +getcolumn(x, ::Type{T}, i::Int, nm::Symbol) where {T} = getcolumn(x, nm) +getcolumn(x::NamedTuple{names, types}, ::Type{T}, i::Int, nm::Symbol) where {names, types, T} = Core.getfield(x, i) + +""" + Tables.columnnames(::Union{Columns, Row}) => Indexable collection + +Retrieves the list of column names as an indexable collection (like a `Tuple` or `Vector`) for a `Columns` or `Row` interface object. The default definition calls `propertynames(x)`. 
+""" +function columnnames end + +columnnames(x) = propertynames(x) + +Base.IteratorSize(::Type{R}) where {R <: AbstractColumns} = Base.HasLength() +Base.length(r::AbstractColumns) = length(columnnames(r)) +Base.firstindex(r::AbstractColumns) = 1 +Base.lastindex(r::AbstractColumns) = length(r) +Base.getindex(r::AbstractColumns, i::Int) = getcolumn(r, i) +Base.getindex(r::AbstractColumns, nm::Symbol) = getcolumn(r, nm) +Base.getproperty(r::AbstractColumns, nm::Symbol) = getcolumn(r, nm) +Base.getproperty(r::AbstractColumns, i::Int) = getcolumn(r, i) +Base.propertynames(r::AbstractColumns) = columnnames(r) +Base.keys(r::AbstractColumns) = columnnames(r) +Base.values(r::AbstractColumns) = collect(r) +Base.haskey(r::AbstractColumns, key::Union{Integer, Symbol}) = key isa Integer ? (0 < key <= length(columnnames(r))) : (key in columnnames(r)) +Base.get(r::AbstractColumns, key::Union{Integer, Symbol}, default) = haskey(r, key) ? getcolumn(r, key) : default +Base.get(f::Base.Callable, r::AbstractColumns, key::Union{Integer, Symbol}) = haskey(r, key) ? getcolumn(r, key) : f() +Base.@propagate_inbounds Base.iterate(r::AbstractColumns, i=1) = i > length(r) ? nothing : (getcolumn(r, i), i + 1) + +function Base.show(io::IO, x::T) where {T <: AbstractColumns} + println(io, "$T:") + names = collect(columnnames(x)) + values = [getcolumn(x, nm) for nm in names] + Base.print_matrix(io, hcat(names, values)) +end """ -The Tables.jl package provides simple, yet powerful interface functions for working with all kinds tabular data through predictable access patterns. +The Tables.jl package provides simple, yet powerful interface functions for working with all kinds of tabular data through predictable access patterns.
```julia - Tables.rows(table) => Rows + Tables.rows(table) => Row iterator (also known as a Rows object) Tables.columns(table) => Columns ``` -Where `Rows` and `Columns` are the duals of each other: -* `Rows` is an iterator of property-accessible objects (any type that supports `propertynames(row)` and `getproperty(row, nm::Symbol`) -* `Columns` is a property-accessible object of iterators (i.e. each column is an iterator) +Where `Row` and `Columns` are objects that support a common interface: + * `Tables.getcolumn(x, col::Union{Int, Symbol})`: Retrieve an entire column (`Columns`), or single column value (`Row`) by column index (as an `Int`), or by column name (as a `Symbol`) + * `Tables.columnnames(x)`: Retrieve the possible column names for a `Row` or `Columns` object -In addition to these `Rows` and `Columns` objects, it's useful to be able to query properties of these objects: +In addition to these `Row` and `Columns` objects, it's useful to be able to query properties of these objects: * `Tables.schema(x::Union{Rows, Columns}) => Union{Tables.Schema, Nothing}`: returns a `Tables.Schema` object, or `nothing` if the table's schema is unknown * For the `Tables.Schema` object: * column names can be accessed as a tuple of Symbols like `sch.names` @@ -30,7 +129,7 @@ In addition to these `Rows` and `Columns` objects, it's useful to be able to que * See `?Tables.Schema` for more details on this type A big part of the power in these simple interface functions is that each (`Tables.rows` & `Tables.columns`) is defined for any table type, even if the table type only explicitly implements one interface function or the other. -This is accomplished by providing performant, generic fallback definitions in Tables.jl itself (though obviously nothing prevents a table type from implementing each interface function directly). 
+This is accomplished by providing performant, generic fallback definitions in Tables.jl itself (though obviously nothing prevents a table type from implementing each interface function directly if so desired). With these simple definitions, powerful workflows are enabled: * A package providing data cleansing, manipulation, visualization, or analysis can automatically handle any number of decoupled input table types @@ -173,7 +272,7 @@ include("operations.jl") include("matrix.jl") "Return the column index (1-based) of a `colname` in a table with a known schema; returns 0 if `colname` doesn't exist in table" -columnindex(table, colname) = columnindex(schema(table).names, colname) +columnindex(table, colname) = columnindex(schema(table), colname) "Return the column type of a `colname` in a table with a known schema; returns Union{} if `colname` doesn't exist in table" columntype(table, colname) = columntype(schema(table), colname) diff --git a/src/fallbacks.jl b/src/fallbacks.jl index ca2945e..ce52df0 100644 --- a/src/fallbacks.jl +++ b/src/fallbacks.jl @@ -1,23 +1,30 @@ ## generic `Tables.rows` and `Tables.columns` fallbacks ## if a table provides Tables.rows or Tables.columns, -## we'll provide a default implementation of the dual +## we'll provide a default implementation of the other -# generic row iteration of columns +# for Columns objects, we define a generic RowIterator wrapper to turn any Columns into a Rows + +# get the number of rows in the incoming table function rowcount(cols) - props = propertynames(cols) - isempty(props) && return 0 - return length(getproperty(cols, props[1])) + names = columnnames(cols) + isempty(names) && return 0 + return length(getcolumn(cols, names[1])) end -struct ColumnsRow{T} +# a lazy row view into a Columns object +struct ColumnsRow{T} <: AbstractRow columns::T # a `Columns` object - row::Int + row::Int # row number end -Base.getproperty(c::ColumnsRow, ::Type{T}, col::Int, nm::Symbol) where {T} = getproperty(getfield(c, 1), 
T, col, nm)[getfield(c, 2)] -Base.getproperty(c::ColumnsRow, nm::Int) = getproperty(getfield(c, 1), nm)[getfield(c, 2)] -Base.getproperty(c::ColumnsRow, nm::Symbol) = getproperty(getfield(c, 1), nm)[getfield(c, 2)] -Base.propertynames(c::ColumnsRow) = propertynames(getfield(c, 1)) +getcolumns(c::ColumnsRow) = getfield(c, :columns) +getrow(c::ColumnsRow) = getfield(c, :row) + +# AbstractRow interface +Base.@propagate_inbounds getcolumn(c::ColumnsRow, ::Type{T}, col::Int, nm::Symbol) where {T} = getcolumn(getcolumns(c), T, col, nm)[getrow(c)] +Base.@propagate_inbounds getcolumn(c::ColumnsRow, i::Int) = getcolumn(getcolumns(c), i)[getrow(c)] +Base.@propagate_inbounds getcolumn(c::ColumnsRow, nm::Symbol) = getcolumn(getcolumns(c), nm)[getrow(c)] +columnnames(c::ColumnsRow) = columnnames(getcolumns(c)) @generated function Base.isless(c::ColumnsRow{T}, d::ColumnsRow{T}) where {T <: NamedTuple{names}} where names exprs = Expr[] @@ -46,16 +53,19 @@ end Expr(:block, exprs...) end +# RowIterator wraps a Columns object and provides row iteration via lazy row views struct RowIterator{T} columns::T len::Int end + Base.eltype(x::RowIterator{T}) where {T} = ColumnsRow{T} Base.length(x::RowIterator) = x.len istable(::Type{<:RowIterator}) = true rowaccess(::Type{<:RowIterator}) = true rows(x::RowIterator) = x -columnaccess(::Type{<:RowIterator{T}}) where T = columnaccess(T) + +columnaccess(::Type{<:RowIterator}) = true columns(x::RowIterator) = x.columns materializer(x::RowIterator) = materializer(x.columns) schema(x::RowIterator) = schema(x.columns) @@ -65,21 +75,29 @@ function Base.iterate(rows::RowIterator, st=1) return ColumnsRow(rows.columns, st), st + 1 end +# this is our generic Tables.rows fallback definition function rows(x::T) where {T} + # because this method is being called, we know `x` didn't define its own Tables.rows + # first check if it supports column access, and if so, wrap it in a RowIterator if columnaccess(T) cols = columns(x) return RowIterator(cols,
Int(rowcount(cols))) + # otherwise, if the input is at least iterable, we'll wrap it in an IteratorWrapper + # which will iterate the input, validating that it supports the AbstractRow interface + # and unwrapping any DataValues that are encountered elseif IteratorInterfaceExtensions.isiterable(x) return nondatavaluerows(x) end throw(ArgumentError("no default `Tables.rows` implementation for type: $T")) end -# build columns from rows +# for Rows objects, we define a "collect"-like routine to build up columns from iterated rows + """ - Tables.allocatecolumn(::Type{T}, len) => returns a column type (usually AbstractVector) w/ size to hold `len` elements + Tables.allocatecolumn(::Type{T}, len) => returns a column type (usually AbstractVector) with size to hold `len` elements Custom column types can override with an appropriate "scalar" element type that should dispatch to their column allocator. + Alternatively, and more generally, custom scalars can overload `DataAPI.defaultarray` to signal the default array type """ allocatecolumn(T, len) = DataAPI.defaultarray(T, 1)(undef, len) @@ -131,11 +149,20 @@ function __buildcolumns(rowitr, st, sch, columns, rownbr, updated) row, st = state rownbr += 1 eachcolumns(add_or_widen!, sch, row, columns, rownbr, updated, Base.IteratorSize(rowitr)) + # little explanation here: we just called add_or_widen! for each column value of our row + # note that when a column's type is widened, `updated` is set w/ the new set of columns + # we then check if our current `columns` isn't the same object as our `updated` ref + # if it isn't, we're going to call __buildcolumns again, passing our new updated ref as + # columns, which allows __buildcolumns to specialize (i.e. recompile) based on the new types + # of updated. So a new __buildcolumns will be compiled for each widening event. 
columns !== updated[] && return __buildcolumns(rowitr, st, sch, updated[], rownbr, updated) end return updated end +# for the schema-less case, we do one extra step of initializing each column as an `EmptyVector` +# and doing an initial widening for each column in _buildcolumns, before passing the widened +# set of columns on to __buildcolumns struct EmptyVector <: AbstractVector{Union{}} len::Int end @@ -153,14 +180,20 @@ end state = iterate(rowitr) state === nothing && return NamedTuple() row, st = state - names = Tuple(propertynames(row)) + names = Tuple(columnnames(row)) len = Base.haslength(T) ? length(rowitr) : 0 sch = Schema(names, nothing) columns = Tuple(EmptyVector(len) for _ = 1:length(names)) return NamedTuple{Base.map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[]) end -struct CopiedColumns{T} +# for some sinks, there's a concern about whether they can safely "own" columns from the input +# to be safe, they should always copy input columns, to avoid unintended mutation. +# when we've called buildcolumns, however, Tables.jl essentially built/owns the columns, +# and it's happy to pass ownership to the sink. 
Thus, any built columns will be wrapped +# in a CopiedColumns struct to signal to the sink that essentially "a copy has already been made" +# and they're safe to assume ownership +struct CopiedColumns{T} <: AbstractColumns x::T end @@ -170,15 +203,25 @@ columnaccess(::Type{<:CopiedColumns}) = true columns(x::CopiedColumns) = x schema(x::CopiedColumns) = schema(source(x)) materializer(x::CopiedColumns) = materializer(source(x)) -Base.propertynames(x::CopiedColumns) = propertynames(source(x)) -Base.getproperty(x::CopiedColumns, nm::Symbol) = getproperty(source(x), nm) +getcolumn(x::CopiedColumns, ::Type{T}, col::Int, nm::Symbol) where {T} = getcolumn(source(x), T, col, nm) +getcolumn(x::CopiedColumns, i::Int) = getcolumn(source(x), i) +getcolumn(x::CopiedColumns, nm::Symbol) = getcolumn(source(x), nm) +columnnames(x::CopiedColumns) = columnnames(source(x)) + +# here's our generic fallback Tables.columns definition @inline function columns(x::T) where {T} + # because this method is being called, we know `x` didn't define its own Tables.columns method + # first check if it supports row access, and if so, build up the desired columns if rowaccess(T) r = rows(x) return CopiedColumns(buildcolumns(schema(r), r)) + # though not widely supported, if a source supports the TableTraits column interface, use it elseif TableTraits.supports_get_columns_copy_using_missing(x) return CopiedColumns(TableTraits.get_columns_copy_using_missing(x)) + # otherwise, if the source is at least iterable, we'll wrap it in an IteratorWrapper and + # build columns from that, which will check if the source correctly iterates AbstractRows + # and unwraps DataValues for us elseif IteratorInterfaceExtensions.isiterable(x) iw = nondatavaluerows(x) return CopiedColumns(buildcolumns(schema(iw), iw)) diff --git a/src/matrix.jl b/src/matrix.jl index 88539df..e93872b 100644 --- a/src/matrix.jl +++ b/src/matrix.jl @@ -1,10 +1,9 @@ istable(::Type{<:AbstractMatrix}) = false -istable(::AbstractMatrix) = false
rows(m::T) where {T <: AbstractMatrix} = throw(ArgumentError("a '$T' is not a table; see `?Tables.table` for ways to treat an AbstractMatrix as a table")) columns(m::T) where {T <: AbstractMatrix} = throw(ArgumentError("a '$T' is not a table; see `?Tables.table` for ways to treat an AbstractMatrix as a table")) -struct MatrixTable{T <: AbstractMatrix} +struct MatrixTable{T <: AbstractMatrix} <: AbstractColumns names::Vector{Symbol} lookup::Dict{Symbol, Int} matrix::T @@ -14,16 +13,18 @@ istable(::Type{<:MatrixTable}) = true names(m::MatrixTable) = getfield(m, :names) # row interface -struct MatrixRow{T} +struct MatrixRow{T} <: AbstractRow row::Int source::MatrixTable{T} end -Base.getproperty(m::MatrixRow, ::Type, col::Int, nm::Symbol) = +getcolumn(m::MatrixRow, ::Type, col::Int, nm::Symbol) = getfield(getfield(m, :source), :matrix)[getfield(m, :row), col] -Base.getproperty(m::MatrixRow, nm::Symbol) = +getcolumn(m::MatrixRow, i::Int) = + getfield(getfield(m, :source), :matrix)[getfield(m, :row), i] +getcolumn(m::MatrixRow, nm::Symbol) = getfield(getfield(m, :source), :matrix)[getfield(m, :row), getfield(getfield(m, :source), :lookup)[nm]] -Base.propertynames(m::MatrixRow) = names(getfield(m, :source)) +columnnames(m::MatrixRow) = names(getfield(m, :source)) rowaccess(::Type{<:MatrixTable}) = true schema(m::MatrixTable{T}) where {T} = Schema(Tuple(names(m)), NTuple{size(getfield(m, :matrix), 2), eltype(T)}) @@ -31,17 +32,15 @@ rows(m::MatrixTable) = m Base.eltype(m::MatrixTable{T}) where {T} = MatrixRow{T} Base.length(m::MatrixTable) = size(getfield(m, :matrix), 1) -function Base.iterate(m::MatrixTable, st=1) - st > length(m) && return nothing - return MatrixRow(st, m), st + 1 -end +Base.iterate(m::MatrixTable, st=1) = st > length(m) ? 
nothing : (MatrixRow(st, m), st + 1) # column interface columnaccess(::Type{<:MatrixTable}) = true columns(m::MatrixTable) = m -Base.getproperty(m::MatrixTable, ::Type{T}, col::Int, nm::Symbol) where {T} = getfield(m, :matrix)[:, col] -Base.getproperty(m::MatrixTable, nm::Symbol) = getfield(m, :matrix)[:, getfield(m, :lookup)[nm]] -Base.propertynames(m::MatrixTable) = names(m) +getcolumn(m::MatrixTable, ::Type{T}, col::Int, nm::Symbol) where {T} = getfield(m, :matrix)[:, col] +getcolumn(m::MatrixTable, nm::Symbol) = getfield(m, :matrix)[:, getfield(m, :lookup)[nm]] +getcolumn(m::MatrixTable, i::Int) = getfield(m, :matrix)[:, i] +columnnames(m::MatrixTable) = names(m) """ Tables.table(m::AbstractMatrix; [header::Vector{Symbol}]) diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 55bcd5a..79d4d9a 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -1,12 +1,12 @@ # Vector of NamedTuples -const RowTable{T} = Vector{T} where {T <: NamedTuple} +const RowTable{T} = AbstractVector{T} where {T <: NamedTuple} # interface implementation istable(::Type{<:RowTable}) = true rowaccess(::Type{<:RowTable}) = true -# a Vector of NamedTuple iterates `Row`s itself +# an AbstractVector of NamedTuple iterates `Row`s itself rows(x::RowTable) = x -schema(x::Vector{NamedTuple{names, types}}) where {names, types} = Schema(names, types) +schema(x::AbstractVector{NamedTuple{names, types}}) where {names, types} = Schema(names, types) materializer(x::RowTable) = rowtable # struct to transform `Row`s into NamedTuples @@ -21,7 +21,7 @@ Base.size(nt::NamedTupleIterator) = (length(nt.x),) function Base.iterate(rows::NamedTupleIterator{Schema{names, T}}, st=()) where {names, T} if @generated - vals = Tuple(:(getproperty(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) + vals = Tuple(:(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) return quote x = iterate(rows.x, st...) 
x === nothing && return nothing @@ -32,7 +32,7 @@ function Base.iterate(rows::NamedTupleIterator{Schema{names, T}}, st=()) where { x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - return NamedTuple{Base.map(Symbol, names), T}(Tuple(getproperty(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,) + return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,) end end @@ -41,8 +41,8 @@ function Base.iterate(rows::NamedTupleIterator{Nothing, T}, st=()) where {T} x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - names = Tuple(propertynames(row)) - return NamedTuple{Base.map(Symbol, names)}(Tuple(getproperty(row, nm) for nm in names)), (st,) + names = Tuple(columnnames(row)) + return NamedTuple{Base.map(Symbol, names)}(Tuple(getcolumn(row, nm) for nm in names)), (st,) end namedtupleiterator(::Type{T}, rows::S) where {T <: NamedTuple, S} = rows @@ -68,27 +68,29 @@ istable(::Type{<:ColumnTable}) = true columnaccess(::Type{<:ColumnTable}) = true # a NamedTuple of AbstractVectors is itself a `Columns` object columns(x::ColumnTable) = x -schema(x::T) where {T <: ColumnTable} = Schema(names(T), _types(T)) -materializer(x::ColumnTable) = columntable _eltype(::Type{A}) where {A <: AbstractVector{T}} where {T} = T -Base.@pure function _types(::Type{NT}) where {NT <: NamedTuple{names, T}} where {names, T <: NTuple{N, AbstractVector{S} where S}} where {N} +Base.@pure function _eltypes(::Type{NT}) where {NT <: NamedTuple{names, T}} where {names, T <: NTuple{N, AbstractVector{S} where S}} where {N} return Tuple{Any[ _eltype(fieldtype(NT, i)) for i = 1:fieldcount(NT) ]...} end +schema(x::T) where {T <: ColumnTable} = Schema(names(T), _eltypes(T)) +materializer(x::ColumnTable) = columntable + getarray(x::AbstractArray) = x getarray(x) = collect(x) function columntable(sch::Schema{names, types}, cols) where {names, types} if @generated - vals = 
Tuple(:(getarray(getproperty(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types)) + vals = Tuple(:(getarray(getcolumn(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types)) return :(NamedTuple{Base.map(Symbol, names)}(($(vals...),))) else - return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getproperty(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types))) + return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types))) end end + # unknown schema case -columntable(::Nothing, cols) = NamedTuple{Tuple(Base.map(Symbol, propertynames(cols)))}(Tuple(getarray(col) for col in eachcolumn(cols))) +columntable(::Nothing, cols) = NamedTuple{Tuple(Base.map(Symbol, columnnames(cols)))}(Tuple(getarray(col) for col in eachcolumn(cols))) function columntable(itr::T) where {T} cols = columns(itr) diff --git a/src/operations.jl b/src/operations.jl index 9a558cc..82fa1cb 100644 --- a/src/operations.jl +++ b/src/operations.jl @@ -1,18 +1,23 @@ -struct TransformsRow{T, F} +struct TransformsRow{T, F} <: AbstractRow row::T funcs::F end -Base.getproperty(row::TransformsRow, nm::Symbol) = (getfunc(row, getfield(row, 2), nm))(getproperty(getfield(row, 1), nm)) -Base.propertynames(row::TransformsRow) = propertynames(getfield(row, 1)) +getrow(r::TransformsRow) = getfield(r, :row) +getfuncs(r::TransformsRow) = getfield(r, :funcs) -struct Transforms{C, T, F} +getcolumn(row::TransformsRow, nm::Symbol) = (getfunc(row, getfuncs(row), nm))(getcolumn(getrow(row), nm)) +getcolumn(row::TransformsRow, i::Int) = (getfunc(row, getfuncs(row), i))(getcolumn(getrow(row), i)) +columnnames(row::TransformsRow) = columnnames(getrow(row)) + +struct Transforms{C, T, F} <: AbstractColumns source::T funcs::F # NamedTuple of columnname=>transform function end -Base.propertynames(t::Transforms{true}) = propertynames(getfield(t, 1)) 
-Base.getproperty(t::Transforms{true}, nm::Symbol) = Base.map(getfunc(t, getfield(t, 2), nm), getproperty(getfield(t, 1), nm)) +columnnames(t::Transforms{true}) = columnnames(getfield(t, 1)) +getcolumn(t::Transforms{true}, nm::Symbol) = Base.map(getfunc(t, getfield(t, 2), nm), getcolumn(getfield(t, 1), nm)) +getcolumn(t::Transforms{true}, i::Int) = Base.map(getfunc(t, getfield(t, 2), i), getcolumn(getfield(t, 1), i)) transform(funcs) = x->transform(x, funcs) transform(; kw...) = transform(kw.data) @@ -22,10 +27,15 @@ function transform(src::T, funcs::F) where {T, F} return Transforms{C, typeof(x), F}(x, funcs) end -getfunc(row, nt::NamedTuple, nm) = get(nt, nm, identity) -getfunc(row, d::Dict{String, <:Base.Callable}, nm) = get(d, String(nm), identity) -getfunc(row, d::Dict{Symbol, <:Base.Callable}, nm) = get(d, nm, identity) -getfunc(row, d::Dict{Int, <:Base.Callable}, nm) = get(d, findfirst(isequal(nm), propertynames(row)), identity) +getfunc(row, nt::NamedTuple, nm::Symbol) = get(nt, nm, identity) +getfunc(row, d::Dict{String, <:Base.Callable}, nm::Symbol) = get(d, String(nm), identity) +getfunc(row, d::Dict{Symbol, <:Base.Callable}, nm::Symbol) = get(d, nm, identity) +getfunc(row, d::Dict{Int, <:Base.Callable}, nm::Symbol) = get(d, findfirst(isequal(nm), columnnames(row)), identity) + +getfunc(row, nt::NamedTuple, i::Int) = i > fieldcount(typeof(nt)) ?
identity : getfield(nt, i) +getfunc(row, d::Dict{String, <:Base.Callable}, i::Int) = get(d, String(columnnames(row)[i]), identity) +getfunc(row, d::Dict{Symbol, <:Base.Callable}, i::Int) = get(d, columnnames(row)[i], identity) +getfunc(row, d::Dict{Int, <:Base.Callable}, i::Int) = get(d, i, identity) istable(::Type{<:Transforms}) = true rowaccess(::Type{Transforms{C, T, F}}) where {C, T, F} = !C @@ -48,7 +58,7 @@ Base.eltype(t::Transforms{false, T, F}) where {T, F} = TransformsRow{eltype(getf end # select -struct Select{T, columnaccess, names} +struct Select{T, columnaccess, names} <: AbstractColumns source::T end @@ -90,8 +100,9 @@ function schema(s::Select{T, columnaccess, names}) where {T, columnaccess, names end # columns: make Select property-accessible -Base.getproperty(s::Select{T, true, names}, nm::Symbol) where {T, names} = getproperty(getfield(s, 1), nm) -Base.propertynames(s::Select{T, true, names}) where {T, names} = namesubset(propertynames(getfield(s, 1)), names) +getcolumn(s::Select{T, true, names}, nm::Symbol) where {T, names} = getcolumn(getfield(s, 1), nm) +getcolumn(s::Select{T, true, names}, i::Int) where {T, names} = getcolumn(getfield(s, 1), i) +columnnames(s::Select{T, true, names}) where {T, names} = namesubset(columnnames(getfield(s, 1)), names) columnaccess(::Type{Select{T, C, names}}) where {T, C, names} = C columns(s::Select{T, true, names}) where {T, names} = s @@ -104,18 +115,20 @@ rowaccess(::Type{Select{T, columnaccess, names}}) where {T, columnaccess, names} rows(s::Select{T, false, names}) where {T, names} = s # we need to iterate a "row view" in case the underlying source has unknown schema -# to ensure each iterated row only has `names` propertynames -struct SelectRow{T, names} +# to ensure each iterated row only has `names` columnnames +struct SelectRow{T, names} <: AbstractRow row::T end -Base.getproperty(row::SelectRow, nm::Symbol) = getproperty(getfield(row, 1), nm) +getcolumn(row::SelectRow, nm::Symbol) = 
getcolumn(getfield(row, 1), nm) +getcolumn(row::SelectRow, i::Int) = getcolumn(getfield(row, 1), i) +getcolumn(row::SelectRow, ::Type{T}, i::Int, nm::Symbol) where {T} = getcolumn(getfield(row, 1), T, i, nm) getprops(row, nms::NTuple{N, Symbol}) where {N} = nms -getprops(row, inds::NTuple{N, Int}) where {N} = ntuple(i->propertynames(getfield(row, 1))[inds[i]], N) +getprops(row, inds::NTuple{N, Int}) where {N} = ntuple(i->columnnames(getfield(row, 1))[inds[i]], N) getprops(row, ::Tuple{}) = () -Base.propertynames(row::SelectRow{T, names}) where {T, names} = getprops(row, names) +columnnames(row::SelectRow{T, names}) where {T, names} = getprops(row, names) @inline function Base.iterate(s::Select{T, false, names}) where {T, names} state = iterate(getfield(s, 1)) diff --git a/src/tofromdatavalues.jl b/src/tofromdatavalues.jl index 8167d6e..2329296 100644 --- a/src/tofromdatavalues.jl +++ b/src/tofromdatavalues.jl @@ -37,7 +37,7 @@ Base.size(rows::IteratorWrapper) = size(rows.x) x = iterate(rows.x) x === nothing && return nothing row, st = x - propertynames(row) === () && invalidtable(rows.x, row) + columnnames(row) === () && invalidtable(rows.x, row) return IteratorRow(row), st end @@ -48,32 +48,22 @@ end return IteratorRow(row), st end -struct IteratorRow{T} +struct IteratorRow{T} <: AbstractRow row::T end +getrow(r::IteratorRow) = getfield(r, :row) + unwrap(::Type{T}, x) where {T} = convert(T, x) unwrap(::Type{Any}, x) = x.hasvalue ? x.value : missing -function Base.getproperty(d::IteratorRow, ::Type{T}, col::Int, nm) where {T} - x = getproperty(getfield(d, 1), T, col, nm) - TT = typeof(x) - TTT = DataValueInterfaces.nondatavaluetype(TT) - return TT == TTT ? x : unwrap(TTT, x) -end -function Base.getproperty(d::IteratorRow, nm::Symbol) - x = getproperty(getfield(d, 1), nm) - TT = typeof(x) - TTT = DataValueInterfaces.nondatavaluetype(TT) - return TT == TTT ? 
x : unwrap(TTT, x) -end -function Base.getproperty(d::IteratorRow, nm::Int) - x = getproperty(getfield(d, 1), nm) - TT = typeof(x) - TTT = DataValueInterfaces.nondatavaluetype(TT) - return TT == TTT ? x : unwrap(TTT, x) -end -Base.propertynames(d::IteratorRow) = propertynames(getfield(d, 1)) +nondv(T) = DataValueInterfaces.nondatavaluetype(T) +undatavalue(x::T) where {T} = T == nondv(T) ? x : unwrap(nondv(T), x) + +getcolumn(r::IteratorRow, ::Type{T}, col::Int, nm::Symbol) where {T} = undatavalue(getcolumn(getrow(r), T, col, nm)) +getcolumn(r::IteratorRow, nm::Symbol) = undatavalue(getcolumn(getrow(r), nm)) +getcolumn(r::IteratorRow, i::Int) = undatavalue(getcolumn(getrow(r), i)) +columnnames(r::IteratorRow) = columnnames(getrow(r)) # DataValueRowIterator wraps a Row iterator and will wrap `Union{T, Missing}` typed fields in DataValues struct DataValueRowIterator{NT, S} @@ -94,7 +84,7 @@ Base.size(rows::DataValueRowIterator) = size(rows.x) function Base.iterate(rows::DataValueRowIterator{NT, S}, st=()) where {NT <: NamedTuple{names}, S} where {names} if @generated - vals = Tuple(:(convert($(fieldtype(NT, i)), getproperty(row, $(DataValueInterfaces.nondatavaluetype(fieldtype(NT, i))), $i, $(Meta.QuoteNode(names[i]))))) for i = 1:fieldcount(NT)) + vals = Tuple(:(convert($(fieldtype(NT, i)), getcolumn(row, $(nondv(fieldtype(NT, i))), $i, $(Meta.QuoteNode(names[i]))))) for i = 1:fieldcount(NT)) q = quote x = iterate(rows.x, st...) x === nothing && return nothing @@ -107,6 +97,6 @@ function Base.iterate(rows::DataValueRowIterator{NT, S}, st=()) where {NT <: Nam x = iterate(rows.x, st...) 
x === nothing && return nothing row, st = x - return NT(Tuple(convert(fieldtype(NT, i), getproperty(row, DataValueInterfaces.nondatavaluetype(fieldtype(NT, i)), i, names[i])) for i = 1:fieldcount(NT))), (st,) + return NT(Tuple(convert(fieldtype(NT, i), getcolumn(row, nondv(fieldtype(NT, i)), i, names[i])) for i = 1:fieldcount(NT))), (st,) end end diff --git a/src/utils.jl b/src/utils.jl index 6d68628..57249b3 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -22,19 +22,15 @@ Base.@pure function runlength(::Type{T}) where {T <: Tuple} return rle end -# generic fallback from getproperty w/ type information to basic symbol lookup -Base.getproperty(x, ::Type{T}, i::Int, nm) where {T} = getproperty(x, nm) -Base.getproperty(x::NamedTuple{names, types}, ::Type{T}, i::Int, nm::Symbol) where {names, types, T} = Core.getfield(x, i) - """ Tables.eachcolumn(f, sch, row, args...) Tables.eachcolumn(Tables.columns(x)) - The first definition takes a function `f`, table schema `sch`, a `row` type (that satisfies the `Tables.Row` interface), and any other `args...`; - it generates calls to get the value for each column in the row (`getproperty(row, nm)`) and then calls `f(val, col, name, args...)`, where `f` is the + The first definition takes a function `f`, table schema `sch`, a `row` type (that satisfies the `Tables.AbstractRow` interface), and any other `args...`; + it generates calls to get the value for each column in the row (`Tables.getcolumn(row, nm)`) and then calls `f(val, col, name, args...)`, where `f` is the user-provided function, `val` is a row's column value, `col` is the column index as an `Int`, and `name` is the row's column name as a `Symbol`. - While the first definition applies to a `Row` object, the last definition simply returns a property-iterator over a `Columns` object. + While the first definition applies to an `AbstractRow` object, the last definition simply returns an AbstractColumn iterator for a `Columns` object. 
For example, one could "collect" every column of a `Columns` object by doing: ```julia vectors = [collect(col) for col in Tables.eachcolumn(Tables.columns(x))] @@ -51,7 +47,7 @@ quot(x::Int) = x block = Expr(:block, Expr(:meta, :inline)) for i = 1:length(names) push!(block.args, quote - f(getproperty(row, $(fieldtype(types, i)), $i, $(quot(names[i]))), $i, $(quot(names[i])), args...) + f(getcolumn(row, $(fieldtype(types, i)), $i, $(quot(names[i]))), $i, $(quot(names[i])), args...) end) end return block @@ -63,7 +59,7 @@ quot(x::Int) = x for (T, len) in rle push!(block.args, quote for j = 0:$(len-1) - @inbounds f(getproperty(row, $T, $i + j, names[$i + j]), $i + j, names[$i + j], args...) + @inbounds f(getcolumn(row, $T, $i + j, names[$i + j]), $i + j, names[$i + j], args...) end end) i += len @@ -73,7 +69,7 @@ quot(x::Int) = x b = quote $(Expr(:meta, :inline)) for (i, nm) in enumerate(names) - f(getproperty(row, fieldtype(types, i), i, nm), i, nm, args...) + f(getcolumn(row, fieldtype(types, i), i, nm), i, nm, args...) end return end @@ -82,7 +78,7 @@ quot(x::Int) = x return b else for (i, nm) in enumerate(names) - f(getproperty(row, fieldtype(types, i), i, nm), i, nm, args...) + f(getcolumn(row, fieldtype(types, i), i, nm), i, nm, args...) end return end @@ -94,7 +90,7 @@ end block = Expr(:block, Expr(:meta, :inline)) for i = 1:length(names) push!(block.args, quote - f(getproperty(row, $(quot(names[i]))), $i, $(quot(names[i])), args...) + f(getcolumn(row, $(quot(names[i]))), $i, $(quot(names[i])), args...) end) end return block @@ -102,7 +98,7 @@ end b = quote $(Expr(:meta, :inline)) for (i, nm) in enumerate(names) - f(getproperty(row, nm), i, nm, args...) + f(getcolumn(row, nm), i, nm, args...) end return end @@ -110,7 +106,7 @@ end end else for (i, nm) in enumerate(names) - f(getproperty(row, nm), i, nm, args...) + f(getcolumn(row, nm), i, nm, args...) 
end return end @@ -124,7 +120,7 @@ end block = Expr(:block, Expr(:meta, :inline)) for i = 1:length(names) push!(block.args, quote - f(getproperty(row, $(fieldtype(types, i)), $i, $(quot(names[i]))), $i, $(quot(names[i])), columns[$i], args...) + f(getcolumn(row, $(fieldtype(types, i)), $i, $(quot(names[i]))), $i, $(quot(names[i])), columns[$i], args...) end) end return block @@ -136,7 +132,7 @@ end for (T, len) in rle push!(block.args, quote for j = 0:$(len-1) - @inbounds f(getproperty(row, $T, $i + j, names[$i + j]), $i + j, names[$i + j], columns[$i + j], args...) + @inbounds f(getcolumn(row, $T, $i + j, names[$i + j]), $i + j, names[$i + j], columns[$i + j], args...) end end) i += len @@ -146,7 +142,7 @@ end b = quote $(Expr(:meta, :inline)) for (i, nm) in enumerate(names) - f(getproperty(row, fieldtype(types, i), i, nm), i, nm, columns[i], args...) + f(getcolumn(row, fieldtype(types, i), i, nm), i, nm, columns[i], args...) end return end @@ -155,7 +151,7 @@ end return b else for (i, nm) in enumerate(names) - f(getproperty(row, fieldtype(types, i), i, nm), i, nm, columns[i], args...) + f(getcolumn(row, fieldtype(types, i), i, nm), i, nm, columns[i], args...) end return end @@ -167,7 +163,7 @@ end block = Expr(:block, Expr(:meta, :inline)) for i = 1:length(names) push!(block.args, quote - f(getproperty(row, $(quot(names[i]))), $i, $(quot(names[i])), columns[$i], args...) + f(getcolumn(row, $(quot(names[i]))), $i, $(quot(names[i])), columns[$i], args...) end) end return block @@ -175,7 +171,7 @@ end b = quote $(Expr(:meta, :inline)) for (i, nm) in enumerate(names) - f(getproperty(row, nm), i, nm, columns[i], args...) + f(getcolumn(row, nm), i, nm, columns[i], args...) end return end @@ -183,7 +179,7 @@ end end else for (i, nm) in enumerate(names) - f(getproperty(row, nm), i, nm, columns[i], args...) + f(getcolumn(row, nm), i, nm, columns[i], args...) 
end return end @@ -194,16 +190,18 @@ struct EachColumn{T} source::T end -Base.length(e::EachColumn) = length(propertynames(e.source)) +Base.length(e::EachColumn) = length(columnnames(e.source)) Base.IteratorEltype(::Type{<:EachColumn}) = Base.EltypeUnknown() -function Base.iterate(e::EachColumn, (idx, props)=(1, propertynames(e.source))) +function Base.iterate(e::EachColumn, (idx, props)=(1, columnnames(e.source))) idx > length(props) && return nothing - return getproperty(e.source, props[idx]), (idx + 1, props) + return getcolumn(e.source, props[idx]), (idx + 1, props) end eachcolumn(c) = EachColumn(c) +Base.@pure columnindex(::Schema{names, types}, name::Symbol) where {names, types} = columnindex(names, name) + "given names and a Symbol `name`, compute the index (1-based) of the name in names" Base.@pure function columnindex(names::Tuple{Vararg{Symbol}}, name::Symbol) i = 1 @@ -217,10 +215,10 @@ end Base.@pure columntype(::Schema{names, types}, name::Symbol) where {names, types} = columntype(names, types, name) "given tuple type and a Symbol `name`, compute the type of the name in the tuples types" -Base.@pure function columntype(names::Tuple{Vararg{Symbol}}, ::Type{T}, name::Symbol) where {T <: Tuple} +Base.@pure function columntype(names::Tuple{Vararg{Symbol}}, ::Type{types}, name::Symbol) where {types <: Tuple} i = 1 for nm in names - nm === name && return fieldtype(T, i) + nm === name && return fieldtype(types, i) i += 1 end return Union{} diff --git a/test/runtests.jl b/test/runtests.jl index fda6e18..4d58d38 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -28,8 +28,6 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx @test Tables.columntype(Tables.names(NT), Tables.types(NT), :i) == Union{} nt = (a=1, b=2, c=3) - @test getproperty(nt, Int, 1, :a) === 1 - NT = typeof(nt) output = [0, 0, 0] Tables.eachcolumn(Tables.Schema(Tables.names(NT), Tables.types(NT)), nt, output) do val, col, nm, out From 
4cf8f0b20d5f48b7a7d14721902cc1c579952f25 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Mon, 3 Feb 2020 07:59:06 -0700 Subject: [PATCH 02/15] Update docs --- README.md | 30 ++++---- src/Tables.jl | 148 +++++++++++++++++++--------------------- src/fallbacks.jl | 22 +++--- src/tofromdatavalues.jl | 2 +- src/utils.jl | 20 +++--- 5 files changed, 111 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index af60135..1fbdcad 100644 --- a/README.md +++ b/README.md @@ -6,25 +6,32 @@ The Tables.jl package provides simple, yet powerful interface functions for working with all kinds tabular data through predictable access patterns. At its core, it provides two simple functions for accessing a source table's data, regardless of its storage format or orientation: ```julia - Tables.rows(table) => Rows + Tables.rows(table) => Row iterator (aka Rows) Tables.columns(table) => Columns ``` -These two functions return objects that satisfy the `Rows` or `Columns` interfaces: -* `Rows` is an iterator (i.e. implements `Base.iterate(x)`) of property-accessible objects (any type that supports `propertynames(row)` and `getproperty(row, nm::Symbol`) -* `Columns` is a property-accessible object of iterators (i.e. 
each column can be retrieved via `getproperty` and is an iterator) +These two functions return objects that satisfy the `Row` or `Columns` interfaces, which are: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(x, i::Int)` | getfield(x, i) | Retrieve an entire column (`Columns`) or single column value (`Row`) by index | +| `Tables.getcolumn(x, nm::Symbol)` | getproperty(x, nm) | Retrieve an entire column (`Columns`) or single column value (`Row`) by name | +| `Tables.columnnames(x)` | propertynames(x) | Return column names for `Columns` or a `Row` as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(x, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(x, nm) | Given a column eltype `T`, index `i`, and column name `nm`, retrieve the column or column value. Provides a type-stable or even constant-prop-able mechanism for efficiency. So `Rows` is any object that can be used like: ```julia -for rows in table - for columnname in propertynames(row) - value = getproperty(row, columnname) +rows = Tables.rows(x) +for row in rows + for columnname in Tables.columnnames(row) + value = Tables.getcolumn(row, columnname) end end ``` And `Columns` is any object that can be used like: ```julia -for columnname in propertynames(table) - column = getproperty(table, columnname) +columns = Tables.columns(x) +for columnname in Tables.columnnames(columns) + column = Tables.getcolumn(columns, columnname) end ``` @@ -75,7 +82,6 @@ The answer is mostly straightforward: just use the interface functions. 
A note d ```julia function MyTable(x) - # Tables.istable(x) || throw(ArgumentError("input is not a table")) rows = Tables.rows(x) sch = Tables.schema(rows) names = sch.names @@ -100,7 +106,7 @@ Alternatively, it may be more natural for `MyTable` to consume input data column function MyTable(x) cols = Tables.columns(x) # here we use Tables.eachcolumn to iterate over each column in `cols`, which satisfies the `Columns` interface - return MyTable(collect(propertynames(cols)), [collect(col) for col in Tables.eachcolumn(cols)]) + return MyTable(collect(Tables.columnnames(cols)), [collect(col) for col in Tables.eachcolumn(cols)]) end ``` @@ -114,7 +120,7 @@ function MyTable(x) return MyTable() end row, st = state - columnnames = propertynames(row) + columnnames = Tables.columnnames(row) # create a Tables.Schema manually w/ just the column names from the first row sch = Tables.Schema(columnnames, nothing) cols = length(columnnames) diff --git a/src/Tables.jl b/src/Tables.jl index e2a3f4d..7fc041e 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -11,7 +11,10 @@ end """ Tables.AbstractColumns -Abstract type provided to allow custom table types to inherit useful and required behavior. +Abstract type provided to allow custom table types to inherit useful and required behavior. Note that this type +is for convenience for table _source_ authors to provide useful default behavior to their `Columns` object, +and not to be used or relied upon by sink authors to dispatch on; i.e. not all `Columns` objects will inherit +from `Tables.AbstractColumns`. Interface definition: | Required Methods | Default Definition | Brief Description | @@ -34,7 +37,10 @@ abstract type AbstractColumns end """ Tables.AbstractRow -Abstract type provided to allow custom row types to inherit useful and required behavior. +Abstract type provided to allow custom row types to inherit useful and required behavior. 
Note that this type +is for convenience for table _source_ authors to provide useful default behavior to their `Row` object, +and not to be used or relied upon by sink authors to dispatch on; i.e. not all `Row` objects will inherit +from `Tables.AbstractRow`. Interface definition: | Required Methods | Default Definition | Brief Description | @@ -101,7 +107,7 @@ Base.values(r::AbstractColumns) = collect(r) Base.haskey(r::AbstractColumns, key::Union{Integer, Symbol}) = key in columnnames(r) Base.get(r::AbstractColumns, key::Union{Integer, Symbol}, default) = haskey(r, key) ? getcolumn(r, key) : default Base.get(f::Base.Callable, r::AbstractColumns, key::Union{Integer, Symbol}) = haskey(r, key) ? getcolumn(r, key) : f() -Base.@propagate_inbounds Base.iterate(r::AbstractColumns, i=1) = i > length(r) ? nothing : (getcolumn(r, i), i + 1) +Base.iterate(r::AbstractColumns, i=1) = i > length(r) ? nothing : (getcolumn(r, i), i + 1) function Base.show(io::IO, x::T) where {T <: AbstractColumns} println(io, "$T:") @@ -110,95 +116,79 @@ function Base.show(io::IO, x::T) where {T <: AbstractColumns} Base.print_matrix(io, hcat(names, values)) end +# default definitions """ -The Tables.jl package provides simple, yet powerful interface functions for working with all kinds of tabular data through predictable access patterns. 
- -```julia - Tables.rows(table) => Row iterator (also known as a Rows object) - Tables.columns(table) => Columns -``` -Where `Row` and `Columns` are objects that support a common interface: - * `Tables.getcolumn(x, col::Union{Int, Symbol})`: Retrieve an entire column (`Columns`), or single column value (`Row`) by column index (as an `Int`), or by column name (as a `Symbol`) - * `Tables.columnnames(x)`: Retrieve the possible column names for a `Row` or `Columns` object - -In addition to these `Row` and `Columns` objects, it's useful to be able to query properties of these objects: -* `Tables.schema(x::Union{Rows, Columns}) => Union{Tables.Schema, Nothing}`: returns a `Tables.Schema` object, or `nothing` if the table's schema is unknown -* For the `Tables.Schema` object: - * column names can be accessed as a tuple of Symbols like `sch.names` - * column types can be accessed as a tuple of types like `sch.types` - * See `?Tables.Schema` for more details on this type + Tables.istable(x) => Bool -A big part of the power in these simple interface functions is that each (`Tables.rows` & `Tables.columns`) is defined for any table type, even if the table type only explicitly implements one interface function or the other. -This is accomplished by providing performant, generic fallback definitions in Tables.jl itself (though obviously nothing prevents a table type from implementing each interface function directly if so desired). - -With these simple definitions, powerful workflows are enabled: -* A package providing data cleansing, manipulation, visualization, or analysis can automatically handle any number of decoupled input table types -* A tabular file format can have automatic integration with in-memory structures and translation to other file formats +Check if an object has specifically defined that it is a table. Note that +not all valid tables will return true, since it's possible to satisfy the +Tables.jl interface at "run-time", e.g. 
a Generator of NamedTuples iterates +NamedTuples, which satisfies the Row interface, but there's no static way +of knowing that the generator is a table. +""" +function istable end -So how does one go about satisfying the Tables.jl interface functions? It mainly depends on what you've already defined and the natural access patterns of your table: +istable(x::T) where {T} = istable(T) || TableTraits.isiterabletable(x) === true +istable(::Type{T}) where {T} = false -First: -* `Tables.istable(::Type{<:MyTable}) = true`: this provides an explicit affirmation that your type implements the Tables interface +""" + Tables.rowaccess(x) => Bool + +Check whether an object has specifically defined that it implements the `Tables.rows` +function. Note that `Tables.rows` will work on any object that iterates Row-compatible +objects, even if they don't define `rowaccess`, e.g. a Generator of NamedTuples. Also +note that just because an object defines `rowaccess` doesn't mean a user should call +`Tables.rows` on it; `Tables.columns` will also work, providing a valid `Columns` +object from the rows. Hence, users should call `Tables.rows` or `Tables.columns` +depending on what is most natural for them to *consume* instead of worrying about +what and how the input produces. 
+""" +function rowaccess end -To support `Rows`: -* Define `Tables.rowaccess(::Type{<:MyTable}) = true`: this signals to other types that `MyTable` supports valid `Row`-iteration -* Define `Tables.rows(x::MyTable)`: return a `Row`-iterator object (perhaps the table itself if already defined) -* Define `Tables.schema(Tables.rows(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason +rowaccess(x::T) where {T} = rowaccess(T) +rowaccess(::Type{T}) where {T} = false -To support `Columns`: -* Define `Tables.columnaccess(::Type{<:MyTable}) = true`: this signals to other types that `MyTable` supports returning a valid `Columns` object -* Define `Tables.columns(x::MyTable)`: return a `Columns`, property-accessible object (perhaps the table itself if it naturally supports property-access to columns) -* Define `Tables.schema(Tables.columns(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason +""" + Tables.columnaccess(x) => Bool + +Check whether an object has specifically defined that it implements the `Tables.columns` +function. Note that `Tables.columns` has generic fallbacks allowing it to produces `Columns` +objects, even if the input doesn't define `columnaccess`. Also note that just because an +object defines `columnaccess` doesn't mean a user should call `Tables.columns` on it; +`Tables.rows` will also work, providing a valid `Row` iterator. Hence, users should call +`Tables.rows` or `Tables.columns` depending on what is most natural for them to *consume* +instead of worrying about what and how the input produces. +""" +function columnaccess end -The final question is how `MyTable` can be a "sink" for any other table type. The answer is quite simple: use the interface functions! 
+columnaccess(x::T) where {T} = columnaccess(T) +columnaccess(::Type{T}) where {T} = false -* Define a function or constructor that takes, at a minimum, a single, untyped argument and then calls `Tables.rows` or `Tables.columns` on that argument to construct an instance of `MyTable` +""" + Tables.schema(x) => Union{Nothing, Tables.Schema} -For example, if `MyTable` is a row-oriented format, I might define my "sink" function like: -```julia -function MyTable(x) - Tables.istable(x) || throw(ArgumentError("MyTable requires a table input")) - rows = Tables.rows(x) - sch = Tables.schema(rows) - names = sch.names - types = sch.types - # custom constructor that creates an "empty" MyTable according to given column names & types - # note that the "unknown" schema case should be considered, i.e. when `sch.types => nothing` - mytbl = MyTable(names, types) - for row in rows - # a convenience function provided in Tables.jl for "unrolling" access to each column/property of a `Row` - # it works by applying a provided function to each value; see `?Tables.eachcolumn` for more details - Tables.eachcolumn(sch, row) do val, col, name - push!(mytbl[col], val) - end - end - return mytbl -end -``` +Attempt to retrieve the schema of the object returned by `Tables.rows` or `Tables.columns`. +If the `Row` iterator or `Columns` object can't determine its schema, `nothing` will be returned. +Otherwise, a `Tables.Schema` object is returned, with the column names and types available for use. 
+""" +function schema end -Alternatively, if `MyTable` is column-oriented, perhaps my definition would be more like: -```julia -function MyTable(x) - Tables.istable(x) || throw(ArgumentError("MyTable requires a table input")) - cols = Tables.columns(x) - # here we use Tables.eachcolumn to iterate over each column in a `Columns` object - return MyTable(collect(propertynames(cols)), [collect(col) for col in Tables.eachcolumn(cols)]) -end -``` +schema(x) = nothing -Obviously every table type is different, but via a combination of `Tables.rows` and `Tables.columns` each table type should be able to construct an instance of itself. """ -abstract type Table end + Tables.materializer(x) => Callable + +For a table input, return the "sink" function or "materializing" function that can take a +Tables.jl-compatible table input and make an instance of the table type. This enables "transform" +workflows that take table inputs, apply transformations, potentially converting the table to +a different form, and end with producing a table of the same type as the original input. The +default materializer is `Tables.columntable`, which converts any table input into a NamedTuple +of Vectors. 
+""" +function materializer end -# default definitions -istable(x::T) where {T} = istable(T) || TableTraits.isiterabletable(x) === true -istable(::Type{T}) where {T} = false -rowaccess(x::T) where {T} = rowaccess(T) -rowaccess(::Type{T}) where {T} = false -columnaccess(x::T) where {T} = columnaccess(T) -columnaccess(::Type{T}) where {T} = false -schema(x) = nothing -materializer(x) = columntable +materializer(x::T) where {T} = materializer(T) +materializer(::Type{T}) where {T} = columntable # Schema implementation """ diff --git a/src/fallbacks.jl b/src/fallbacks.jl index ce52df0..051a304 100644 --- a/src/fallbacks.jl +++ b/src/fallbacks.jl @@ -83,7 +83,7 @@ function rows(x::T) where {T} cols = columns(x) return RowIterator(cols, Int(rowcount(cols))) # otherwise, if the input is at least iterable, we'll wrap it in an IteratorWrapper - # which will iterate the input, validating that it supports the AbstractRow interface + # which will iterate the input, validating that it supports the Row interface # and unwrapping any DataValues that are encountered elseif IteratorInterfaceExtensions.isiterable(x) return nondatavaluerows(x) @@ -187,12 +187,16 @@ end return NamedTuple{Base.map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[]) end -# for some sinks, there's a concern about whether they can safely "own" columns from the input -# to be safe, they should always copy input columns, to avoid unintended mutation. -# when we've called buildcolumns, however, Tables.jl essentially built/owns the columns, -# and it's happy to pass ownership to the sink. Thus, any built columns will be wrapped -# in a CopiedColumns struct to signal to the sink that essentially "a copy has already been made" -# and they're safe to assume ownership +""" + Tables.CopiedColumns + +For some sinks, there's a concern about whether they can safely "own" columns from the input. +To be safe, they should always copy input columns, to avoid unintended mutation. 
+When we've called buildcolumns, however, Tables.jl essentially built/owns the columns, +and it's happy to pass ownership to the sink. Thus, any built columns will be wrapped +in a CopiedColumns struct to signal to the sink that essentially "a copy has already been made" +and they're safe to assume ownership +""" struct CopiedColumns{T} <: AbstractColumns x::T end @@ -212,7 +216,7 @@ columnnames(x::CopiedColumns) = columnnames(source(x)) # here's our generic fallback Tables.columns definition @inline function columns(x::T) where {T} # because this method is being called, we know `x` didn't define it's own Tables.columns method - # first check if it supports row access, and if so, build up the desired columns + # first check if it explicitly supports row access, and if so, build up the desired columns if rowaccess(T) r = rows(x) return CopiedColumns(buildcolumns(schema(r), r)) @@ -220,7 +224,7 @@ columnnames(x::CopiedColumns) = columnnames(source(x)) elseif TableTraits.supports_get_columns_copy_using_missing(x) return CopiedColumns(TableTraits.get_columns_copy_using_missing(x)) # otherwise, if the source is at least iterable, we'll wrap it in an IteratorWrapper and - # build columns from that, which will check if the source correctly iterates AbstractRows + # build columns from that, which will check if the source correctly iterates valid Row objects # and unwraps DataValues for us elseif IteratorInterfaceExtensions.isiterable(x) iw = nondatavaluerows(x) diff --git a/src/tofromdatavalues.jl b/src/tofromdatavalues.jl index 2329296..db71f3b 100644 --- a/src/tofromdatavalues.jl +++ b/src/tofromdatavalues.jl @@ -31,7 +31,7 @@ Base.IteratorSize(::Type{IteratorWrapper{S}}) where {S} = Base.IteratorSize(S) Base.length(rows::IteratorWrapper) = length(rows.x) Base.size(rows::IteratorWrapper) = size(rows.x) -@noinline invalidtable(::T, ::S) where {T, S} = throw(ArgumentError("'$T' iterates '$S' values, which don't satisfy the Tables.jl Row-iterator interface")) +@noinline 
invalidtable(::T, ::S) where {T, S} = throw(ArgumentError("'$T' iterates '$S' values, which doesn't satisfy the Tables.jl Row-iterator interface")) @inline function Base.iterate(rows::IteratorWrapper) x = iterate(rows.x) diff --git a/src/utils.jl b/src/utils.jl index 57249b3..7871a11 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -26,15 +26,15 @@ end Tables.eachcolumn(f, sch, row, args...) Tables.eachcolumn(Tables.columns(x)) - The first definition takes a function `f`, table schema `sch`, a `row` type (that satisfies the `Tables.AbstractRow` interface), and any other `args...`; - it generates calls to get the value for each column in the row (`Tables.getcolumn(row, nm)`) and then calls `f(val, col, name, args...)`, where `f` is the - user-provided function, `val` is a row's column value, `col` is the column index as an `Int`, and `name` is the row's column name as a `Symbol`. - - While the first definition applies to an `AbstractRow` object, the last definition simply returns an AbstractColumn iterator for a `Columns` object. - For example, one could "collect" every column of a `Columns` object by doing: - ```julia - vectors = [collect(col) for col in Tables.eachcolumn(Tables.columns(x))] - ``` +The first definition takes a function `f`, table schema `sch`, a `row` type (that satisfies the `Row` interface), and any other `args...`; +it generates calls to get the value for each column in the row (`Tables.getcolumn(row, nm)`) and then calls `f(val, col, name, args...)`, where `f` is the +user-provided function, `val` is a row's column value, `col` is the column index as an `Int`, and `name` is the row's column name as a `Symbol`. + +While the first definition applies to an `Row` object, the last definition simply returns an AbstractColumn iterator for a `Columns` object. 
+For example, one could "collect" every column of a `Columns` object by doing: +```julia +vectors = [collect(col) for col in Tables.eachcolumn(Tables.columns(x))] +``` """ function eachcolumn end @@ -112,7 +112,7 @@ end end end -# this are specialized `eachcolumn`s where we also want +# these are specialized `eachcolumn`s where we also want # the indexing of `columns` to be constant propagated, so it needs to be returned from the generated function @inline function eachcolumns(f::Base.Callable, sch::Schema{names, types}, row, columns, args...) where {names, types} if @generated From 74e303e4c56897c28c26b1a4bdfd8c58c05927c6 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Tue, 4 Feb 2020 01:44:22 -0700 Subject: [PATCH 03/15] Lots of doc updates --- .travis.yml | 8 ++ README.md | 180 +---------------------------------- docs/Manifest.toml | 93 ++++++++++++++++++ docs/Project.toml | 2 + docs/make.jl | 17 ++++ docs/src/index.md | 202 ++++++++++++++++++++++++++++++++++++++++ src/Tables.jl | 126 +++++++++++++++++++++---- src/matrix.jl | 4 +- src/namedtuples.jl | 29 ++++++ src/tofromdatavalues.jl | 20 ++++ 10 files changed, 485 insertions(+), 196 deletions(-) create mode 100644 docs/Manifest.toml create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/index.md diff --git a/.travis.yml b/.travis.yml index 4e969ba..21a7fe6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,6 +18,14 @@ matrix: exclude: - os: osx arch: x86 + include: + - stage: "Documentation" + julia: 1.3 + os: linux + script: + - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate(); Pkg.build("Tables")' + - julia --project=docs/ docs/make.jl + after_success: skip notifications: email: false after_success: diff --git a/README.md b/README.md index 1fbdcad..d8722e7 100644 --- a/README.md +++ b/README.md @@ -3,181 +3,9 @@ [![Build 
Status](https://travis-ci.org/JuliaData/Tables.jl.svg?branch=master)](https://travis-ci.org/JuliaData/Tables.jl) [![Codecov](https://codecov.io/gh/JuliaData/Tables.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaData/Tables.jl) -The Tables.jl package provides simple, yet powerful interface functions for working with all kinds tabular data through predictable access patterns. At its core, it provides two simple functions for accessing a source table's data, regardless of its storage format or orientation: +The Tables.jl package provides simple, yet powerful interface functions for working with all kinds tabular data. -```julia - Tables.rows(table) => Row iterator (aka Rows) - Tables.columns(table) => Columns -``` -These two functions return objects that satisfy the `Row` or `Columns` interfaces, which are: -| Required Methods | Default Definition | Brief Description | -| ---------------- | ------------------ | ----------------- | -| `Tables.getcolumn(x, i::Int)` | getfield(x, i) | Retrieve an entire column (`Columns`) or single column value (`Row`) by index | -| `Tables.getcolumn(x, nm::Symbol)` | getproperty(x, nm) | Retrieve an entire column (`Columns`) or single column value (`Row`) by name | -| `Tables.columnnames(x)` | propertynames(x) | Return column names for `Columns` or a `Row` as an indexable collection | -| Optional methods | | | -| `Tables.getcolumn(x, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(x, nm) | Given a column eltype `T`, index `i`, and column name `nm`, retrieve the column or column value. Provides a type-stable or even constant-prop-able mechanism for efficiency. 
+### Documentation -So `Rows` is any object that can be used like: -```julia -rows = Tables.rows(x) -for row in rows - for columnname in Tables.columnnames(row) - value = Tables.getcolumn(row, columnname) - end -end -``` -And `Columns` is any object that can be used like: -```julia -columns = Tables.columns(x) -for columnname in Tables.columnnames(columns) - column = Tables.getcolumn(columns, columnname) -end -``` - -In addition to these `Rows` and `Columns` objects, it's useful to be able to query properties of these objects: -* `Tables.schema(x::Union{Rows, Columns}) => Union{Tables.Schema, Nothing}`: returns a `Tables.Schema` object, or `nothing` if the table's schema is unknown -* For the `Tables.Schema` object: - * column names can be accessed as an indexable collection of Symbols like `sch.names` - * column types can be accessed as an indexable collection of types like `sch.types` - * See `?Tables.Schema` for more details on this type -Because many table types are able to provide a well-defined schema, it can enable optimizations for consumers when this schema can be queried upfront before data access. - -A big part of the power in these simple interface functions is that each, `Tables.rows` ***and*** `Tables.columns`, is defined for any table type, even if the table type only explicitly implements one interface function or the other. -This is accomplished by providing performant, generic fallback definitions in Tables.jl itself (though obviously nothing prevents a table type from implementing each interface function directly). - -This means that table *authors* only need to worry about providing a single, most natural access pattern to their table type, whereas table *consumers* don't need to worry about the storage format or orientation of a table source, but can instead focus on the most natural *consumption* pattern for data access (row-by-row or on entire columns). 
- -With these simple definitions, powerful workflows are enabled: -* A package providing data cleansing, manipulation, visualization, or analysis can automatically handle any number of decoupled input table types -* A tabular file format can have automatic integration with in-memory structures and translation to other file formats -* table-like database objects can be queried, streaming the results direclty to various file formats or in-memory table structures - -# Tables Interface - -So how does one go about satisfying the Tables.jl interface functions? It mainly depends on what you've already defined and the natural access patterns of your table: - -## `Tables.istable`: - -* `Tables.istable(::Type{<:MyTable}) = true`: this provides an explicit affirmation that your type implements the Tables interface -* `Tables.istable(x::MyTable) = x.istable`: alternatively, it may be the case that `MyTable` can only implement that Tables interface in some cases, known only at runtime; in this case, we can define `Tables.istable` on an ***instance*** of `MyTable` instead of the type. 
For consumers, this function should always be called on ***instances*** (like `Tables.istable(x)`), to ensure input tables are appropriately supported - -## To support `Rows`: - -* Define `Tables.rowaccess(::Type{<:MyTable}) = true`: this signals that `MyTable` supports iterating objects that satisfy the `Row` interface; note this function isn't meant for public use, but is instead used by Tables.jl itself to provide a generic fallback definition for `Tables.columns` on row-oriented sources -* Define `Tables.rows(x::MyTable)`: return a `Row`-iterator object (perhaps the table itself if it already defines a `Base.iterate` method that returns `Row` interface objects) -* Define `Tables.schema(Tables.rows(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason - -## To support `Columns`: - -* Define `Tables.columnaccess(::Type{<:MyTable}) = true`: this signals that `MyTable` supports returning an object satisfying the `Columns` interface; note this function isn't meant for public use, but is instead used by Tables.jl itself to provide a generic fallback definition for `Tables.rows` on column-oriented sources -* Define `Tables.columns(x::MyTable)`: return an object satisfying the `Columns` interface, perhaps the table itself if it naturally supports property-access to columns -* Define `Tables.schema(Tables.columns(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason - -## Consuming table inputs (i.e. ***using*** the Tables.jl interface) - -As the author of `MyTable`, I'm ecstatic that `MyTable` can now automatically be used by a number of other "table" packages, but another question is how `MyTable` can be a "sink" for any other table type. In other words, how do I actually ***use*** the Tables.jl interface? - -The answer is mostly straightforward: just use the interface functions. 
A note does need to be made with regards to how interfaces currently operate in Julia; there's no support for "dispatching" on objects satisfying interfaces, which means I can't just define `MyTable(table::Tables.Table)`. What most packages do is define a constructor (or "sink function") that takes a single, un-typed argument like: - -```julia -function MyTable(x) - rows = Tables.rows(x) - sch = Tables.schema(rows) - names = sch.names - types = sch.types - # custom constructor that creates an "empty" MyTable according to given column names & types - # note that the "unknown" schema case should be considered, i.e. when `Tables.schema(x) === nothing` - mytbl = MyTable(names, types) - for row in rows - # a convenience function provided in Tables.jl for "unrolling" access to each column/property of a `Row` - # it works by applying a provided function to each value; see `?Tables.eachcolumn` for more details - Tables.eachcolumn(sch, row) do val, columnindex::Int, columnname::Symbol - push!(mytbl[columnindex], val) - end - end - return mytbl -end -``` -In this example, `MyTable` defines a constructor that takes any tables input source, initializes an empty `MyTable`, and proceeds to iterate over the input rows, appending values to each column. Note that the function didn't do any validation on the input to check if it was a valid table: `Tables.rows(x)` will throw an error if `x` doesn't actually satisfy the Tables.jl interface. Alternatively, we could call `Tables.istable(x)` (as shown in the commented line at the start of the function) on the input before calling `Tables.rows(x)` if we needed to restrict things to known, valid Tables.jl. Note that doing this will prevent certain, valid table inputs from being consumed, due to their inability to confidently return `true` for `Tables.istable`, even at runtime (cases like `Generator`s, or `Vector{Any}`). 
In short, most package just call `Tables.rows`, allowing invalid source errors to be thrown while also accepting the maximum number of possible valid inputs. - -Alternatively, it may be more natural for `MyTable` to consume input data column-by-column, so my definition would be more like: -```julia -function MyTable(x) - cols = Tables.columns(x) - # here we use Tables.eachcolumn to iterate over each column in `cols`, which satisfies the `Columns` interface - return MyTable(collect(Tables.columnnames(cols)), [collect(col) for col in Tables.eachcolumn(cols)]) -end -``` - -Note that in neither case did we need to call `Tables.rowaccess` or `Tables.columnaccess`; those interface functions are only used internally by Tables.jl itself to provide the `Tables.rows` and `Tables.columns` fallback definitions. As a consumer, I only need to consider which of `Tables.rows` or `Tables.columns` better fits my use-case, knowing that if the input table isn't oriented naturally, the fallback definition will provide the access pattern I desire. Also note that in the column-oriented definition, we didn't even call `Tables.schema` since we just do a single iteration over each column. 
Also note that in the row-oriented case, we didn't account for the case when `Tables.schema(x) === nothing`; one way to support the unknown schema case is to do something like: -```julia -function MyTable(x) - rows = Tables.rows(x) - state = iterate(rows) - if state === nothing - # the input table was empty, so return an empty MyTable - return MyTable() - end - row, st = state - columnnames = Tables.columnnames(row) - # create a Tables.Schema manually w/ just the column names from the first row - sch = Tables.Schema(columnnames, nothing) - cols = length(columnnames) - # create an emtpy MyTable with just the expected column names - mytbl = MyTable(columnnames) - while state !== nothing - row, st = state - Tables.eachcolumn(sch, row) do val, columnindex::Int, columnname::Symbol - push!(mytbl[columnindex], val) - end - state = iterate(rows, st) - end - return mytbl -end -``` - -## Functions that input and output tables: - -For functions that input a table, perform some calculation, and output a new table, we need a way of constructing the preferred output table given the input. For this purpose, `Tables.materializer(table)` returns the preferred sink function for a table (`Tables.columntable`, which creates a named tuple of AbstractVectors, is the default). - -Note that an in-memory table with a properly defined "sink" function can reconstruct itself with the following: - -```julia -materializer(table)(Tables.columns(table)) - -materializer(table)(Tables.rows(table)) -``` - -For example, we may want to select a subset of columns from a column-access table. One way we could implement it is with the following: - -```julia -function select(table, cols::Symbol...) 
- nt = Tables.columntable(table) # columntable(t) creates a NamedTuple of AbstractVectors - newcols = NamedTuple{cols}(nt) - Tables.materializer(table)(newcols) -end - -# Example of selecting columns from a columntable -tbl = (x=1:100, y=rand(100), z=randn(100)) -select(tbl, :x) -select(tbl, :x, :z) - -tbl = [(x=1, y="a", z=1.0), (x=2, y="b", z=2.0)] -select(tbl, :z, :x) -``` - -## Utilities -A number of "helper" utility functions are provided to aid in working with the Tables.jl collection of interfaces: - -* `rowtable(x)`: takes any input that satisfies the Tables.jl interface and converts it to a `Vector` of `NamedTuple`s, which itself satisfies the Tables.jl interface -* `rowtable(rt, x)`: take a "row table" (`Vector` of `NamedTuples`) and any table input `x` and appends `x` to `rt` -* `columntable(x)`: takes any input that satisfies the Tables.jl interface and converts it to a `NamedTuple` of `AbstractVector`s, which itself satisfies the Tables.jl interface -* `columntable(ct, x)`: takes a "column table (`NamedTuple` of `AbstractVector`s) and a table input `x` and appends `x` to `ct` -* `Tables.datavaluerows(x)`: takes any table input `x` and returns an iterator that will replace `missing` values with `DataValue`-wrapped values; this allows any table type to satisfy the `TableTraits.jl` Queryverse integration interface by defining: `IteratorInterfaceExtensions.getiterator(x::MyTable) = Tables.datavaluerows(x)` -* `Tables.nondatavaluerows(x)`: takes any iterator and replaces any `DataValue` values that are actually missing with `missing` -* `Tables.transform(x, transformfunctions...)`: create a lazy wrapper that satisfies the Tables.jl interface and applies `transformfunctions` to values when accessed; the tranform functions can be a NamedTuple or Dict mapping column name (`String` or `Symbol` or `Integer` index) to `Function` -* `Tables.select(x, columns...)`: create a lazy wrapper that satisfies the Tables.jl interface and keeps only the columns given by the 
`columns` arguments, which can be `String`s, `Symbol`s, or `Integer`s -* `Tables.table(x::AbstractMatrix)`: because any `AbstractMatrix` isn't a table by default, a convenience function is provided to treat an `AbstractMatrix` as a table; see `?Tables.table` for more details -* `Tables.matrix(x; transpose::Bool=false)`: a matrix "sink" function; takes any table input and converts to a dense `Matrix`; see `?Tables.matrix` for more details -* `Tables.eachcolumn`: convenience function for objects satisfying the `Row` or `Columns` interfaces which allows iterating or applying a function over each column; see `?Tables.eachcolumn` for more details +[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliadata.github.io/Tables.jl/stable) +[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliadata.github.io/Tables.jl/dev) \ No newline at end of file diff --git a/docs/Manifest.toml b/docs/Manifest.toml new file mode 100644 index 0000000..485804e --- /dev/null +++ b/docs/Manifest.toml @@ -0,0 +1,93 @@ +# This file is machine-generated - editing it directly is not advised + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "88bb0edb352b16608036faadcc071adda068582a" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.1" + +[[Documenter]] +deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "d45c163c7a3ae293c15361acc52882c0f853f97c" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.23.4" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] 
+git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.0" + +[[LibGit2]] +deps = ["Printf"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "c56ecb484f286639f161e712b8311f5ab77e8d32" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "0.3.8" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..dfa65cd --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,2 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..81328ad --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,17 @@ +using Documenter, Tables + +makedocs(; + modules=[Tables], + format=Documenter.HTML(), + pages=[ + "Home" => 
"index.md", + ], + repo="https://github.com/JuliaData/Tables.jl/blob/{commit}{path}#L{line}", + sitename="Tables.jl", + authors="Jacob Quinn", + assets=String[], +) + +deploydocs(; + repo="github.com/JuliaData/Tables.jl", +) diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..5da4fc9 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,202 @@ +# Tables.jl Documentation + +This guide provides documentation around the powerful tables interfaces in the Tables.jl package. +Note that the package, and hence, documentation, are geared towards package and library developers +who intend to implement and consume the interfaces. Users, on the other hand, benefit from these +other packages that provide useful access to table data in various formats or workflows. + +With that said, don't hesitate to [open a new issue](https://github.com/JuliaData/Tables.jl/issues/new), even +just for a question, or come chat with us on the [#data](https://julialang.slack.com/messages/data/) slack +channel with question, concerns, or clarifications. + +```@contents +``` + +## Using the Interface (i.e. consuming Tables.jl sources) + +We start by discussing _usage_ of the Tables.jl interfaces, since that can help contextualize _implementing_ them. + +At a high level, Tables.jl provides two powerful APIs for predictably accessing data from any table-like source: +```julia +# access data of input table `x` row-by-row +rows = Tables.rows(x) + +for row in rows + # example of getting all values in the row + # there are other ways to more efficiently process rows + rowvalues = [Tables.getcolumn(row, col) for col in Tables.columnnames(row)] +end + +# access data of input table `x` column-by-column +columns = Tables.columns(x) + +# iterate through each column name in table +for col in Tables.columnnames(columns) + # retrieve entire column by column name + # a column is an indexable collection + # with known length (i.e. 
supports + # `length(column)` and `column[i]`) + column = Tables.getcolumn(columns, col) +end +``` + +So we see two high-level functions here, `Tables.rows`, and `Tables.columns`. + +```@docs +Tables.rows +Tables.columns +``` + +Given these two powerful data access methods, let's walk through real, albeit somewhat simplified versions of how packages actually use these methods. + +### Tables.rows usage + +First up, let's take a look at the [SQLite.jl](https://github.com/JuliaDatabases/SQLite.jl) package and how it uses the Tables.jl interface to allow loading of generic table-like data into a sqlite relational table. Here's the code: +```julia +function load!(table, db::DB, tablename) + # get input table rows + rows = Tables.rows(table) + # query for schema of data + sch = Tables.schema(rows) + # create table using tablename and data schema + createtable!(db, tablename, sch) + # build insert statement + params = chop(repeat("?,", length(sch.names))) + stmt = Stmt(db, "INSERT INTO $tablename VALUES ($params)") + # start a transaction for inserting rows + transaction(db) do + # iterate over rows in the input table + for row in rows + # Tables.jl provides a utility function + # Tables.eachcolumn, which allows efficiently + # applying a function to each column value in a row + # it's called with a schema and row, and applies + # a user-provided function to the column `val`, index `i` + # and column name `nm`. Here, we bind the row values + # to our parameterized SQL INSERT statement and then + # call `sqlite3_step` to execute the INSERT statement. + Tables.eachcolumn(sch, row) do val, i, nm + bind!(stmt, i, val) + end + sqlite3_step(stmt.handle) + sqlite3_reset(stmt.handle) + end + end + return +end +``` + +This is pretty straightforward usage: it calls `Tables.rows` on the input table source, +and since we need the schema to set up the database table, we query it via `Tables.schema`. 
+We then iterate the rows in our table via `for row in rows`, and use the convenient +`Tables.eachcolumn` to efficiently apply a function to each value in the row. Note that +we didn't call `Tables.columnnames` or `Tables.getcolumn` at all, since they're utilized +by `Tables.eachcolumn` itself. + +One wrinkle to consider is the "unknown schema" case; i.e. what if our [`Tables.schema`](@ref) +call had returned `nothing`? +```julia +function load!(sch::Nothing, rows, db::DB, tablename) + # sch is nothing === unknown schema + # start iteration on input table rows + state = iterate(rows) + state === nothing && return + row, st = state + # query column names of first row + names = Tables.columnnames(row) + # partially construct Tables.Schema by at least passing + # the column names to it + sch = Tables.Schema(names, nothing) + # create table if needed + createtable!(db, tablename, sch) + # build insert statement + params = chop(repeat("?,", length(names))) + stmt = Stmt(db, "INSERT INTO $tablename VALUES ($params)") + # start a transaction for inserting rows + transaction(db) do + while true + # just like before, we can still use `Tables.eachcolumn` + # even with our partially constructed Tables.Schema + # to apply a function to each value in the row + Tables.eachcolumn(sch, row) do val, i, nm + bind!(stmt, i, val) + end + sqlite3_step(stmt.handle) + sqlite3_reset(stmt.handle) + # keep iterating rows until we finish + state = iterate(rows, st) + state === nothing && break + row, st = state + end + end + return +end +``` + +The strategy taken here is to start iterating the input source, and using the first row +as a guide, we make a `Tables.Schema` object with just the column names, which we can +then still pass to `Tables.eachcolumn` to apply our `bind!` function to each row value. + +### Tables.columns usage + +Ok, now let's take a look at a case utilizing `Tables.columns`. 
+The following code is taken from the [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl/blob/master/src/other/tables.jl) +Tables.jl implementation: +```julia +getvector(x::AbstractVector) = x +getvector(x) = collect(x) + +# note that copycols is ignored in this definition (Tables.CopiedColumns implies copies have already been made) +fromcolumns(x::Tables.CopiedColumns, names; copycols::Bool=true) = + DataFrame(AbstractVector[getvector(Tables.getcolumn(x, nm)) for nm in names], + Index(names), + copycols=false) +fromcolumns(x, names; copycols::Bool=true) = + DataFrame(AbstractVector[getvector(Tables.getcolumn(x, nm)) for nm in names], + Index(names), + copycols=copycols) + +function DataFrame(x; copycols::Bool=true) + # get columns from input table source + cols = Tables.columns(x) + # get column names as Vector{Symbol}, which is required + # by core DataFrame constructor + names = collect(Symbol, Tables.columnnames(cols)) + return fromcolumns(cols, names; copycols=copycols) +end +``` + +So here we have a generic `DataFrame` constructor that takes a single, untyped argument, +calls `Tables.columns` on it, then `Tables.columnnames` to get the column names. +It then passes the `Columns`-compatible object to an internal function `fromcolumns`, +which dispatches on a special kind of `Columns` object called a [`Tables.CopiedColumns`](@ref), +which wraps any `Columns` object that has already had copies of its columns made, and are thus +safe for the columns-consumer to assume ownership of (this is because DataFrames.jl, by default +makes copies of all columns upon construction). In both cases, individual columns are collected +in `Vector{AbstractVector}`s by calling `Tables.getcolumn(x, nm)` for each column name. +A final note is the call to `getvector` on each column, which ensures each column is materialized +as an `AbstractVector`, as is required by the DataFrame constructor. 
+ +Note that in both the rows and columns usages, we didn't need to worry about the natural orientation +of the input data; we just called `Tables.rows` or `Tables.columns` as was most natural for +the table-specific use-case, knowing that it will Just Work™️. + +### Tables.jl Utilities + +Before moving on to _implementing_ the Tables.jl interfaces, we take a quick +break to highlight some useful utility functions provided by Tables.jl: +```@docs +Tables.rowtable +Tables.columntable +Tables.datavaluerows +Tables.nondatavaluerows +Tables.table +Tables.matrix +Tables.eachcolumn +Tables.materializer +Tables.columnindex +Tables.columntype +``` + +## Implementing the Interface (i.e. becoming a Tables.jl source) + diff --git a/src/Tables.jl b/src/Tables.jl index 7fc041e..1143726 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -8,6 +8,31 @@ if !hasmethod(getproperty, Tuple{Tuple, Int}) Base.getproperty(t::Tuple, i::Int) = t[i] end +""" + Columns + +An interface type defined as an ordered set of columns that support +retrieval of individual columns by name or index. A retrieved column +must be an indexable collection with known length, i.e. an object +that supports `length(col)` and `col[i]` for any `i = 1:length(col)`. +The high-level [`Tables.columns`](@ref) function returns a `Columns`-compatible +object from any input table source. 
+ +Any object implements the `Columns` interface, by satisfying the following: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(table, i::Int)` | getfield(table, i) | Retrieve a column by index | +| `Tables.getcolumn(table, nm::Symbol)` | getproperty(table, nm) | Retrieve a column by name | +| `Tables.columnnames(table)` | propertynames(table) | Return column names for a table as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(table, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(table, nm) | Given a column eltype `T`, index `i`, and column name `nm`, retrieve the column. Provides a type-stable or even constant-prop-able mechanism for efficiency. + +Note that table sources shouldn't subtype `Columns`, as it is purely an interface type +to help document the Tables.jl API. See the [`Tables.AbstractColumns`](@ref) type +for a type to potentially subtype to gain useful default behaviors. +""" +abstract type Columns end + """ Tables.AbstractColumns @@ -34,6 +59,28 @@ This allows a custom table type to behave as close as possible to a builtin `Nam """ abstract type AbstractColumns end +""" + Row + +An interface type that represents a single row of a table, with column values retrievable by name or index. +The high-level [`Tables.rows`](@ref) function returns a `Row`-compatible +iterator from any input table source. 
+ +Any object implements the `Row` interface, by satisfying the following: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(row, i::Int)` | getfield(row, i) | Retrieve a column value by index | +| `Tables.getcolumn(row, nm::Symbol)` | getproperty(row, nm) | Retrieve a column value by name | +| `Tables.columnnames(row)` | propertynames(row) | Return column names for a row as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(row, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(row, nm) | Given a column type `T`, index `i`, and column name `nm`, retrieve the column value. Provides a type-stable or even constant-prop-able mechanism for efficiency. + +Note that custom row types shouldn't subtype `Row`, as it is purely an interface type +to help document the Tables.jl API. See the [`Tables.AbstractRow`](@ref) type +for a type to potentially subtype to gain useful default behaviors. +""" +abstract type Row end + """ Tables.AbstractRow @@ -58,7 +105,7 @@ While custom row types aren't required to subtype `Tables.AbstractRow`, benefits * A default `show` method This allows the custom row type to behave as close as possible to a builtin `NamedTuple` object. 
""" -abstract type AbstractRow <: AbstractColumns end +abstract type AbstractRow end """ Tables.getcolumn(::Columns, nm::Symbol) => Indexable collection with known length @@ -93,23 +140,31 @@ function columnnames end columnnames(x) = propertynames(x) -Base.IteratorSize(::Type{R}) where {R <: AbstractColumns} = Base.HasLength() -Base.length(r::AbstractColumns) = length(columnnames(r)) -Base.firstindex(r::AbstractColumns) = 1 -Base.lastindex(r::AbstractColumns) = length(r) -Base.getindex(r::AbstractColumns, i::Int) = getcolumn(r, i) -Base.getindex(r::AbstractColumns, nm::Symbol) = getcolumn(r, nm) -Base.getproperty(r::AbstractColumns, nm::Symbol) = getcolumn(r, nm) -Base.getproperty(r::AbstractColumns, i::Int) = getcolumn(r, i) -Base.propertynames(r::AbstractColumns) = columnnames(r) -Base.keys(r::AbstractColumns) = columnnames(r) -Base.values(r::AbstractColumns) = collect(r) -Base.haskey(r::AbstractColumns, key::Union{Integer, Symbol}) = key in columnnames(r) -Base.get(r::AbstractColumns, key::Union{Integer, Symbol}, default) = haskey(r, key) ? getcolumn(r, key) : default -Base.get(f::Base.Callable, r::AbstractColumns, key::Union{Integer, Symbol}) = haskey(r, key) ? getcolumn(r, key) : f() -Base.iterate(r::AbstractColumns, i=1) = i > length(r) ? 
nothing : (getcolumn(r, i), i + 1) - -function Base.show(io::IO, x::T) where {T <: AbstractColumns} +# default definitions for AbstractDict +getcolumn(x::AbstractDict, i::Int) = x[i] +getcolumn(x::AbstractDict, nm::Symbol) = x[nm] +columnnames(x::AbstractDict) = collect(keys(x)) + +# default definitions for AbstractRow, AbstractColumns +const RorC = Union{AbstractRow, AbstractColumns} + +Base.IteratorSize(::Type{R}) where {R <: RorC} = Base.HasLength() +Base.length(r::RorC) = length(columnnames(r)) +Base.firstindex(r::RorC) = 1 +Base.lastindex(r::RorC) = length(r) +Base.getindex(r::RorC, i::Int) = getcolumn(r, i) +Base.getindex(r::RorC, nm::Symbol) = getcolumn(r, nm) +Base.getproperty(r::RorC, nm::Symbol) = getcolumn(r, nm) +Base.getproperty(r::RorC, i::Int) = getcolumn(r, i) +Base.propertynames(r::RorC) = columnnames(r) +Base.keys(r::RorC) = columnnames(r) +Base.values(r::RorC) = collect(r) +Base.haskey(r::RorC, key::Union{Integer, Symbol}) = key in columnnames(r) +Base.get(r::RorC, key::Union{Integer, Symbol}, default) = haskey(r, key) ? getcolumn(r, key) : default +Base.get(f::Base.Callable, r::RorC, key::Union{Integer, Symbol}) = haskey(r, key) ? getcolumn(r, key) : f() +Base.iterate(r::RorC, i=1) = i > length(r) ? nothing : (getcolumn(r, i), i + 1) + +function Base.show(io::IO, x::T) where {T <: RorC} println(io, "$T:") names = collect(columnnames(x)) values = [getcolumn(row, nm) for nm in names] @@ -190,6 +245,41 @@ function materializer end materializer(x::T) where {T} = materializer(T) materializer(::Type{T}) where {T} = columntable +""" + Tables.columns(x) => Columns-compatible object + +Accesses data of input table source `x` by returning a [`Columns`](@ref)-compatible +object, which allows retrieving entire columns by name or index. A retrieved column +is an object that is indexable and has a known length, i.e. supports +`length(col)` and `col[i]` for any `i = 1:length(col)`. 
Note that +even if the input table source is row-oriented by nature, an efficient generic +definition of `Tables.columns` is defined in Tables.jl to build a `Columns`- +compatible object from the input rows. + +The [`Tables.Schema`](@ref) of a `Columns` object can be queried via `Tables.schema(columns)`, +which may return `nothing` if the schema is unknown. +Column names can be queried by calling `Tables.columnnames(columns)`. And individual columns +can be accessed by calling `Tables.getcolumn(columns, i::Int)` or `Tables.getcolumn(columns, nm::Symbol)` +with a column index or name, respectively. +""" +function columns end + +""" + Tables.rows(x) => Row iterator + +Accesses data of input table source `x` row-by-row by returning a [`Row`](@ref) iterator. +Note that even if the input table source is column-oriented by nature, an efficient generic +definition of `Tables.rows` is defined in Tables.jl to return an iterator of row views into +the columns of the input. + +The [`Tables.Schema`](@ref) of a `Row` iterator can be queried via `Tables.schema(rows)`, +which may return `nothing` if the schema is unknown. +Column names can be queried by calling `Tables.columnnames(row)` on an individual row. +And row values can be accessed by calling `Tables.getcolumn(row, i::Int)` or +`Tables.getcolumn(row, nm::Symbol)` with a column index or name, respectively. +""" +function rows end + # Schema implementation """ Tables.Schema(names, types) diff --git a/src/matrix.jl b/src/matrix.jl index e93872b..93fa7f5 100644 --- a/src/matrix.jl +++ b/src/matrix.jl @@ -43,7 +43,7 @@ getcolumn(m::MatrixTable, i::Int) = getfield(m, :matrix)[:, i] columnnames(m::MatrixTable) = names(m) """ -Tables.table(m::AbstractMatrix; [header::Vector{Symbol}]) + Tables.table(m::AbstractMatrix; [header::Vector{Symbol}]) Wrap an `AbstractMatrix` (`Matrix`, `Adjoint`, etc.) in a `MatrixTable`, which satisfies the Tables.jl interface. 
This allows accesing the matrix via `Tables.rows` and @@ -57,7 +57,7 @@ function table(m::AbstractMatrix; header::Vector{Symbol}=[Symbol("Column$i") for end """ -Tables.matrix(table; transpose::Bool=false) + Tables.matrix(table; transpose::Bool=false) Materialize any table source input as a `Matrix`. If the table column types are not homogenous, they will be promoted to a common type in the materialized `Matrix`. Note that column names are diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 79d4d9a..75ed0f8 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -49,6 +49,21 @@ namedtupleiterator(::Type{T}, rows::S) where {T <: NamedTuple, S} = rows namedtupleiterator(::Type{T}, rows::S) where {T, S} = NamedTupleIterator{typeof(schema(rows)), S}(rows) # sink function +""" + Tables.rowtable(x) => Vector{NamedTuple} + Tables.rowtable(rt, x) => rt + +Take any input table source, and produce a Vector of NamedTuples, +also known as a "row table". A "row table" is a kind of default +table type of sorts, since it satisfies the Tables.jl row interface +naturally. + +The 2nd definition takes +an existing row table and appends the input table source `x` +to the existing row table. +""" +function rowtable end + function rowtable(itr::T) where {T} r = rows(itr) return collect(namedtupleiterator(eltype(r), r)) @@ -80,6 +95,20 @@ materializer(x::ColumnTable) = columntable getarray(x::AbstractArray) = x getarray(x) = collect(x) +""" + Tables.columntable(x) => NamedTuple of Vectors + Tables.columntable(ct, x) => ct + +Takes any input table source `x` and returns a NamedTuple of Vectors, +also known as a "column table". A "column table" is a kind of default +table type of sorts, since it satisfies the Tables.jl column interface +naturally. + +The 2nd definition takes an input table source `x` and appends it to an +existing column table `ct`. 
+""" +function columntable end + function columntable(sch::Schema{names, types}, cols) where {names, types} if @generated vals = Tuple(:(getarray(getcolumn(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types)) diff --git a/src/tofromdatavalues.jl b/src/tofromdatavalues.jl index db71f3b..cb328f0 100644 --- a/src/tofromdatavalues.jl +++ b/src/tofromdatavalues.jl @@ -13,6 +13,14 @@ struct IteratorWrapper{S} x::S end +""" + Tables.nondatavaluerows(x) + +Takes any Queryverse-compatible NamedTuple iterator source and +converts to a Tables.jl-compatible Row iterator. Will automatically +unwrap any `DataValue`s, replacing `NA` with `missing`. +Useful for translating Query.jl results back to non-DataValue-based tables. +""" nondatavaluerows(x) = IteratorWrapper(IteratorInterfaceExtensions.getiterator(x)) Tables.istable(::Type{<:IteratorWrapper}) = true Tables.rowaccess(::Type{<:IteratorWrapper}) = true @@ -70,6 +78,18 @@ struct DataValueRowIterator{NT, S} x::S end +""" + Tables.datavaluerows(x) => NamedTuple iterator + +Takes any table input `x` and returns a NamedTuple iterator +that will replace missing values with DataValue-wrapped values; +this allows any table type to satisfy the TableTraits.jl +Queryverse integration interface by defining: + +``` +IteratorInterfaceExtensions.getiterator(x::MyTable) = Tables.datavaluerows(x) +``` +""" function datavaluerows(x) r = Tables.rows(x) s = Tables.schema(r) From 7f790aa8ae376155d877a19f0dbd9da5f897c428 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Wed, 5 Feb 2020 06:57:19 -0700 Subject: [PATCH 04/15] Doc updates from review --- docs/Manifest.toml | 93 ----------------------------------------- docs/Project.toml | 3 ++ docs/src/index.md | 2 +- src/Tables.jl | 16 +++---- src/fallbacks.jl | 12 +++--- src/matrix.jl | 4 +- src/namedtuples.jl | 4 +- src/operations.jl | 21 +++++++++- src/tofromdatavalues.jl | 10 ++--- src/utils.jl | 2 +- 10 files changed, 48 insertions(+), 119 deletions(-) delete mode 
100644 docs/Manifest.toml diff --git a/docs/Manifest.toml b/docs/Manifest.toml deleted file mode 100644 index 485804e..0000000 --- a/docs/Manifest.toml +++ /dev/null @@ -1,93 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[DocStringExtensions]] -deps = ["LibGit2", "Markdown", "Pkg", "Test"] -git-tree-sha1 = "88bb0edb352b16608036faadcc071adda068582a" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.1" - -[[Documenter]] -deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "d45c163c7a3ae293c15361acc52882c0f853f97c" -uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.23.4" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.0" - -[[LibGit2]] -deps = ["Printf"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[Parsers]] -deps = ["Dates", "Test"] -git-tree-sha1 = "c56ecb484f286639f161e712b8311f5ab77e8d32" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.8" - -[[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Printf]] -deps = ["Unicode"] -uuid = 
"de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[Test]] -deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/Project.toml b/docs/Project.toml index dfa65cd..2ccde02 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,2 +1,5 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" + +[compat] +Documenter = "~0.23" \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 5da4fc9..4d397f0 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -51,7 +51,7 @@ Given these two powerful data access methods, let's walk through real, albeit so ### Tables.rows usage -First up, let's take a look at the [SQLite.jl]() package and how it uses the Tables.jl interface to allow loading of generic table-like data into a sqlite relational table. Here's the code: +First up, let's take a look at the [SQLite.jl](https://github.com/JuliaDatabases/SQLite.jl) package and how it uses the Tables.jl interface to allow loading of generic table-like data into a sqlite relational table. 
Here's the code: ```julia function load!(table, db::DB, tablename) # get input table rows diff --git a/src/Tables.jl b/src/Tables.jl index 1143726..dde5e26 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -9,7 +9,7 @@ if !hasmethod(getproperty, Tuple{Tuple, Int}) end """ - Columns + Tables.Columns An interface type defined as an ordered set of columns that support retrieval of individual columns by name or index. A retrieved column @@ -60,7 +60,7 @@ This allows a custom table type to behave as close as possible to a builtin `Nam abstract type AbstractColumns end """ - Row + Tables.Row An interface type that represents a single row of a table, with column values retrievable by name or index. The high-level [`Tables.rows`](@ref) function returns a `Row`-compatible @@ -177,8 +177,8 @@ end Check if an object has specifically defined that it is a table. Note that not all valid tables will return true, since it's possible to satisfy the -Tables.jl interface at "run-time", e.g. a Generator of NamedTuples iterates -NamedTuples, which satisfies the Row interface, but there's no static way +Tables.jl interface at "run-time", e.g. a `Generator` of `NamedTuple`s iterates +`NamedTuple`s, which satisfies the Row interface, but there's no static way of knowing that the generator is a table. """ function istable end @@ -190,8 +190,8 @@ istable(::Type{T}) where {T} = false Tables.rowaccess(x) => Bool Check whether an object has specifically defined that it implements the `Tables.rows` -function. Note that `Tables.rows` will work on any object that iterates Row-compatible -objects, even if they don't define `rowaccess`, e.g. a Generator of NamedTuples. Also +function. Note that `Tables.rows` will work on any object that iterates `Row`-compatible +objects, even if they don't define `rowaccess`, e.g. a `Generator` of `NamedTuple`s. 
Also note that just because an object defines `rowaccess` doesn't mean a user should call `Tables.rows` on it; `Tables.columns` will also work, providing a valid `Columns` object from the rows. Hence, users should call `Tables.rows` or `Tables.columns` @@ -237,8 +237,8 @@ For a table input, return the "sink" function or "materializing" function that c Tables.jl-compatible table input and make an instance of the table type. This enables "transform" workflows that take table inputs, apply transformations, potentially converting the table to a different form, and end with producing a table of the same type as the original input. The -default materializer is `Tables.columntable`, which converts any table input into a NamedTuple -of Vectors. +default materializer is `Tables.columntable`, which converts any table input into a `NamedTuple` +of `Vector`s. """ function materializer end diff --git a/src/fallbacks.jl b/src/fallbacks.jl index 051a304..946bfe5 100644 --- a/src/fallbacks.jl +++ b/src/fallbacks.jl @@ -94,10 +94,10 @@ end # for Rows objects, we define a "collect"-like routine to build up columns from iterated rows """ - Tables.allocatecolumn(::Type{T}, len) => returns a column type (usually AbstractVector) with size to hold `len` elements + Tables.allocatecolumn(::Type{T}, len) => returns a column type (usually `AbstractVector`) with size to hold `len` elements - Custom column types can override with an appropriate "scalar" element type that should dispatch to their column allocator. - Alternatively, and more generally, custom scalars can overload `DataAPI.defaultarray` to signal the default array type +Custom column types can override with an appropriate "scalar" element type that should dispatch to their column allocator. +Alternatively, and more generally, custom scalars can overload `DataAPI.defaultarray` to signal the default array type. 
""" allocatecolumn(T, len) = DataAPI.defaultarray(T, 1)(undef, len) @@ -192,10 +192,10 @@ end For some sinks, there's a concern about whether they can safely "own" columns from the input. To be safe, they should always copy input columns, to avoid unintended mutation. -When we've called buildcolumns, however, Tables.jl essentially built/owns the columns, +When we've called `buildcolumns`, however, Tables.jl essentially built/owns the columns, and it's happy to pass ownership to the sink. Thus, any built columns will be wrapped -in a CopiedColumns struct to signal to the sink that essentially "a copy has already been made" -and they're safe to assume ownership +in a `CopiedColumns` struct to signal to the sink that essentially "a copy has already been made" +and they're safe to assume ownership. """ struct CopiedColumns{T} <: AbstractColumns x::T diff --git a/src/matrix.jl b/src/matrix.jl index 93fa7f5..e23caa2 100644 --- a/src/matrix.jl +++ b/src/matrix.jl @@ -43,7 +43,7 @@ getcolumn(m::MatrixTable, i::Int) = getfield(m, :matrix)[:, i] columnnames(m::MatrixTable) = names(m) """ - Tables.table(m::AbstractMatrix; [header::Vector{Symbol}]) + Tables.table(m::AbstractMatrix; [header::Vector{Symbol}]) Wrap an `AbstractMatrix` (`Matrix`, `Adjoint`, etc.) in a `MatrixTable`, which satisfies the Tables.jl interface. This allows accesing the matrix via `Tables.rows` and @@ -57,7 +57,7 @@ function table(m::AbstractMatrix; header::Vector{Symbol}=[Symbol("Column$i") for end """ - Tables.matrix(table; transpose::Bool=false) + Tables.matrix(table; transpose::Bool=false) Materialize any table source input as a `Matrix`. If the table column types are not homogenous, they will be promoted to a common type in the materialized `Matrix`. 
Note that column names are diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 75ed0f8..1fd1178 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -53,7 +53,7 @@ namedtupleiterator(::Type{T}, rows::S) where {T, S} = NamedTupleIterator{typeof( Tables.rowtable(x) => Vector{NamedTuple} Tables.rowtable(rt, x) => rt -Take any input table source, and produce a Vector of NamedTuples, +Take any input table source, and produce a `Vector` of `NamedTuple`s, also known as a "row table". A "row table" is a kind of default table type of sorts, since it satisfies the Tables.jl row interface naturally. @@ -99,7 +99,7 @@ getarray(x) = collect(x) Tables.columntable(x) => NamedTuple of Vectors Tables.columntable(ct, x) => ct -Takes any input table source `x` and returns a NamedTuple of Vectors, +Takes any input table source `x` and returns a `NamedTuple` of `Vector`s, also known as a "column table". A "column table" is a kind of default table type of sorts, since it satisfies the Tables.jl column interface naturally. diff --git a/src/operations.jl b/src/operations.jl index 82fa1cb..62bdbe1 100644 --- a/src/operations.jl +++ b/src/operations.jl @@ -17,7 +17,17 @@ end columnnames(t::Transforms{true}) = columnnames(getfield(t, 1)) getcolumn(t::Transforms{true}, nm::Symbol) = Base.map(getfunc(t, getfield(t, 2), nm), getcolumn(getfield(t, 1), nm)) -getcolumn(t::Transforms{true}, i::Int) = Base.map(getfunc(t, getfield(t, 2), nm), getcolumn(getfield(t, 1), i)) +getcolumn(t::Transforms{true}, i::Int) = Base.map(getfunc(t, getfield(t, 2), i), getcolumn(getfield(t, 1), i)) + +""" + Tables.transform(source, funcs) => Tables.Transforms + source |> Tables.transform(funcs) => Tables.Transforms + +***EXPERIMENTAL - May be moved or removed in a future release*** +Given any Tables.jl-compatible source, apply a series of transformation functions, for the columns specified in `funcs`.
+The transform functions can be a NamedTuple or Dict mapping column name (`String` or `Symbol` or `Integer` index) to Function. +""" +function transform end transform(funcs) = x->transform(x, funcs) transform(; kw...) = transform(kw.data) @@ -62,6 +72,15 @@ struct Select{T, columnaccess, names} <: AbstractColumns source::T end +""" + Tables.select(source, columns...) => Tables.Select + source |> Tables.select(columns...) => Tables.Select + +***EXPERIMENTAL - May be moved or removed in a future release*** +Create a lazy wrapper that satisfies the Tables.jl interface and keeps only the columns given by the `columns` arguments, which can be `String`s, `Symbol`s, or `Integer`s. +""" +function select end + select(names::Symbol...) = x->select(x, names...) select(names::String...) = x->select(x, Base.map(Symbol, names)...) select(inds::Integer...) = x->select(x, Base.map(Int, inds)...) diff --git a/src/tofromdatavalues.jl b/src/tofromdatavalues.jl index cb328f0..b380e09 100644 --- a/src/tofromdatavalues.jl +++ b/src/tofromdatavalues.jl @@ -16,10 +16,10 @@ end """ Tables.nondatavaluerows(x) -Takes any Queryverse-compatible NamedTuple iterator source and -converts to a Tables.jl-compatible Row iterator. Will automatically +Takes any Queryverse-compatible `NamedTuple` iterator source and +converts to a Tables.jl-compatible `Row` iterator. Will automatically unwrap any `DataValue`s, replacing `NA` with `missing`. -Useful for translating Query.jl results back to non-`DataValue`-based tables.
""" nondatavaluerows(x) = IteratorWrapper(IteratorInterfaceExtensions.getiterator(x)) Tables.istable(::Type{<:IteratorWrapper}) = true @@ -81,8 +81,8 @@ end """ Tables.datavaluerows(x) => NamedTuple iterator -Takes any table input `x` and returns a NamedTuple iterator -that will replace missing values with DataValue-wrapped values; +Takes any table input `x` and returns a `NamedTuple` iterator +that will replace missing values with `DataValue`-wrapped values; this allows any table type to satisfy the TableTraits.jl Queryverse integration interface by defining: diff --git a/src/utils.jl b/src/utils.jl index 7871a11..8fbe343 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -30,7 +30,7 @@ The first definition takes a function `f`, table schema `sch`, a `row` type (tha it generates calls to get the value for each column in the row (`Tables.getcolumn(row, nm)`) and then calls `f(val, col, name, args...)`, where `f` is the user-provided function, `val` is a row's column value, `col` is the column index as an `Int`, and `name` is the row's column name as a `Symbol`. -While the first definition applies to an `Row` object, the last definition simply returns an AbstractColumn iterator for a `Columns` object. +While the first definition applies to a `Row` object, the last definition simply returns an AbstractColumn iterator for a `Columns` object. 
For example, one could "collect" every column of a `Columns` object by doing: ```julia vectors = [collect(col) for col in Tables.eachcolumn(Tables.columns(x))] From 97fb679cfecd48bf6cdf96dfd7b8065894a2749f Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Wed, 5 Feb 2020 15:01:58 -0700 Subject: [PATCH 05/15] Rewrite namedtupleiterator and add it to docs/utilities --- docs/src/index.md | 1 + src/Tables.jl | 2 +- src/namedtuples.jl | 79 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 4d397f0..fc0b84a 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -188,6 +188,7 @@ break to highlight some useful utility functions provided by Tables.jl: ```@docs Tables.rowtable Tables.columntable +Tables.namedtupleiterator Tables.datavaluerows Tables.nondatavaluerows Tables.table diff --git a/src/Tables.jl b/src/Tables.jl index dde5e26..02132fe 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -167,7 +167,7 @@ Base.iterate(r::RorC, i=1) = i > length(r) ? nothing : (getcolumn(r, i), i + 1) function Base.show(io::IO, x::T) where {T <: RorC} println(io, "$T:") names = collect(columnnames(x)) - values = [getcolumn(row, nm) for nm in names] + values = [getcolumn(x, nm) for nm in names] Base.print_matrix(io, hcat(names, values)) end diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 1fd1178..a204c40 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -10,44 +10,89 @@ schema(x::AbstractVector{NamedTuple{names, types}}) where {names, types} = Schem materializer(x::RowTable) = rowtable # struct to transform `Row`s into NamedTuples -struct NamedTupleIterator{S, T} +struct NamedTupleIterator{schema, T, S} x::T + st::S end -Base.IteratorEltype(::Type{NamedTupleIterator{S, T}}) where {S, T} = S === Nothing ? 
Base.EltypeUnknown() : Base.HasEltype() -Base.eltype(::Type{NamedTupleIterator{Schema{names, T}, S}}) where {names, T, S} = NamedTuple{Base.map(Symbol, names), T} -Base.IteratorSize(::Type{NamedTupleIterator{S, T}}) where {S, T} = Base.IteratorSize(T) + +""" + Tables.namedtupleiterator(x) + +Pass any table input source and return a `NamedTuple` iterator +""" +function namedtupleiterator(x) + r = rows(x) + sch = schema(r) + st = iterate(r) + if st === nothing + # input was empty + return NamedTupleIterator{Schema((), ()), typeof(r), typeof(st)}(r, st) + end + row, state = st + if sch === nothing + s = Schema(columnnames(row), nothing) + else + s = sch + end + return NamedTupleIterator{typeof(s), typeof(r), typeof(st)}(r, st) +end + +namedtupleiterator(::Type{T}, x) where {T <: NamedTuple} = x +namedtupleiterator(T, x) = namedtupleiterator(x) + +Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T, S}}) where {names, types, T, S} = Base.HasEltype() +Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T, S}}) where {names, types, T, S} = types === nothing ? 
NamedTuple{Base.map(Symbol, names)} : NamedTuple{Base.map(Symbol, names), types} +Base.IteratorSize(::Type{NamedTupleIterator{sch, T, S}}) where {sch, T, S} = Base.IteratorSize(T) Base.length(nt::NamedTupleIterator) = length(nt.x) Base.size(nt::NamedTupleIterator) = (length(nt.x),) -function Base.iterate(rows::NamedTupleIterator{Schema{names, T}}, st=()) where {names, T} +@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, T}, T1, T2}) where {names, T, T1, T2} + if @generated + vals = Tuple(:(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) + return quote + Base.@inline_meta + rows.st === nothing && return nothing + row, st = rows.st + return $(NamedTuple{Base.map(Symbol, names), T})(($(vals...),)), st + end + else + rows.st === nothing && return nothing + row, st = rows.st + return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), st + end +end + +@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, T}}, st) where {names, T} if @generated vals = Tuple(:(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) return quote - x = iterate(rows.x, st...) + Base.@inline_meta + x = iterate(rows.x, st) x === nothing && return nothing row, st = x - return $(NamedTuple{Base.map(Symbol, names), T})(($(vals...),)), (st,) + return $(NamedTuple{Base.map(Symbol, names), T})(($(vals...),)), st end else - x = iterate(rows.x, st...) + x = iterate(rows.x, st) x === nothing && return nothing row, st = x - return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,) + return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), st end end -# unknown schema case -function Base.iterate(rows::NamedTupleIterator{Nothing, T}, st=()) where {T} - x = iterate(rows.x, st...) 
+@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, nothing}}) where {names} + rows.st === nothing && return nothing + row, st = rows.st + return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), st +end + +@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, nothing}}, st) where {names} + x = iterate(rows.x, st) x === nothing && return nothing row, st = x - names = Tuple(columnnames(row)) - return NamedTuple{Base.map(Symbol, names)}(Tuple(getcolumn(row, nm) for nm in names)), (st,) + return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), st end -namedtupleiterator(::Type{T}, rows::S) where {T <: NamedTuple, S} = rows -namedtupleiterator(::Type{T}, rows::S) where {T, S} = NamedTupleIterator{typeof(schema(rows)), S}(rows) - # sink function """ Tables.rowtable(x) => Vector{NamedTuple} From 5de2ed5aa9c76a0b5943a40e3ba30d0ec8297eb9 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Wed, 5 Feb 2020 15:07:56 -0700 Subject: [PATCH 06/15] Take out unnecessary inline --- src/namedtuples.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/namedtuples.jl b/src/namedtuples.jl index a204c40..818169f 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -50,7 +50,6 @@ Base.size(nt::NamedTupleIterator) = (length(nt.x),) if @generated vals = Tuple(:(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) return quote - Base.@inline_meta rows.st === nothing && return nothing row, st = rows.st return $(NamedTuple{Base.map(Symbol, names), T})(($(vals...),)), st @@ -66,7 +65,6 @@ end if @generated vals = Tuple(:(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) return quote - Base.@inline_meta x = iterate(rows.x, st) x === nothing && return nothing row, st = x From 26b7d93387de40e7aa92ee1ee50fb1e6d4e60f0a Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Wed, 5 Feb 2020 17:09:53 -0700 Subject: [PATCH 07/15] Fix tests --- src/namedtuples.jl | 12 
++++++------ test/runtests.jl | 8 +++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 818169f..af13754 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -24,13 +24,13 @@ function namedtupleiterator(x) r = rows(x) sch = schema(r) st = iterate(r) - if st === nothing - # input was empty - return NamedTupleIterator{Schema((), ()), typeof(r), typeof(st)}(r, st) - end - row, state = st if sch === nothing - s = Schema(columnnames(row), nothing) + if st !== nothing + row, state = st + s = Schema(columnnames(row), nothing) + else + s = Schema((), ()) + end else s = sch end diff --git a/test/runtests.jl b/test/runtests.jl index 4d58d38..8dbdc18 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -151,13 +151,15 @@ end @test tt.a[2] === 2.0 @test tt.a[3] === 3.0 - nti = Tables.NamedTupleIterator{Nothing, typeof(rt)}(rt) - @test Base.IteratorEltype(typeof(nti)) == Base.EltypeUnknown() + sch = Tables.Schema((:a, :b, :c), nothing) + st = iterate(rt) + nti = Tables.NamedTupleIterator{typeof(sch), typeof(rt), typeof(st)}(rt, st) + @test Base.IteratorEltype(typeof(nti)) == Base.HasEltype() @test Base.IteratorSize(typeof(nti)) == Base.HasShape{1}() @test length(nti) == 3 nti2 = collect(nti) @test isequal(rt, nti2) - nti = Tables.NamedTupleIterator{typeof(Tables.Schema((:a, :b, :c), (Union{Int, Float64}, Union{Float64, Missing}, String))), typeof(rt)}(rt) + nti = Tables.NamedTupleIterator{typeof(Tables.Schema((:a, :b, :c), (Union{Int, Float64}, Union{Float64, Missing}, String))), typeof(rt), typeof(st)}(rt, st) @test eltype(typeof(nti)) == NamedTuple{(:a, :b, :c),Tuple{Union{Float64, Int},Union{Missing, Float64},String}} # test really wide tables From 6e18a29d74a58abff4ef491019a13ae156579373 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Thu, 6 Feb 2020 14:58:35 -0700 Subject: [PATCH 08/15] Fix tests --- src/namedtuples.jl | 67 ++++++++++++++--------------------------- src/tofromdatavalues.jl | 17 
++++++----- 2 files changed, 31 insertions(+), 53 deletions(-) diff --git a/src/namedtuples.jl b/src/namedtuples.jl index af13754..51fded7 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -10,9 +10,8 @@ schema(x::AbstractVector{NamedTuple{names, types}}) where {names, types} = Schem materializer(x::RowTable) = rowtable # struct to transform `Row`s into NamedTuples -struct NamedTupleIterator{schema, T, S} +struct NamedTupleIterator{schema, T} x::T - st::S end """ @@ -23,72 +22,50 @@ Pass any table input source and return a `NamedTuple` iterator function namedtupleiterator(x) r = rows(x) sch = schema(r) - st = iterate(r) - if sch === nothing - if st !== nothing - row, state = st - s = Schema(columnnames(row), nothing) - else - s = Schema((), ()) - end - else - s = sch - end - return NamedTupleIterator{typeof(s), typeof(r), typeof(st)}(r, st) + return NamedTupleIterator{typeof(sch), typeof(r)}(r) end namedtupleiterator(::Type{T}, x) where {T <: NamedTuple} = x namedtupleiterator(T, x) = namedtupleiterator(x) -Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T, S}}) where {names, types, T, S} = Base.HasEltype() -Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T, S}}) where {names, types, T, S} = types === nothing ? 
NamedTuple{Base.map(Symbol, names)} : NamedTuple{Base.map(Symbol, names), types} -Base.IteratorSize(::Type{NamedTupleIterator{sch, T, S}}) where {sch, T, S} = Base.IteratorSize(T) +Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = Base.HasEltype() +Base.IteratorEltype(::Type{NamedTupleIterator{nothing, T}}) where {T} = Base.EltypeUnknown() +Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{Base.map(Symbol, names), types} +Base.IteratorSize(::Type{NamedTupleIterator{sch, T}}) where {sch, T} = Base.IteratorSize(T) Base.length(nt::NamedTupleIterator) = length(nt.x) Base.size(nt::NamedTupleIterator) = (length(nt.x),) -@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, T}, T1, T2}) where {names, T, T1, T2} - if @generated - vals = Tuple(:(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) - return quote - rows.st === nothing && return nothing - row, st = rows.st - return $(NamedTuple{Base.map(Symbol, names), T})(($(vals...),)), st - end - else - rows.st === nothing && return nothing - row, st = rows.st - return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), st - end -end - -@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, T}}, st) where {names, T} +@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, T}}, st=()) where {names, T} if @generated - vals = Tuple(:(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) + vals = Any[ :(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T) ] + ret = Expr(:new, :(NamedTuple{names, T}), vals...) return quote - x = iterate(rows.x, st) + x = iterate(rows.x, st...) 
x === nothing && return nothing row, st = x - return $(NamedTuple{Base.map(Symbol, names), T})(($(vals...),)), st + return $ret, (st,) end else - x = iterate(rows.x, st) + x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), st + return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,) end end -@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, nothing}}) where {names} - rows.st === nothing && return nothing - row, st = rows.st - return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), st +function Base.iterate(rows::NamedTupleIterator{nothing}) + x = iterate(rows.x, st...) + x === nothing && return nothing + row, st = x + names = Tuple(columnnames(row)) + return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), (Val(names), (st,)) end -@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, nothing}}, st) where {names} - x = iterate(rows.x, st) +function Base.iterate(rows::NamedTupleIterator{nothing}, state::Tuple{Val{names}, T}) where {names, T} + x = iterate(rows.x, state[2]...) 
x === nothing && return nothing row, st = x - return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), st + return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), (Val(names), (st,)) end # sink function diff --git a/src/tofromdatavalues.jl b/src/tofromdatavalues.jl index b380e09..bc29ec4 100644 --- a/src/tofromdatavalues.jl +++ b/src/tofromdatavalues.jl @@ -74,7 +74,7 @@ getcolumn(r::IteratorRow, i::Int) = undatavalue(getcolumn(getrow(r), i)) columnnames(r::IteratorRow) = columnnames(getrow(r)) # DataValueRowIterator wraps a Row iterator and will wrap `Union{T, Missing}` typed fields in DataValues -struct DataValueRowIterator{NT, S} +struct DataValueRowIterator{NT, sch, S} x::S end @@ -94,22 +94,23 @@ function datavaluerows(x) r = Tables.rows(x) s = Tables.schema(r) s === nothing && error("Schemaless sources cannot be passed to datavaluerows.") - return DataValueRowIterator{datavaluenamedtuple(s), typeof(r)}(r) + return DataValueRowIterator{datavaluenamedtuple(s), typeof(s), typeof(r)}(r) end -Base.eltype(rows::DataValueRowIterator{NT, S}) where {NT, S} = NT -Base.IteratorSize(::Type{DataValueRowIterator{NT, S}}) where {NT, S} = Base.IteratorSize(S) +Base.eltype(rows::DataValueRowIterator{NT}) where {NT} = NT +Base.IteratorSize(::Type{DataValueRowIterator{NT, sch, S}}) where {NT, sch, S} = Base.IteratorSize(S) Base.length(rows::DataValueRowIterator) = length(rows.x) Base.size(rows::DataValueRowIterator) = size(rows.x) -function Base.iterate(rows::DataValueRowIterator{NT, S}, st=()) where {NT <: NamedTuple{names}, S} where {names} +function Base.iterate(rows::DataValueRowIterator{NamedTuple{names, dtypes}, Schema{names, rtypes}, S}, st=()) where {names, dtypes, rtypes, S} if @generated - vals = Tuple(:(convert($(fieldtype(NT, i)), getcolumn(row, $(nondv(fieldtype(NT, i))), $i, $(Meta.QuoteNode(names[i]))))) for i = 1:fieldcount(NT)) + vals = Any[ :(convert($(fieldtype(dtypes, i)), getcolumn(row, $(fieldtype(rtypes, i)), $i, 
$(Meta.QuoteNode(names[i]))))) for i = 1:length(names) ] + ret = Expr(:new, :(NamedTuple{names, dtypes}), vals...) q = quote x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - return $NT(($(vals...),)), (st,) + return $ret, (st,) end # @show q return q @@ -117,6 +118,6 @@ function Base.iterate(rows::DataValueRowIterator{NT, S}, st=()) where {NT <: Nam x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - return NT(Tuple(convert(fieldtype(NT, i), getcolumn(row, nondv(fieldtype(NT, i)), i, names[i])) for i = 1:fieldcount(NT))), (st,) + return NamedTuple{names, dtypes}(Tuple(convert(fieldtype(dtypes, i), getcolumn(row, fieldtype(rtypes, i), i, names[i])) for i = 1:length(names))), (st,) end end From 4611bb39c078070f6e92da182fbfaeb9327d1dae Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Thu, 6 Feb 2020 15:05:45 -0700 Subject: [PATCH 09/15] Fix test --- src/namedtuples.jl | 2 +- test/runtests.jl | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 51fded7..da576db 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -54,7 +54,7 @@ Base.size(nt::NamedTupleIterator) = (length(nt.x),) end function Base.iterate(rows::NamedTupleIterator{nothing}) - x = iterate(rows.x, st...) 
+ x = iterate(rows.x) x === nothing && return nothing row, st = x names = Tuple(columnnames(row)) diff --git a/test/runtests.jl b/test/runtests.jl index 8dbdc18..cbaa98f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -151,15 +151,13 @@ end @test tt.a[2] === 2.0 @test tt.a[3] === 3.0 - sch = Tables.Schema((:a, :b, :c), nothing) - st = iterate(rt) - nti = Tables.NamedTupleIterator{typeof(sch), typeof(rt), typeof(st)}(rt, st) - @test Base.IteratorEltype(typeof(nti)) == Base.HasEltype() + nti = Tables.NamedTupleIterator{nothing, typeof(rt)}(rt) + @test Base.IteratorEltype(typeof(nti)) == Base.EltypeUnknown() @test Base.IteratorSize(typeof(nti)) == Base.HasShape{1}() @test length(nti) == 3 nti2 = collect(nti) @test isequal(rt, nti2) - nti = Tables.NamedTupleIterator{typeof(Tables.Schema((:a, :b, :c), (Union{Int, Float64}, Union{Float64, Missing}, String))), typeof(rt), typeof(st)}(rt, st) + nti = Tables.NamedTupleIterator{typeof(Tables.Schema((:a, :b, :c), (Union{Int, Float64}, Union{Float64, Missing}, String))), typeof(rt)}(rt) @test eltype(typeof(nti)) == NamedTuple{(:a, :b, :c),Tuple{Union{Float64, Int},Union{Missing, Float64},String}} # test really wide tables From cd93aabf41c48b5d7c990f2de83af1dd60417135 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Thu, 6 Feb 2020 15:19:02 -0700 Subject: [PATCH 10/15] Fix tests --- src/namedtuples.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/namedtuples.jl b/src/namedtuples.jl index da576db..a227cb6 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -29,7 +29,7 @@ namedtupleiterator(::Type{T}, x) where {T <: NamedTuple} = x namedtupleiterator(T, x) = namedtupleiterator(x) Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = Base.HasEltype() -Base.IteratorEltype(::Type{NamedTupleIterator{nothing, T}}) where {T} = Base.EltypeUnknown() +Base.IteratorEltype(::Type{NamedTupleIterator{Nothing, T}}) where {T} = Base.EltypeUnknown() 
Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{Base.map(Symbol, names), types} Base.IteratorSize(::Type{NamedTupleIterator{sch, T}}) where {sch, T} = Base.IteratorSize(T) Base.length(nt::NamedTupleIterator) = length(nt.x) @@ -53,7 +53,7 @@ Base.size(nt::NamedTupleIterator) = (length(nt.x),) end end -function Base.iterate(rows::NamedTupleIterator{nothing}) +function Base.iterate(rows::NamedTupleIterator{Nothing}) x = iterate(rows.x) x === nothing && return nothing row, st = x @@ -61,7 +61,7 @@ function Base.iterate(rows::NamedTupleIterator{nothing}) return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), (Val(names), (st,)) end -function Base.iterate(rows::NamedTupleIterator{nothing}, state::Tuple{Val{names}, T}) where {names, T} +function Base.iterate(rows::NamedTupleIterator{Nothing}, state::Tuple{Val{names}, T}) where {names, T} x = iterate(rows.x, state[2]...) x === nothing && return nothing row, st = x From d3bf9d5b342a67fc2ee1f55329c05142e706ed82 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Thu, 6 Feb 2020 15:55:08 -0700 Subject: [PATCH 11/15] Actually fix tests --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index cbaa98f..4d58d38 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -151,7 +151,7 @@ end @test tt.a[2] === 2.0 @test tt.a[3] === 3.0 - nti = Tables.NamedTupleIterator{nothing, typeof(rt)}(rt) + nti = Tables.NamedTupleIterator{Nothing, typeof(rt)}(rt) @test Base.IteratorEltype(typeof(nti)) == Base.EltypeUnknown() @test Base.IteratorSize(typeof(nti)) == Base.HasShape{1}() @test length(nti) == 3 From dc438ef634b637291310fe17d22b3b225437717c Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Thu, 6 Feb 2020 21:39:29 -0700 Subject: [PATCH 12/15] Test new functionality and clean things up --- src/Tables.jl | 40 +++++++++++++- src/operations.jl | 6 +-- test/runtests.jl | 130 
++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 5 deletions(-) diff --git a/src/Tables.jl b/src/Tables.jl index 02132fe..1f8d606 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -140,11 +140,33 @@ function columnnames end columnnames(x) = propertynames(x) -# default definitions for AbstractDict +# default definitions for AbstractDict to act as Row getcolumn(x::AbstractDict, i::Int) = x[i] getcolumn(x::AbstractDict, nm::Symbol) = x[nm] +getcolumn(x::AbstractDict, ::Type{T}, i::Int, nm::Symbol) where {T} = x[nm] columnnames(x::AbstractDict) = collect(keys(x)) +# Dict iterator as Rows +const DictRows = AbstractVector{T} where {T <: AbstractDict} +istable(::Type{<:DictRows}) = true +rowaccess(::Type{<:DictRows}) = true +rows(x::DictRows) = x +# DictRows doesn't naturally lend itself to the `Tables.schema` requirement +# we can't just look at the first row, because the types might change, +# row-to-row (e.g. `missing`, then `1.1`, etc.). Therefore, the safest option +# is to just return `nothing` +schema(x::DictRows) = nothing + +# and as Columns +const DictColumns = AbstractDict{K, V} where {K <: Union{Integer, Symbol, String}, V <: AbstractVector} +istable(::Type{<:DictColumns}) = true +columnaccess(::Type{<:AbstractDict}) = true +columns(x::DictColumns) = x +schema(x::DictColumns) = Schema(collect(keys(x)), eltype.(values(x))) + +# for other AbstractDict, let's throw an informative error +columns(x::T) where {T <: AbstractDict} = error("to treat $T as a table, it must have a key type of `Integer`, `Symbol`, or `String`, and a value type `<: AbstractVector`") + # default definitions for AbstractRow, AbstractColumns const RorC = Union{AbstractRow, AbstractColumns} @@ -159,7 +181,8 @@ Base.getproperty(r::RorC, i::Int) = getcolumn(r, i) Base.propertynames(r::RorC) = columnnames(r) Base.keys(r::RorC) = columnnames(r) Base.values(r::RorC) = collect(r) -Base.haskey(r::RorC, key::Union{Integer, Symbol}) = key in columnnames(r) 
+Base.haskey(r::RorC, key::Symbol) = key in columnnames(r) +Base.haskey(r::RorC, i::Int) = 0 < i <= length(columnnames(r)) Base.get(r::RorC, key::Union{Integer, Symbol}, default) = haskey(r, key) ? getcolumn(r, key) : default Base.get(f::Base.Callable, r::RorC, key::Union{Integer, Symbol}) = haskey(r, key) ? getcolumn(r, key) : f() Base.iterate(r::RorC, i=1) = i > length(r) ? nothing : (getcolumn(r, i), i + 1) @@ -171,6 +194,19 @@ function Base.show(io::IO, x::T) where {T <: RorC} Base.print_matrix(io, hcat(names, values)) end +# AbstractRow AbstractVector as Rows +const AbstractRowTable = AbstractVector{T} where {T <: AbstractRow} +istable(::Type{<:AbstractRowTable}) = true +rowaccess(::Type{<:AbstractRowTable}) = true +rows(x::AbstractRowTable) = x +schema(x::AbstractRowTable) = nothing + +# AbstractColumns as Columns +istable(::Type{<:AbstractColumns}) = true +columnaccess(::Type{<:AbstractColumns}) = true +columns(x::AbstractColumns) = x +schema(x::AbstractColumns) = nothing + # default definitions """ Tables.istable(x) => Bool diff --git a/src/operations.jl b/src/operations.jl index 62bdbe1..63b6d76 100644 --- a/src/operations.jl +++ b/src/operations.jl @@ -10,7 +10,7 @@ getcolumn(row::TransformsRow, nm::Symbol) = (getfunc(row, getfuncs(row), nm))(ge getcolumn(row::TransformsRow, i::Int) = (getfunc(row, getfuncs(row), i))(getcolumn(getrow(row), i)) columnnames(row::TransformsRow) = columnnames(getrow(row)) -struct Transforms{C, T, F} <: AbstractColumns +struct Transforms{C, T, F} source::T funcs::F # NamedTuple of columnname=>transform function end @@ -42,7 +42,7 @@ getfunc(row, d::Dict{String, <:Base.Callable}, nm::Symbol) = get(d, String(nm), getfunc(row, d::Dict{Symbol, <:Base.Callable}, nm::Symbol) = get(d, nm, identity) getfunc(row, d::Dict{Int, <:Base.Callable}, nm::Symbol) = get(d, findfirst(isequal(nm), columnnames(row)), identity) -getfunc(row, nt::NamedTuple, i::Int) = i > fieldcount(typeof(nt)) ? 
identity : getfield(nt, i) +getfunc(row, nt::NamedTuple, i::Int) = get(nt, columnnames(row)[i], identity) getfunc(row, d::Dict{String, <:Base.Callable}, i::Int) = get(d, String(columnnames(row)[i]), identity) getfunc(row, d::Dict{Symbol, <:Base.Callable}, i::Int) = get(d, columnnames(row)[i], identity) getfunc(row, d::Dict{Int, <:Base.Callable}, i::Int) = get(d, i, identity) @@ -68,7 +68,7 @@ Base.eltype(t::Transforms{false, T, F}) where {T, F} = TransformsRow{eltype(getf end # select -struct Select{T, columnaccess, names} <: AbstractColumns +struct Select{T, columnaccess, names} source::T end diff --git a/test/runtests.jl b/test/runtests.jl index 4d58d38..a7ef6d5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,6 +2,8 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx @testset "utils.jl" begin + @test getproperty((1, 2), 1) == 1 + NT = NamedTuple{(), Tuple{}} @test Tables.names(NT) === () @test Tables.types(NT) === Tuple{} @@ -72,6 +74,8 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx @test Tables.schema(rows) == Tables.Schema((:a, :b), (Int, Int)) row = first(rows) @test row.a == 1 + @test Tables.getcolumn(row, :a) == 1 + @test Tables.getcolumn(row, 1) == 1 @test Tables.istable(rows) @test Tables.rowaccess(rows) @test Tables.rows(rows) === rows @@ -93,6 +97,8 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx c = Tables.CopiedColumns(nt) @test Tables.columns(c) === c @test Tables.materializer(c) == Tables.materializer(nt) + @test Tables.getcolumn(c, :a) == [1,2,3] + @test Tables.getcolumn(c, 1) == [1,2,3] @test_throws ArgumentError Tables.columntable([1,2,3]) @@ -229,9 +235,13 @@ end @test Tables.columnaccess(typeof(mattbl)) @test Tables.columns(mattbl) === mattbl @test mattbl.Column1 == [1,2,3] + @test Tables.getcolumn(mattbl, :Column1) == [1,2,3] + @test Tables.getcolumn(mattbl, 1) == [1,2,3] matrow = first(mattbl) @test eltype(mattbl) == 
typeof(matrow) @test matrow.Column1 == 1 + @test Tables.getcolumn(matrow, :Column1) == 1 + @test Tables.getcolumn(matrow, 1) == 1 @test propertynames(mattbl) == propertynames(matrow) == [:Column1, :Column2, :Column3] end @@ -370,6 +380,8 @@ tran = ctable |> Tables.transform(C=Symbol) @test Tables.columns(tran) === tran @test IteratorInterfaceExtensions.isiterable(tran) @test typeof(IteratorInterfaceExtensions.getiterator(tran)) <: Tables.DataValueRowIterator +@test isequal(Tables.getcolumn(tran, :A), [1,missing,3]) +@test isequal(Tables.getcolumn(tran, 1), [1,missing,3]) tran2 = rtable |> Tables.transform(C=Symbol) @test Tables.istable(typeof(tran2)) @@ -383,6 +395,8 @@ trow = first(tran2) @test trow.A === 1 @test trow.B === 1.0 @test trow.C == :hey +@test Tables.getcolumn(trow, 1) == 1 +@test Tables.getcolumn(trow, :A) == 1 ctable2 = Tables.columntable(tran2) @test isequal(ctable2.A, ctable.A) @test ctable2.C == map(Symbol, ctable.C) @@ -453,6 +467,8 @@ sel = Tables.select(ctable) @test Tables.columnaccess(typeof(sel)) @test Tables.columns(sel) === sel @test propertynames(sel) == () +@test isequal(Tables.getcolumn(sel, 1), [1, missing, 3]) +@test isequal(Tables.getcolumn(sel, :A), [1, missing, 3]) @test Tables.columntable(sel) == NamedTuple() @test Tables.rowtable(sel) == NamedTuple{(), Tuple{}}[] @@ -512,6 +528,8 @@ sel = rtable |> Tables.select(1) @test isequal(Tables.rowtable(sel), [(A=1,), (A=missing,), (A=3,)]) srow = first(sel) @test propertynames(srow) == (:A,) +@test Tables.getcolumn(srow, 1) == 1 +@test Tables.getcolumn(srow, :A) == 1 table = ctable |> Tables.select(:A) |> Tables.columntable @test length(table) == 1 @@ -619,3 +637,115 @@ end # DataValue{Any} @test isequal(Tables.columntable(Tables.nondatavaluerows([(a=DataValue{Any}(), b=DataValue{Int}())])), (a = Any[missing], b = Union{Missing, Int64}[missing])) end + +@testset "AbstractDict" begin + + d = Dict(:a => 1, :b => missing, :c => "7") + n = (a=1, b=missing, c="7") + drt = [d, d, d] + rt = [n, 
n, n] + dct = Dict(:a => [1, 1, 1], :b => [missing, missing, missing], :c => ["7", "7", "7"]) + ct = (a = [1, 1, 1], b = [missing, missing, missing], c = ["7", "7", "7"]) + @test Tables.istable(drt) + @test Tables.rowaccess(drt) + @test Tables.rows(drt) === drt + @test Tables.schema(drt) === nothing + @test isequal(Tables.rowtable(drt), rt) + @test isequal(Tables.columntable(drt), ct) + + @test Tables.istable(dct) + @test Tables.columnaccess(dct) + @test Tables.columns(dct) === dct + @test Tables.schema(dct) == Tables.Schema((:a, :b, :c), Tuple{Int, Missing, String}) + @test isequal(Tables.rowtable(dct), rt) + @test isequal(Tables.columntable(dct), ct) + + # a Dict w/ scalar values isn't a table + @test_throws Exception Tables.columns(d) + @test_throws Exception Tables.rows(d) +end + +struct Row <: Tables.AbstractRow + a::Int + b::Union{Float64, Missing} + c::String +end + +Tables.getcolumn(r::Row, i::Int) = getfield(r, i) +Tables.getcolumn(r::Row, nm::Symbol) = getfield(r, nm) +Tables.getcolumn(r::Row, ::Type{T}, i::Int, nm::Symbol) where {T} = getfield(r, i) +Tables.columnnames(r::Row) = fieldnames(Row) + +@testset "AbstractRow" begin + + row = Row(1, missing, "hey") + row2 = Row(2, 3.14, "ho") + + @test Base.IteratorSize(typeof(row)) == Base.HasLength() + @test length(row) == 3 + @test firstindex(row) == 1 + @test lastindex(row) == 3 + @test isequal((row[1], row[2], row[3]), (1, missing, "hey")) + @test isequal((row[:a], row[:b], row[:c]), (1, missing, "hey")) + @test isequal((row.a, row.b, row.c), (1, missing, "hey")) + @test isequal((getproperty(row, 1), getproperty(row, 2), getproperty(row, 3)), (1, missing, "hey")) + @test propertynames(row) == (:a, :b, :c) + @test keys(row) == (:a, :b, :c) + @test isequal(values(row), [1, missing, "hey"]) + @test haskey(row, :a) + @test haskey(row, 1) + @test get(row, 1, 0) == get(row, :a, 0) == 1 + @test get(() -> 0, row, 1) == get(() -> 0, row, :a) == 1 + @test isequal(collect(row), [1, missing, "hey"]) + show(row) + + 
art = [row, row2] + ct = (a=[1, 2], b=[missing, 3.14], c=["hey", "ho"]) + @test Tables.istable(art) + @test Tables.rowaccess(art) + @test Tables.rows(art) === art + @test Tables.schema(art) === nothing + @test isequal(Tables.columntable(art), ct) + +end + +struct Columns <: Tables.AbstractColumns + a::Vector{Int} + b::Vector{Union{Float64, Missing}} + c::Vector{String} +end + +Tables.getcolumn(r::Columns, i::Int) = getfield(r, i) +Tables.getcolumn(r::Columns, nm::Symbol) = getfield(r, nm) +Tables.getcolumn(r::Columns, ::Type{T}, i::Int, nm::Symbol) where {T} = getfield(r, i) +Tables.columnnames(r::Columns) = fieldnames(Columns) + +@testset "AbstractColumns" begin + + col = Columns([1, 2], [missing, 3.14], ["hey", "ho"]) + + @test Base.IteratorSize(typeof(col)) == Base.HasLength() + @test length(col) == 3 + @test firstindex(col) == 1 + @test lastindex(col) == 3 + @test isequal((col[1], col[2], col[3]), ([1,2], [missing,3.14], ["hey","ho"])) + @test isequal((col[:a], col[:b], col[:c]), ([1,2], [missing,3.14], ["hey","ho"])) + @test isequal((col.a, col.b, col.c), ([1,2], [missing,3.14], ["hey","ho"])) + @test isequal((getproperty(col, 1), getproperty(col, 2), getproperty(col, 3)), ([1,2], [missing,3.14], ["hey","ho"])) + @test propertynames(col) == (:a, :b, :c) + @test keys(col) == (:a, :b, :c) + @test isequal(values(col), [[1,2], [missing,3.14], ["hey","ho"]]) + @test haskey(col, :a) + @test haskey(col, 1) + @test get(col, 1, 0) == get(col, :a, 0) == [1,2] + @test get(() -> 0, col, 1) == get(() -> 0, col, :a) == [1,2] + @test isequal(collect(col), [[1,2], [missing,3.14], ["hey","ho"]]) + show(col) + + ct = (a=[1, 2], b=[missing, 3.14], c=["hey", "ho"]) + @test Tables.istable(col) + @test Tables.columnaccess(col) + @test Tables.columns(col) === col + @test Tables.schema(col) === nothing + @test isequal(Tables.columntable(col), ct) +end From 3c549d4970c2dab879f1ec32124cdbc6fad6a986 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Thu, 6 Feb 2020 21:43:48 -0700 Subject: 
[PATCH 13/15] Fix tests --- src/operations.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/operations.jl b/src/operations.jl index 63b6d76..9d794fb 100644 --- a/src/operations.jl +++ b/src/operations.jl @@ -18,6 +18,9 @@ end columnnames(t::Transforms{true}) = columnnames(getfield(t, 1)) getcolumn(t::Transforms{true}, nm::Symbol) = Base.map(getfunc(t, getfield(t, 2), nm), getcolumn(getfield(t, 1), nm)) getcolumn(t::Transforms{true}, i::Int) = Base.map(getfunc(t, getfield(t, 2), i), getcolumn(getfield(t, 1), i)) +# for backwards compat +Base.propertynames(t::Transforms{true}) = columnnames(t) +Base.getproperty(t::Transforms{true}, nm::Symbol) = getcolumn(t, nm) """ Tables.transform(source, funcs) => Tables.Transforms @@ -124,6 +127,9 @@ getcolumn(s::Select{T, true, names}, i::Int) where {T, names} = getcolumn(getfie columnnames(s::Select{T, true, names}) where {T, names} = namesubset(columnnames(getfield(s, 1)), names) columnaccess(::Type{Select{T, C, names}}) where {T, C, names} = C columns(s::Select{T, true, names}) where {T, names} = s +# for backwards compat +Base.propertynames(s::Select{T, true, names}) where {T, names} = columnnames(s) +Base.getproperty(s::Select{T, true, names}, nm::Symbol) where {T, names} = getcolumn(s, nm) # rows: implement Iterator interface Base.IteratorSize(::Type{Select{T, false, names}}) where {T, names} = Base.IteratorSize(T) From f32bd2e996417eacd0a4d0cd17467f2eefe72445 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Fri, 7 Feb 2020 00:29:56 -0700 Subject: [PATCH 14/15] Remove appending functions for row/column tables --- src/namedtuples.jl | 29 ----------------------------- test/runtests.jl | 8 -------- 2 files changed, 37 deletions(-) diff --git a/src/namedtuples.jl b/src/namedtuples.jl index a227cb6..7d44b3d 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -71,16 +71,11 @@ end # sink function """ Tables.rowtable(x) => Vector{NamedTuple} - Tables.rowtable(rt, x) => rt Take any input table source, and 
produce a `Vector` of `NamedTuple`s, also known as a "row table". A "row table" is a kind of default table type of sorts, since it satisfies the Tables.jl row interface naturally. - -The 2nd definition takes -an existing row table and appends the input table source `x` -to the existing row table. """ function rowtable end @@ -89,11 +84,6 @@ function rowtable(itr::T) where {T} return collect(namedtupleiterator(eltype(r), r)) end -function rowtable(rt::RowTable, itr::T) where {T} - r = rows(itr) - return append!(rt, namedtupleiterator(eltype(r), r)) -end - # NamedTuple of arrays of matching dimensionality const ColumnTable = NamedTuple{names, T} where {names, T <: NTuple{N, AbstractArray{S, D} where S}} where {N, D} rowcount(c::ColumnTable) = length(c) == 0 ? 0 : length(c[1]) @@ -117,15 +107,11 @@ getarray(x) = collect(x) """ Tables.columntable(x) => NamedTuple of Vectors - Tables.columntable(ct, x) => ct Takes any input table source `x` and returns a `NamedTuple` of `Vector`s, also known as a "column table". A "column table" is a kind of default table type of sorts, since it satisfies the Tables.jl column interface naturally. - -The 2nd definition takes an input table source `x` and appends it to an -existing column table `ct`. """ function columntable end @@ -147,18 +133,3 @@ function columntable(itr::T) where {T} return columntable(schema(cols), cols) end columntable(x::ColumnTable) = x - -function ctappend(ct1::NamedTuple{N1, T1}, ct2::NamedTuple{N2, T2}) where {N1, T1, N2, T2} - if @generated - appends = Expr(:block, Any[:(append!(ct1[$(quot(nm))], ct2[$(quot(nm))])) for nm in N1]...) 
- return quote - $appends - return ct1 - end - else - foreach(nm->append!(ct1[nm], ct2[nm]), N1) - return ct1 - end -end - -columntable(ct::ColumnTable, itr) = ctappend(ct, columntable(itr)) diff --git a/test/runtests.jl b/test/runtests.jl index a7ef6d5..2a49a7f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -140,14 +140,6 @@ end @test Tables.columntable(rtf) == nt @test Tables.buildcolumns(nothing, rtf) == nt - # append - nt2 = columntable(nt, rt) - @test Tables.rowcount(nt2) == 6 - @test Tables.schema(nt2) == Tables.Schema((:a, :b, :c), Tuple{Int, Float64, String}) - @test nt2 == (a = [1, 2, 3, 1, 2, 3], b = [4.0, 5.0, 6.0, 4.0, 5.0, 6.0], c = ["7", "8", "9", "7", "8", "9"]) - rt2 = rowtable(rt, nt) - @test length(rt2) == 9 - rt = [(a=1, b=4.0, c="7"), (a=2.0, b=missing, c="8"), (a=3, b=6.0, c="9")] @test Tables.istable(typeof(rt)) @test Tables.rowaccess(typeof(rt)) From 8bdd32708d3f4a7d26b5b0f9c6b5dfd6273571b0 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Fri, 7 Feb 2020 07:00:46 -0700 Subject: [PATCH 15/15] Update manual with implementation walk-through --- docs/src/index.md | 117 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/docs/src/index.md b/docs/src/index.md index fc0b84a..5305e4a 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -201,3 +201,120 @@ Tables.columntype ## Implementing the Interface (i.e. becoming a Tables.jl source) +Now that we've seen how one _uses_ the Tables.jl interface, let's walk-through how to implement it; i.e. how can I +make my custom type valid for Tables.jl consumers? 
+ +The interface to becoming a proper table is straightforward: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.istable(table)` | | Declare that your table type implements the interface | +| One of: | | | +| `Tables.rowaccess(table)` | | Declare that your table type defines a `Tables.rows(table)` method | +| `Tables.rows(table)` | | Return a `Row` iterator from your table | +| Or: | | | +| `Tables.columnaccess(table)` | | Declare that your table type defines a `Tables.columns(table)` method | +| `Tables.columns(table)` | | Return a `Columns`-compatible object from your table | +| Optional methods | | | +| `Tables.schema(x)` | `Tables.schema(x) = nothing` | Return a `Tables.Schema` object from your `Row` iterator or `Columns` object; or `nothing` for unknown schema | +| `Tables.materializer(table)` | `Tables.columntable` | Declare a "materializer" sink function for your table type that can construct an instance of your type from any Tables.jl input | + +Based on whether your table type has defined `Tables.rows` or `Tables.columns`, you then ensure that the `Row` iterator +or `Columns` object satisfies the respective interface: +```@docs +Tables.Row +Tables.Columns +``` + +Though the strict requirements for `Row` and `Columns` are minimal (just `getcolumn` and `columnnames`), you may desire +additional behavior for your row or columns types (and you're implementing them yourself). For convenience, Tables.jl +defines the `Tables.AbstractRow` and `Tables.AbstractColumns` abstract types, to allow subtyped custom types to +inherit convenient behavior, such as indexing, iteration, and property access, all defined in terms of `getcolumn` and `columnnames`. +```@docs +Tables.AbstractRow +Tables.AbstractColumns +``` + +As an extended example, let's take a look at some code defined in Tables.jl for treating `AbstractMatrix`s as tables. 
+ +First, we define a special `MatrixTable` type that will wrap an `AbstractMatrix`, and allow easy overloading for the +Tables.jl interface. +```julia +struct MatrixTable{T <: AbstractMatrix} <: Tables.AbstractColumns + names::Vector{Symbol} + lookup::Dict{Symbol, Int} + matrix::T +end +# declare that MatrixTable is a table +Tables.istable(::Type{<:MatrixTable}) = true +# getter method on stored column names +names(m::MatrixTable) = getfield(m, :names) +# schema is column names and types +Tables.schema(m::MatrixTable{T}) where {T} = Tables.Schema(names(m), fill(eltype(T), size(getfield(m, :matrix), 2))) +``` +Here we defined `Tables.istable` for all `MatrixTable` types, signaling that my type implements the Tables.jl interfaces. +We also defined `Tables.schema` by pulling the column names out that we stored, and since `AbstractMatrix` have a single +`eltype`, we repeat it for each column. Note that defining `Tables.schema` is optional on tables; by default, `nothing` +is returned and Tables.jl consumers should account for both known and unknown schema cases. It tends to allow consumers +to have certain optimizations when they can know the types of all columns upfront (and if the # of columns isn't too large) +to generate more efficient code. + +Now, in this example, we're actually going to have `MatrixTable` implement _both_ `Tables.rows` and `Tables.columns` +methods itself, i.e. 
it's going to return itself from those functions, so here's first how we make our `MatrixTable` a +valid `Columns` object: +```julia +# column interface +Tables.columnaccess(::Type{<:MatrixTable}) = true +Tables.columns(m::MatrixTable) = m +# required Columns object methods +Tables.getcolumn(m::MatrixTable, ::Type{T}, col::Int, nm::Symbol) where {T} = getfield(m, :matrix)[:, col] +Tables.getcolumn(m::MatrixTable, nm::Symbol) = getfield(m, :matrix)[:, getfield(m, :lookup)[nm]] +Tables.getcolumn(m::MatrixTable, i::Int) = getfield(m, :matrix)[:, i] +Tables.columnnames(m::MatrixTable) = names(m) +``` +We define `columnaccess` for our type, then `columns` just returns the `MatrixTable` itself, and then we define +the three `getcolumn` methods and `columnnames`. Note the use of a `lookup` Dict that maps column name to column index +so we can figure out which column to return from the matrix. We're also storing the column names in our `names` field +so the `columnnames` implementation is trivial. And that's it! Literally! It can now be written out to a csv file, +stored in a sqlite or other database, converted to DataFrame or JuliaDB table, etc. Pretty fun. + +And now for the `Tables.rows` implementation: +```julia +# declare that any MatrixTable defines its own `Tables.rows` method +rowaccess(::Type{<:MatrixTable}) = true +# just return itself, which means MatrixTable must iterate `Row`-compatible objects +rows(m::MatrixTable) = m +# the iteration interface, at a minimum, requires `eltype`, `length`, and `iterate` +# for `MatrixTable` `eltype`, we're going to provide a custom row type +Base.eltype(m::MatrixTable{T}) where {T} = MatrixRow{T} +Base.length(m::MatrixTable) = size(getfield(m, :matrix), 1) + +Base.iterate(m::MatrixTable, st=1) = st > length(m) ? 
nothing : (MatrixRow(st, m), st + 1) + +# a custom Row type; acts as a "view" into a row of an AbstractMatrix +struct MatrixRow{T} <: Tables.AbstractRow + row::Int + source::MatrixTable{T} +end +# required `Row` interface methods (same as for `Columns` object before) +getcolumn(m::MatrixRow, ::Type, col::Int, nm::Symbol) = + getfield(getfield(m, :source), :matrix)[getfield(m, :row), col] +getcolumn(m::MatrixRow, i::Int) = + getfield(getfield(m, :source), :matrix)[getfield(m, :row), i] +getcolumn(m::MatrixRow, nm::Symbol) = + getfield(getfield(m, :source), :matrix)[getfield(m, :row), getfield(getfield(m, :source), :lookup)[nm]] +columnnames(m::MatrixRow) = names(getfield(m, :source)) +``` +Here we start by defining `Tables.rowaccess` and `Tables.rows`, and then the iteration interface methods, +since we declared that a `MatrixTable` itself is an iterator of `Row`-compatible objects. For `eltype`, +we say that a `MatrixTable` iterates our own custom row type, `MatrixRow`. `MatrixRow` subtypes +`Tables.AbstractRow`, which has the same required interface as a `Row` object, but also provides interface +implementations for several useful behaviors (indexing, iteration, property-access, etc.); essentially it +makes our custom `MatrixRow` type more convenient to work with. + +Implementing the `Row`/`Tables.AbstractRow` interface is straightforward, and very similar to our implementation +of `Columns` previously (i.e. the same methods for `getcolumn` and `columnnames`). + +And that's it. Our `MatrixTable` type is now a fully fledged, valid Tables.jl source and can be used throughout +the ecosystem. Now, this is obviously not a lot of code; but then again, the actual Tables.jl interface +implementations tend to be fairly simple, given the other behaviors that are already defined for table types +(i.e. table types tend to already have a `getcolumn`-like function defined).