From fb4a1257602d56856265d65c827e206df710251b Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Fri, 7 Feb 2020 20:59:15 -0700 Subject: [PATCH] Tables API enhancement (#131) * Tables API enhancement --- .travis.yml | 24 ++- README.md | 174 +------------------ appveyor.yml | 31 ---- docs/Project.toml | 5 + docs/make.jl | 17 ++ docs/src/index.md | 320 +++++++++++++++++++++++++++++++++++ src/Tables.jl | 359 ++++++++++++++++++++++++++++++++-------- src/fallbacks.jl | 85 +++++++--- src/matrix.jl | 29 ++-- src/namedtuples.jl | 110 +++++++----- src/operations.jl | 70 ++++++-- src/tofromdatavalues.jl | 71 ++++---- src/utils.jl | 62 ++++--- test/runtests.jl | 140 ++++++++++++++-- 14 files changed, 1056 insertions(+), 441 deletions(-) delete mode 100644 appveyor.yml create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/index.md diff --git a/.travis.yml b/.travis.yml index 604a594..21a7fe6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,32 @@ +# Documentation: http://docs.travis-ci.com/user/languages/julia/ language: julia os: - linux - osx + - windows +arch: + - x64 + - x86 julia: - 1.0 - - 1.1 + - 1.3 - nightly +matrix: + allow_failures: + - julia: nightly + fast_finish: true + exclude: + - os: osx + arch: x86 + include: + - stage: "Documentation" + julia: 1.3 + os: linux + script: + - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate(); Pkg.build("Tables")' + - julia --project=docs/ docs/make.jl + after_success: skip notifications: email: false after_success: - - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())' \ No newline at end of file + - julia -e 'ENV["TRAVIS_JULIA_VERSION"] == "1.3" && ENV["TRAVIS_OS_NAME"] != "linux" && exit(); using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())' \ No newline at end of file diff --git a/README.md b/README.md index af60135..d8722e7 100644 --- a/README.md +++ 
b/README.md @@ -3,175 +3,9 @@ [![Build Status](https://travis-ci.org/JuliaData/Tables.jl.svg?branch=master)](https://travis-ci.org/JuliaData/Tables.jl) [![Codecov](https://codecov.io/gh/JuliaData/Tables.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaData/Tables.jl) -The Tables.jl package provides simple, yet powerful interface functions for working with all kinds tabular data through predictable access patterns. At its core, it provides two simple functions for accessing a source table's data, regardless of its storage format or orientation: +The Tables.jl package provides simple, yet powerful interface functions for working with all kinds tabular data. -```julia - Tables.rows(table) => Rows - Tables.columns(table) => Columns -``` -These two functions return objects that satisfy the `Rows` or `Columns` interfaces: -* `Rows` is an iterator (i.e. implements `Base.iterate(x)`) of property-accessible objects (any type that supports `propertynames(row)` and `getproperty(row, nm::Symbol`) -* `Columns` is a property-accessible object of iterators (i.e. 
each column can be retrieved via `getproperty` and is an iterator) +### Documentation -So `Rows` is any object that can be used like: -```julia -for rows in table - for columnname in propertynames(row) - value = getproperty(row, columnname) - end -end -``` -And `Columns` is any object that can be used like: -```julia -for columnname in propertynames(table) - column = getproperty(table, columnname) -end -``` - -In addition to these `Rows` and `Columns` objects, it's useful to be able to query properties of these objects: -* `Tables.schema(x::Union{Rows, Columns}) => Union{Tables.Schema, Nothing}`: returns a `Tables.Schema` object, or `nothing` if the table's schema is unknown -* For the `Tables.Schema` object: - * column names can be accessed as an indexable collection of Symbols like `sch.names` - * column types can be accessed as an indexable collection of types like `sch.types` - * See `?Tables.Schema` for more details on this type -Because many table types are able to provide a well-defined schema, it can enable optimizations for consumers when this schema can be queried upfront before data access. - -A big part of the power in these simple interface functions is that each, `Tables.rows` ***and*** `Tables.columns`, is defined for any table type, even if the table type only explicitly implements one interface function or the other. -This is accomplished by providing performant, generic fallback definitions in Tables.jl itself (though obviously nothing prevents a table type from implementing each interface function directly). - -This means that table *authors* only need to worry about providing a single, most natural access pattern to their table type, whereas table *consumers* don't need to worry about the storage format or orientation of a table source, but can instead focus on the most natural *consumption* pattern for data access (row-by-row or on entire columns). 
- -With these simple definitions, powerful workflows are enabled: -* A package providing data cleansing, manipulation, visualization, or analysis can automatically handle any number of decoupled input table types -* A tabular file format can have automatic integration with in-memory structures and translation to other file formats -* table-like database objects can be queried, streaming the results direclty to various file formats or in-memory table structures - -# Tables Interface - -So how does one go about satisfying the Tables.jl interface functions? It mainly depends on what you've already defined and the natural access patterns of your table: - -## `Tables.istable`: - -* `Tables.istable(::Type{<:MyTable}) = true`: this provides an explicit affirmation that your type implements the Tables interface -* `Tables.istable(x::MyTable) = x.istable`: alternatively, it may be the case that `MyTable` can only implement that Tables interface in some cases, known only at runtime; in this case, we can define `Tables.istable` on an ***instance*** of `MyTable` instead of the type. 
For consumers, this function should always be called on ***instances*** (like `Tables.istable(x)`), to ensure input tables are appropriately supported - -## To support `Rows`: - -* Define `Tables.rowaccess(::Type{<:MyTable}) = true`: this signals that `MyTable` supports iterating objects that satisfy the `Row` interface; note this function isn't meant for public use, but is instead used by Tables.jl itself to provide a generic fallback definition for `Tables.columns` on row-oriented sources -* Define `Tables.rows(x::MyTable)`: return a `Row`-iterator object (perhaps the table itself if it already defines a `Base.iterate` method that returns `Row` interface objects) -* Define `Tables.schema(Tables.rows(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason - -## To support `Columns`: - -* Define `Tables.columnaccess(::Type{<:MyTable}) = true`: this signals that `MyTable` supports returning an object satisfying the `Columns` interface; note this function isn't meant for public use, but is instead used by Tables.jl itself to provide a generic fallback definition for `Tables.rows` on column-oriented sources -* Define `Tables.columns(x::MyTable)`: return an object satisfying the `Columns` interface, perhaps the table itself if it naturally supports property-access to columns -* Define `Tables.schema(Tables.columns(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason - -## Consuming table inputs (i.e. ***using*** the Tables.jl interface) - -As the author of `MyTable`, I'm ecstatic that `MyTable` can now automatically be used by a number of other "table" packages, but another question is how `MyTable` can be a "sink" for any other table type. In other words, how do I actually ***use*** the Tables.jl interface? - -The answer is mostly straightforward: just use the interface functions. 
A note does need to be made with regards to how interfaces currently operate in Julia; there's no support for "dispatching" on objects satisfying interfaces, which means I can't just define `MyTable(table::Tables.Table)`. What most packages do is define a constructor (or "sink function") that takes a single, un-typed argument like: - -```julia -function MyTable(x) - # Tables.istable(x) || throw(ArgumentError("input is not a table")) - rows = Tables.rows(x) - sch = Tables.schema(rows) - names = sch.names - types = sch.types - # custom constructor that creates an "empty" MyTable according to given column names & types - # note that the "unknown" schema case should be considered, i.e. when `Tables.schema(x) === nothing` - mytbl = MyTable(names, types) - for row in rows - # a convenience function provided in Tables.jl for "unrolling" access to each column/property of a `Row` - # it works by applying a provided function to each value; see `?Tables.eachcolumn` for more details - Tables.eachcolumn(sch, row) do val, columnindex::Int, columnname::Symbol - push!(mytbl[columnindex], val) - end - end - return mytbl -end -``` -In this example, `MyTable` defines a constructor that takes any tables input source, initializes an empty `MyTable`, and proceeds to iterate over the input rows, appending values to each column. Note that the function didn't do any validation on the input to check if it was a valid table: `Tables.rows(x)` will throw an error if `x` doesn't actually satisfy the Tables.jl interface. Alternatively, we could call `Tables.istable(x)` (as shown in the commented line at the start of the function) on the input before calling `Tables.rows(x)` if we needed to restrict things to known, valid Tables.jl. Note that doing this will prevent certain, valid table inputs from being consumed, due to their inability to confidently return `true` for `Tables.istable`, even at runtime (cases like `Generator`s, or `Vector{Any}`). 
In short, most package just call `Tables.rows`, allowing invalid source errors to be thrown while also accepting the maximum number of possible valid inputs. - -Alternatively, it may be more natural for `MyTable` to consume input data column-by-column, so my definition would be more like: -```julia -function MyTable(x) - cols = Tables.columns(x) - # here we use Tables.eachcolumn to iterate over each column in `cols`, which satisfies the `Columns` interface - return MyTable(collect(propertynames(cols)), [collect(col) for col in Tables.eachcolumn(cols)]) -end -``` - -Note that in neither case did we need to call `Tables.rowaccess` or `Tables.columnaccess`; those interface functions are only used internally by Tables.jl itself to provide the `Tables.rows` and `Tables.columns` fallback definitions. As a consumer, I only need to consider which of `Tables.rows` or `Tables.columns` better fits my use-case, knowing that if the input table isn't oriented naturally, the fallback definition will provide the access pattern I desire. Also note that in the column-oriented definition, we didn't even call `Tables.schema` since we just do a single iteration over each column. 
Also note that in the row-oriented case, we didn't account for the case when `Tables.schema(x) === nothing`; one way to support the unknown schema case is to do something like: -```julia -function MyTable(x) - rows = Tables.rows(x) - state = iterate(rows) - if state === nothing - # the input table was empty, so return an empty MyTable - return MyTable() - end - row, st = state - columnnames = propertynames(row) - # create a Tables.Schema manually w/ just the column names from the first row - sch = Tables.Schema(columnnames, nothing) - cols = length(columnnames) - # create an emtpy MyTable with just the expected column names - mytbl = MyTable(columnnames) - while state !== nothing - row, st = state - Tables.eachcolumn(sch, row) do val, columnindex::Int, columnname::Symbol - push!(mytbl[columnindex], val) - end - state = iterate(rows, st) - end - return mytbl -end -``` - -## Functions that input and output tables: - -For functions that input a table, perform some calculation, and output a new table, we need a way of constructing the preferred output table given the input. For this purpose, `Tables.materializer(table)` returns the preferred sink function for a table (`Tables.columntable`, which creates a named tuple of AbstractVectors, is the default). - -Note that an in-memory table with a properly defined "sink" function can reconstruct itself with the following: - -```julia -materializer(table)(Tables.columns(table)) - -materializer(table)(Tables.rows(table)) -``` - -For example, we may want to select a subset of columns from a column-access table. One way we could implement it is with the following: - -```julia -function select(table, cols::Symbol...) 
- nt = Tables.columntable(table) # columntable(t) creates a NamedTuple of AbstractVectors - newcols = NamedTuple{cols}(nt) - Tables.materializer(table)(newcols) -end - -# Example of selecting columns from a columntable -tbl = (x=1:100, y=rand(100), z=randn(100)) -select(tbl, :x) -select(tbl, :x, :z) - -tbl = [(x=1, y="a", z=1.0), (x=2, y="b", z=2.0)] -select(tbl, :z, :x) -``` - -## Utilities -A number of "helper" utility functions are provided to aid in working with the Tables.jl collection of interfaces: - -* `rowtable(x)`: takes any input that satisfies the Tables.jl interface and converts it to a `Vector` of `NamedTuple`s, which itself satisfies the Tables.jl interface -* `rowtable(rt, x)`: take a "row table" (`Vector` of `NamedTuples`) and any table input `x` and appends `x` to `rt` -* `columntable(x)`: takes any input that satisfies the Tables.jl interface and converts it to a `NamedTuple` of `AbstractVector`s, which itself satisfies the Tables.jl interface -* `columntable(ct, x)`: takes a "column table (`NamedTuple` of `AbstractVector`s) and a table input `x` and appends `x` to `ct` -* `Tables.datavaluerows(x)`: takes any table input `x` and returns an iterator that will replace `missing` values with `DataValue`-wrapped values; this allows any table type to satisfy the `TableTraits.jl` Queryverse integration interface by defining: `IteratorInterfaceExtensions.getiterator(x::MyTable) = Tables.datavaluerows(x)` -* `Tables.nondatavaluerows(x)`: takes any iterator and replaces any `DataValue` values that are actually missing with `missing` -* `Tables.transform(x, transformfunctions...)`: create a lazy wrapper that satisfies the Tables.jl interface and applies `transformfunctions` to values when accessed; the tranform functions can be a NamedTuple or Dict mapping column name (`String` or `Symbol` or `Integer` index) to `Function` -* `Tables.select(x, columns...)`: create a lazy wrapper that satisfies the Tables.jl interface and keeps only the columns given by the 
`columns` arguments, which can be `String`s, `Symbol`s, or `Integer`s -* `Tables.table(x::AbstractMatrix)`: because any `AbstractMatrix` isn't a table by default, a convenience function is provided to treat an `AbstractMatrix` as a table; see `?Tables.table` for more details -* `Tables.matrix(x; transpose::Bool=false)`: a matrix "sink" function; takes any table input and converts to a dense `Matrix`; see `?Tables.matrix` for more details -* `Tables.eachcolumn`: convenience function for objects satisfying the `Row` or `Columns` interfaces which allows iterating or applying a function over each column; see `?Tables.eachcolumn` for more details +[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliadata.github.io/Tables.jl/stable) +[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliadata.github.io/Tables.jl/dev) \ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index c6719b1..0000000 --- a/appveyor.yml +++ /dev/null @@ -1,31 +0,0 @@ -environment: - matrix: - - julia_version: 1.0 - - julia_version: 1.1 - - julia_version: nightly - -platform: - - x86 # 32-bit - - x64 # 64-bit - -branches: - only: - - master - - /release-.*/ - -notifications: - - provider: Email - on_build_success: false - on_build_failure: false - on_build_status_changed: false - -install: - - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1")) - -build_script: - - echo "%JL_BUILD_SCRIPT%" - - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%" - -test_script: - - echo "%JL_TEST_SCRIPT%" - - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%" \ No newline at end of file diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..2ccde02 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,5 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" + +[compat] +Documenter = "~0.23" \ No newline at end of file diff --git 
a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..81328ad --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,17 @@ +using Documenter, Tables + +makedocs(; + modules=[Tables], + format=Documenter.HTML(), + pages=[ + "Home" => "index.md", + ], + repo="https://github.com/JuliaData/Tables.jl/blob/{commit}{path}#L{line}", + sitename="Tables.jl", + authors="Jacob Quinn", + assets=String[], +) + +deploydocs(; + repo="github.com/JuliaData/Tables.jl", +) diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..5305e4a --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,320 @@ +# Tables.jl Documentation + +This guide provides documentation around the powerful tables interfaces in the Tables.jl package. +Note that the package, and hence, documentation, are geared towards package and library developers +who intend to implement and consume the interfaces. Users, on the other hand, benefit from these +other packages that provide useful access to table data in various formats or workflows. + +With that said, don't hesitate to [open a new issue](https://github.com/JuliaData/Tables.jl/issues/new), even +just for a question, or come chat with us on the [#data](https://julialang.slack.com/messages/data/) slack +channel with question, concerns, or clarifications. + +```@contents +``` + +## Using the Interface (i.e. consuming Tables.jl sources) + +We start by discussing _usage_ of the Tables.jl interfaces, since that can help contextualize _implementing_ them. 
+ +At a high level, Tables.jl provides two powerful APIs for predictably accessing data from any table-like source: +```julia +# access data of input table `x` row-by-row +rows = Tables.rows(x) + +for row in rows + # example of getting all values in the row + # there are other ways to more efficiently process rows + rowvalues = [Tables.getcolumn(row, col) for col in Tables.columnnames(row)] +end + +# access data of input table `x` column-by-column +columns = Tables.columns(x) + +# iterate through each column name in table +for col in Tables.columnnames(columns) + # retrieve entire column by column name + # a column is an indexable collection + # with known length (i.e. supports + # `length(column)` and `column[i]`) + column = Tables.getcolumn(columns, col) +end +``` + +So we see two high-level functions here, `Tables.rows`, and `Tables.columns`. + +```@docs +Tables.rows +Tables.columns +``` + +Given these two powerful data access methods, let's walk through real, albeit somewhat simplified versions of how packages actually use these methods. + +### Tables.rows usage + +First up, let's take a look at the [SQLite.jl](https://github.com/JuliaDatabases/SQLite.jl) package and how it uses the Tables.jl interface to allow loading of generic table-like data into a sqlite relational table. 
Here's the code: +```julia +function load!(table, db::DB, tablename) + # get input table rows + rows = Tables.rows(table) + # query for schema of data + sch = Tables.schema(rows) + # create table using tablename and data schema + createtable!(db, tablename, sch) + # build insert statement + params = chop(repeat("?,", length(sch.names))) + stmt = Stmt(db, "INSERT INTO $tablename VALUES ($params)") + # start a transaction for inserting rows + transaction(db) do + # iterate over rows in the input table + for row in rows + # Tables.jl provides a utility function + # Tables.eachcolumn, which allows efficiently + # applying a function to each column value in a row + # it's called with a schema and row, and applies + # a user-provided function to the column `val`, index `i` + # and column name `nm`. Here, we bind the row values + # to our parameterized SQL INSERT statement and then + # call `sqlite3_step` to execute the INSERT statement. + Tables.eachcolumn(sch, row) do val, i, nm + bind!(stmt, i, val) + end + sqlite3_step(stmt.handle) + sqlite3_reset(stmt.handle) + end + end + return +end +``` + +This is pretty straightforward usage: it calls `Tables.rows` on the input table source, +and since we need the schema to set up the database table, we query it via `Tables.schema`. +We then iterate the rows in our table via `for row in rows`, and use the convenient +`Tables.eachcolumn` to efficiently apply a function to each value in the row. Note that +we didn't call `Tables.columnnames` or `Tables.getcolumn` at all, since they're utilized +by `Tables.eachcolumn` itself. + +One wrinkle to consider is the "unknown schema" case; i.e. what if our [`Tables.schema`](@ref) +call had returned `nothing`. 
+```julia +function load!(sch::Nothing, rows, db::DB, tablename) + # sch is nothing === unknown schema + # start iteration on input table rows + state = iterate(rows) + state === nothing && return + row, st = state + # query column names of first row + names = Tables.columnnames(row) + # partially construct Tables.Schema by at least passing + # the column names to it + sch = Tables.Schema(names, nothing) + # create table if needed + createtable!(db, tablename, sch) + # build insert statement + params = chop(repeat("?,", length(names))) + stmt = Stmt(db, "INSERT INTO $tablename VALUES ($params)") + # start a transaction for inserting rows + transaction(db) do + while true + # just like before, we can still use `Tables.eachcolumn` + # even with our partially constructed Tables.Schema + # to apply a function to each value in the row + Tables.eachcolumn(sch, row) do val, i, nm + bind!(stmt, i, val) + end + sqlite3_step(stmt.handle) + sqlite3_reset(stmt.handle) + # keep iterating rows until we finish + state = iterate(rows, st) + state === nothing && break + row, st = state + end + end + return +end +``` + +The strategy taken here is to start iterating the input source, and using the first row +as a guide, we make a `Tables.Schema` object with just the column names, which we can +then still pass to `Tables.eachcolumn` to apply our `bind!` function to each row value. + +### Tables.columns usage + +Ok, now let's take a look at a case utilizing `Tables.columns`. 
+The following code is taken from the [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl/blob/master/src/other/tables.jl) +Tables.jl implementation: +```julia +getvector(x::AbstractVector) = x +getvector(x) = collect(x) + +# note that copycols is ignored in this definition (Tables.CopiedColumns implies copies have already been made) +fromcolumns(x::Tables.CopiedColumns, names; copycols::Bool=true) = + DataFrame(AbstractVector[getvector(Tables.getcolumn(x, nm)) for nm in names], + Index(names), + copycols=false) +fromcolumns(x, names; copycols::Bool=true) = + DataFrame(AbstractVector[getvector(Tables.getcolumn(x, nm)) for nm in names], + Index(names), + copycols=copycols) + +function DataFrame(x; copycols::Bool=true) + # get columns from input table source + cols = Tables.columns(x) + # get column names as Vector{Symbol}, which is required + # by core DataFrame constructor + names = collect(Symbol, Tables.columnnames(cols)) + return fromcolumns(cols, names; copycols=copycols) +end +``` + +So here we have a generic `DataFrame` constructor that takes a single, untyped argument, +calls `Tables.columns` on it, then `Tables.columnnames` to get the column names. +It then passes the `Columns`-compatible object to an internal function `fromcolumns`, +which dispatches on a special kind of `Columns` object called a [`Tables.CopiedColumns`](@ref), +which wraps any `Columns` object that has already had copies of its columns made, and are thus +safe for the columns-consumer to assume ownership of (this is because DataFrames.jl, by default +makes copies of all columns upon construction). In both cases, individual columns are collected +in `Vector{AbstractVector}`s by calling `Tables.getcolumn(x, nm)` for each column name. +A final note is the call to `getvector` on each column, which ensures each column is materialized +as an `AbstractVector`, as is required by the DataFrame constructor. 
+ +Note that in both the rows and columns usages, we didn't need to worry about the natural orientation +of the input data; we just called `Tables.rows` or `Tables.columns` as was most natural for +the table-specific use-case, knowing that it will Just Work™️. + +### Tables.jl Utilities + +Before moving on to _implementing_ the Tables.jl interfaces, we take a quick +break to highlight some useful utility functions provided by Tables.jl: +```@docs +Tables.rowtable +Tables.columntable +Tables.namedtupleiterator +Tables.datavaluerows +Tables.nondatavaluerows +Tables.table +Tables.matrix +Tables.eachcolumn +Tables.materializer +Tables.columnindex +Tables.columntype +``` + +## Implementing the Interface (i.e. becoming a Tables.jl source) + +Now that we've seen how one _uses_ the Tables.jl interface, let's walk through how to implement it; i.e. how can I +make my custom type valid for Tables.jl consumers? + +The interface to becoming a proper table is straightforward: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.istable(table)` | | Declare that your table type implements the interface | +| One of: | | | +| `Tables.rowaccess(table)` | | Declare that your table type defines a `Tables.rows(table)` method | +| `Tables.rows(table)` | | Return a `Row` iterator from your table | +| Or: | | | +| `Tables.columnaccess(table)` | | Declare that your table type defines a `Tables.columns(table)` method | +| `Tables.columns(table)` | | Return a `Columns`-compatible object from your table | +| Optional methods | | | +| `Tables.schema(x)` | `Tables.schema(x) = nothing` | Return a `Tables.Schema` object from your `Row` iterator or `Columns` object; or `nothing` for unknown schema | +| `Tables.materializer(table)` | `Tables.columntable` | Declare a "materializer" sink function for your table type that can construct an instance of your type from any Tables.jl input | + +Based on whether your table 
type has defined `Tables.rows` or `Tables.columns`, you then ensure that the `Row` iterator +or `Columns` object satisfies the respective interface: +```@docs +Tables.Row +Tables.Columns +``` + +Though the strict requirements for `Row` and `Columns` are minimal (just `getcolumn` and `columnnames`), you may desire +additional behavior for your row or columns types (and you're implementing them yourself). For convenience, Tables.jl +defines the `Tables.AbstractRow` and `Tables.AbstractColumns` abstract types, to allow subtyped custom types to +inherit convenient behavior, such as indexing, iteration, and property access, all defined in terms of `getcolumn` and `columnnames`. +```@docs +Tables.AbstractRow +Tables.AbstractColumns +``` + +As an extended example, let's take a look at some code defined in Tables.jl for treating `AbstractMatrix`s as tables. + +First, we define a special `MatrixTable` type that will wrap an `AbstractMatrix`, and allow easy overloading for the +Tables.jl interface. +```julia +struct MatrixTable{T <: AbstractMatrix} <: Tables.AbstractColumns + names::Vector{Symbol} + lookup::Dict{Symbol, Int} + matrix::T +end +# declare that MatrixTable is a table +Tables.istable(::Type{<:MatrixTable}) = true +# getter method on stored column names +names(m::MatrixTable) = getfield(m, :names) +# schema is column names and types +Tables.schema(m::MatrixTable{T}) where {T} = Tables.Schema(names(m), fill(eltype(T), size(getfield(m, :matrix), 2))) +``` +Here we defined `Tables.istable` for all `MatrixTable` types, signaling that my type implements the Tables.jl interfaces. +We also defined `Tables.schema` by pulling the column names out that we stored, and since `AbstractMatrix` have a single +`eltype`, we repeat it for each column. Note that defining `Tables.schema` is optional on tables; by default, `nothing` +is returned and Tables.jl consumers should account for both known and unknown schema cases. 
It tends to allow consumers +to have certain optimizations when they can know the types of all columns upfront (and if the # of columns isn't too large) +to generate more efficient code. + +Now, in this example, we're actually going to have `MatrixTable` implement _both_ `Tables.rows` and `Tables.columns` +methods itself, i.e. it's going to return itself from those functions, so here's first how we make our `MatrixTable` a +valid `Columns` object: +```julia +# column interface +Tables.columnaccess(::Type{<:MatrixTable}) = true +Tables.columns(m::MatrixTable) = m +# required Columns object methods +Tables.getcolumn(m::MatrixTable, ::Type{T}, col::Int, nm::Symbol) where {T} = getfield(m, :matrix)[:, col] +Tables.getcolumn(m::MatrixTable, nm::Symbol) = getfield(m, :matrix)[:, getfield(m, :lookup)[nm]] +Tables.getcolumn(m::MatrixTable, i::Int) = getfield(m, :matrix)[:, i] +Tables.columnnames(m::MatrixTable) = names(m) +``` +We define `columnaccess` for our type, then `columns` just returns the `MatrixTable` itself, and then we define +the three `getcolumn` methods and `columnnames`. Note the use of a `lookup` Dict that maps column name to column index +so we can figure out which column to return from the matrix. We're also storing the column names in our `names` field +so the `columnnames` implementation is trivial. And that's it! Literally! It can now be written out to a csv file, +stored in a sqlite or other database, converted to DataFrame or JuliaDB table, etc. Pretty fun. 
+ +And now for the `Tables.rows` implementation: +```julia +# declare that any MatrixTable defines its own `Tables.rows` method +rowaccess(::Type{<:MatrixTable}) = true +# just return itself, which means MatrixTable must iterate `Row`-compatible objects +rows(m::MatrixTable) = m +# the iteration interface, at a minimum, requires `eltype`, `length`, and `iterate` +# for `MatrixTable` `eltype`, we're going to provide a custom row type +Base.eltype(m::MatrixTable{T}) where {T} = MatrixRow{T} +Base.length(m::MatrixTable) = size(getfield(m, :matrix), 1) + +Base.iterate(m::MatrixTable, st=1) = st > length(m) ? nothing : (MatrixRow(st, m), st + 1) + +# a custom Row type; acts as a "view" into a row of an AbstractMatrix +struct MatrixRow{T} <: Tables.AbstractRow + row::Int + source::MatrixTable{T} +end +# required `Row` interface methods (same as for `Columns` object before) +getcolumn(m::MatrixRow, ::Type, col::Int, nm::Symbol) = + getfield(getfield(m, :source), :matrix)[getfield(m, :row), col] +getcolumn(m::MatrixRow, i::Int) = + getfield(getfield(m, :source), :matrix)[getfield(m, :row), i] +getcolumn(m::MatrixRow, nm::Symbol) = + getfield(getfield(m, :source), :matrix)[getfield(m, :row), getfield(getfield(m, :source), :lookup)[nm]] +columnnames(m::MatrixRow) = names(getfield(m, :source)) +``` +Here we start by defining `Tables.rowaccess` and `Tables.rows`, and then the iteration interface methods, +since we declared that a `MatrixTable` itself is an iterator of `Row`-compatible objects. For `eltype`, +we say that a `MatrixTable` iterates our own custom row type, `MatrixRow`. `MatrixRow` subtypes +`Tables.AbstractRow`, which has the same required interface as a `Row` object, but also provides interface +implementations for several useful behaviors (indexing, iteration, property-access, etc.); essentially it +makes our custom `MatrixRow` type more convenient to work with. 
+ +Implementing the `Row`/`Tables.AbstractRow` interface is straightforward, and very similar to our implementation +of `Columns` previously (i.e. the same methods for `getcolumn` and `columnnames`). + +And that's it. Our `MatrixTable` type is now a fully fledged, valid Tables.jl source and can be used throughout +the ecosystem. Now, this is obviously not a lot of code; but then again, the actual Tables.jl interface +implementations tend to be fairly simple, given the other behaviors that are already defined for table types +(i.e. table types tend to already have a `getcolumn`-like function defined). diff --git a/src/Tables.jl b/src/Tables.jl index d90e527..1f8d606 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -8,98 +8,313 @@ if !hasmethod(getproperty, Tuple{Tuple, Int}) Base.getproperty(t::Tuple, i::Int) = t[i] end -"Abstract row type with a simple required interface: row values are accessible via `getproperty(row, field)`; for example, a NamedTuple like `nt = (a=1, b=2, c=3)` can access its value for `a` like `nt.a` which turns into a call to the function `getproperty(nt, :a)`" -abstract type Row end - """ -The Tables.jl package provides simple, yet powerful interface functions for working with all kinds tabular data through predictable access patterns. - -```julia - Tables.rows(table) => Rows - Tables.columns(table) => Columns -``` -Where `Rows` and `Columns` are the duals of each other: -* `Rows` is an iterator of property-accessible objects (any type that supports `propertynames(row)` and `getproperty(row, nm::Symbol`) -* `Columns` is a property-accessible object of iterators (i.e.
each column is an iterator) - -In addition to these `Rows` and `Columns` objects, it's useful to be able to query properties of these objects: -* `Tables.schema(x::Union{Rows, Columns}) => Union{Tables.Schema, Nothing}`: returns a `Tables.Schema` object, or `nothing` if the table's schema is unknown -* For the `Tables.Schema` object: - * column names can be accessed as a tuple of Symbols like `sch.names` - * column types can be accessed as a tuple of types like `sch.types` - * See `?Tables.Schema` for more details on this type + Tables.Columns + +An interface type defined as an ordered set of columns that support +retrieval of individual columns by name or index. A retrieved column +must be an indexable collection with known length, i.e. an object +that supports `length(col)` and `col[i]` for any `i = 1:length(col)`. +The high-level [`Tables.columns`](@ref) function returns a `Columns`-compatible +object from any input table source. + +Any object implements the `Columns` interface, by satisfying the following: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(table, i::Int)` | getfield(table, i) | Retrieve a column by index | +| `Tables.getcolumn(table, nm::Symbol)` | getproperty(table, nm) | Retrieve a column by name | +| `Tables.columnnames(table)` | propertynames(table) | Return column names for a table as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(table, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(table, nm) | Given a column eltype `T`, index `i`, and column name `nm`, retrieve the column. Provides a type-stable or even constant-prop-able mechanism for efficiency. + +Note that table sources shouldn't subtype `Columns`, as it is purely an interface type +to help document the Tables.jl API. See the [`Tables.AbstractColumns`](@ref) type +for a type to potentially subtype to gain useful default behaviors. 
+""" +abstract type Columns end -A big part of the power in these simple interface functions is that each (`Tables.rows` & `Tables.columns`) is defined for any table type, even if the table type only explicitly implements one interface function or the other. -This is accomplished by providing performant, generic fallback definitions in Tables.jl itself (though obviously nothing prevents a table type from implementing each interface function directly). +""" + Tables.AbstractColumns + +Abstract type provided to allow custom table types to inherit useful and required behavior. Note that this type +is for convenience for table _source_ authors to provide useful default behavior to their `Columns` object, +and not to be used or relied upon by sink authors to dispatch on; i.e. not all `Columns` objects will inherit +from `Tables.AbstractColumns`. + +Interface definition: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(table, i::Int)` | getfield(table, i) | Retrieve a column by index | +| `Tables.getcolumn(table, nm::Symbol)` | getproperty(table, nm) | Retrieve a column by name | +| `Tables.columnnames(table)` | propertynames(table) | Return column names for a table as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(table, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(table, nm) | Given a column eltype `T`, index `i`, and column name `nm`, retrieve the column. Provides a type-stable or even constant-prop-able mechanism for efficiency. + +While custom table types aren't required to subtype `Tables.AbstractColumns`, benefits of doing so include: + * Indexing interface defined (using `getcolumn`); i.e. `tbl[i]` will retrieve the column at index `i` + * Property access interface defined (using `columnnames` and `getcolumn`); i.e. `tbl.col1` will retrieve column named `col1` + * Iteration interface defined; i.e. 
`for col in table` will iterate each column in the table + * A default `show` method +This allows a custom table type to behave as close as possible to a builtin `NamedTuple` of vectors object. +""" +abstract type AbstractColumns end -With these simple definitions, powerful workflows are enabled: -* A package providing data cleansing, manipulation, visualization, or analysis can automatically handle any number of decoupled input table types -* A tabular file format can have automatic integration with in-memory structures and translation to other file formats +""" + Tables.Row + +An interface type that represents a single row of a table, with column values retrievable by name or index. +The high-level [`Tables.rows`](@ref) function returns a `Row`-compatible +iterator from any input table source. + +Any object implements the `Row` interface, by satisfying the following: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(row, i::Int)` | getfield(row, i) | Retrieve a column value by index | +| `Tables.getcolumn(row, nm::Symbol)` | getproperty(row, nm) | Retrieve a column value by name | +| `Tables.columnnames(row)` | propertynames(row) | Return column names for a row as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(row, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(row, nm) | Given a column type `T`, index `i`, and column name `nm`, retrieve the column value. Provides a type-stable or even constant-prop-able mechanism for efficiency. + +Note that custom row types shouldn't subtype `Row`, as it is purely an interface type +to help document the Tables.jl API. See the [`Tables.AbstractRow`](@ref) type +for a type to potentially subtype to gain useful default behaviors. +""" +abstract type Row end -So how does one go about satisfying the Tables.jl interface functions? 
It mainly depends on what you've already defined and the natural access patterns of your table: +""" + Tables.AbstractRow + +Abstract type provided to allow custom row types to inherit useful and required behavior. Note that this type +is for convenience for table _source_ authors to provide useful default behavior to their `Row` object, +and not to be used or relied upon by sink authors to dispatch on; i.e. not all `Row` objects will inherit +from `Tables.AbstractRow`. + +Interface definition: +| Required Methods | Default Definition | Brief Description | +| ---------------- | ------------------ | ----------------- | +| `Tables.getcolumn(row, i::Int)` | getfield(row, i) | Retrieve a column value by index | +| `Tables.getcolumn(row, nm::Symbol)` | getproperty(row, nm) | Retrieve a column value by name | +| `Tables.columnnames(row)` | propertynames(row) | Return column names for a row as an indexable collection | +| Optional methods | | | +| `Tables.getcolumn(row, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(row, nm) | Given a column type `T`, index `i`, and column name `nm`, retrieve the column value. Provides a type-stable or even constant-prop-able mechanism for efficiency. + +While custom row types aren't required to subtype `Tables.AbstractRow`, benefits of doing so include: + * Indexing interface defined (using `getcolumn`); i.e. `row[i]` will return the column value at index `i` + * Property access interface defined (using `columnnames` and `getcolumn`); i.e. `row.col1` will retrieve the value for the column named `col1` + * Iteration interface defined; i.e. `for x in row` will iterate each column value in the row + * A default `show` method +This allows the custom row type to behave as close as possible to a builtin `NamedTuple` object. 
+""" +abstract type AbstractRow end -First: -* `Tables.istable(::Type{<:MyTable}) = true`: this provides an explicit affirmation that your type implements the Tables interface +""" + Tables.getcolumn(::Columns, nm::Symbol) => Indexable collection with known length + Tables.getcolumn(::Columns, i::Int) => Indexable collection with known length + Tables.getcolumn(::Columns, T, i::Int, nm::Symbol) => Indexable collection with known length + + Tables.getcolumn(::Row, nm::Symbol) => Column value + Tables.getcolumn(::Row, i::Int) => Column value + Tables.getcolumn(::Row, T, i::Int, nm::Symbol) => Column value + +Retrieve an entire column (`Columns`) or single row column value (`Row`) by column name (`nm`), index (`i`), +or if desired, by column type (`T`), index (`i`), and name (`nm`). When called on a `Columns` interface object, +a `Column` is returned, which is an indexable collection with known length. When called on a `Row` interface +object, it returns the single column value. The methods taking a single `Symbol` or `Int` are both required +for the `AbstractColumns` and `AbstractRow` interfaces; the third method is optional if type stability is possible. +The default definition of `Tables.getcolumn(x, i::Int)` is `getfield(x, i)`. The default definition of +`Tables.getcolumn(x, nm::Symbol)` is `getproperty(x, nm)`. 
+""" +function getcolumn end -To support `Rows`: -* Define `Tables.rowaccess(::Type{<:MyTable}) = true`: this signals to other types that `MyTable` supports valid `Row`-iteration -* Define `Tables.rows(x::MyTable)`: return a `Row`-iterator object (perhaps the table itself if already defined) -* Define `Tables.schema(Tables.rows(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason +getcolumn(x, i::Int) = getfield(x, i) +getcolumn(x, nm::Symbol) = getproperty(x, nm) +getcolumn(x, ::Type{T}, i::Int, nm::Symbol) where {T} = getcolumn(x, nm) +getcolumn(x::NamedTuple{names, types}, ::Type{T}, i::Int, nm::Symbol) where {names, types, T} = Core.getfield(x, i) -To support `Columns`: -* Define `Tables.columnaccess(::Type{<:MyTable}) = true`: this signals to other types that `MyTable` supports returning a valid `Columns` object -* Define `Tables.columns(x::MyTable)`: return a `Columns`, property-accessible object (perhaps the table itself if it naturally supports property-access to columns) -* Define `Tables.schema(Tables.columns(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason +""" + Tables.columnnames(::Union{Columns, Row}) => Indexable collection -The final question is how `MyTable` can be a "sink" for any other table type. The answer is quite simple: use the interface functions! +Retrieves the list of column names as an indexable collection (like a `Tuple` or `Vector`) for a `Columns` or `Row` interface object. The default definition calls `propertynames(x)`. 
+""" +function columnnames end + +columnnames(x) = propertynames(x) + +# default definitions for AbstractDict to act as Row +getcolumn(x::AbstractDict, i::Int) = x[i] +getcolumn(x::AbstractDict, nm::Symbol) = x[nm] +getcolumn(x::AbstractDict, ::Type{T}, i::Int, nm::Symbol) where {T} = x[nm] +columnnames(x::AbstractDict) = collect(keys(x)) + +# Dict iterator as Rows +const DictRows = AbstractVector{T} where {T <: AbstractDict} +istable(::Type{<:DictRows}) = true +rowaccess(::Type{<:DictRows}) = true +rows(x::DictRows) = x +# DictRows doesn't naturally lend itself to the `Tables.schema` requirement +# we can't just look at the first row, because the types might change, +# row-to-row (e.g. `missing`, then `1.1`, etc.). Therefore, the safest option +# is to just return `nothing` +schema(x::DictRows) = nothing + +# and as Columns +const DictColumns = AbstractDict{K, V} where {K <: Union{Integer, Symbol, String}, V <: AbstractVector} +istable(::Type{<:DictColumns}) = true +columnaccess(::Type{<:AbstractDict}) = true +columns(x::DictColumns) = x +schema(x::DictColumns) = Schema(collect(keys(x)), eltype.(values(x))) + +# for other AbstractDict, let's throw an informative error +columns(x::T) where {T <: AbstractDict} = error("to treat $T as a table, it must have a key type of `Integer`, `Symbol`, or `String`, and a value type `<: AbstractVector`") + +# default definitions for AbstractRow, AbstractColumns +const RorC = Union{AbstractRow, AbstractColumns} + +Base.IteratorSize(::Type{R}) where {R <: RorC} = Base.HasLength() +Base.length(r::RorC) = length(columnnames(r)) +Base.firstindex(r::RorC) = 1 +Base.lastindex(r::RorC) = length(r) +Base.getindex(r::RorC, i::Int) = getcolumn(r, i) +Base.getindex(r::RorC, nm::Symbol) = getcolumn(r, nm) +Base.getproperty(r::RorC, nm::Symbol) = getcolumn(r, nm) +Base.getproperty(r::RorC, i::Int) = getcolumn(r, i) +Base.propertynames(r::RorC) = columnnames(r) +Base.keys(r::RorC) = columnnames(r) +Base.values(r::RorC) = collect(r) 
+Base.haskey(r::RorC, key::Symbol) = key in columnnames(r) +Base.haskey(r::RorC, i::Int) = 0 < i <= length(columnnames(r)) +Base.get(r::RorC, key::Union{Integer, Symbol}, default) = haskey(r, key) ? getcolumn(r, key) : default +Base.get(f::Base.Callable, r::RorC, key::Union{Integer, Symbol}) = haskey(r, key) ? getcolumn(r, key) : f() +Base.iterate(r::RorC, i=1) = i > length(r) ? nothing : (getcolumn(r, i), i + 1) + +function Base.show(io::IO, x::T) where {T <: RorC} + println(io, "$T:") + names = collect(columnnames(x)) + values = [getcolumn(x, nm) for nm in names] + Base.print_matrix(io, hcat(names, values)) +end -* Define a function or constructor that takes, at a minimum, a single, untyped argument and then calls `Tables.rows` or `Tables.columns` on that argument to construct an instance of `MyTable` +# AbstractRow AbstractVector as Rows +const AbstractRowTable = AbstractVector{T} where {T <: AbstractRow} +istable(::Type{<:AbstractRowTable}) = true +rowaccess(::Type{<:AbstractRowTable}) = true +rows(x::AbstractRowTable) = x +schema(x::AbstractRowTable) = nothing -For example, if `MyTable` is a row-oriented format, I might define my "sink" function like: -```julia -function MyTable(x) - Tables.istable(x) || throw(ArgumentError("MyTable requires a table input")) - rows = Tables.rows(x) - sch = Tables.schema(rows) - names = sch.names - types = sch.types - # custom constructor that creates an "empty" MyTable according to given column names & types - # note that the "unknown" schema case should be considered, i.e.
when `sch.types => nothing` - mytbl = MyTable(names, types) - for row in rows - # a convenience function provided in Tables.jl for "unrolling" access to each column/property of a `Row` - # it works by applying a provided function to each value; see `?Tables.eachcolumn` for more details - Tables.eachcolumn(sch, row) do val, col, name - push!(mytbl[col], val) - end - end - return mytbl -end -``` +# AbstractColumns as Columns +istable(::Type{<:AbstractColumns}) = true +columnaccess(::Type{<:AbstractColumns}) = true +columns(x::AbstractColumns) = x +schema(x::AbstractColumns) = nothing -Alternatively, if `MyTable` is column-oriented, perhaps my definition would be more like: -```julia -function MyTable(x) - Tables.istable(x) || throw(ArgumentError("MyTable requires a table input")) - cols = Tables.columns(x) - # here we use Tables.eachcolumn to iterate over each column in a `Columns` object - return MyTable(collect(propertynames(cols)), [collect(col) for col in Tables.eachcolumn(cols)]) -end -``` +# default definitions +""" + Tables.istable(x) => Bool -Obviously every table type is different, but via a combination of `Tables.rows` and `Tables.columns` each table type should be able to construct an instance of itself. +Check if an object has specifically defined that it is a table. Note that +not all valid tables will return true, since it's possible to satisfy the +Tables.jl interface at "run-time", e.g. a `Generator` of `NamedTuple`s iterates +`NamedTuple`s, which satisfies the Row interface, but there's no static way +of knowing that the generator is a table. """ -abstract type Table end +function istable end -# default definitions istable(x::T) where {T} = istable(T) || TableTraits.isiterabletable(x) === true istable(::Type{T}) where {T} = false + +""" + Tables.rowaccess(x) => Bool + +Check whether an object has specifically defined that it implements the `Tables.rows` +function. 
Note that `Tables.rows` will work on any object that iterates `Row`-compatible +objects, even if they don't define `rowaccess`, e.g. a `Generator` of `NamedTuple`s. Also +note that just because an object defines `rowaccess` doesn't mean a user should call +`Tables.rows` on it; `Tables.columns` will also work, providing a valid `Columns` +object from the rows. Hence, users should call `Tables.rows` or `Tables.columns` +depending on what is most natural for them to *consume* instead of worrying about +what and how the input produces. +""" +function rowaccess end + rowaccess(x::T) where {T} = rowaccess(T) rowaccess(::Type{T}) where {T} = false + +""" + Tables.columnaccess(x) => Bool + +Check whether an object has specifically defined that it implements the `Tables.columns` +function. Note that `Tables.columns` has generic fallbacks allowing it to produce `Columns` +objects, even if the input doesn't define `columnaccess`. Also note that just because an +object defines `columnaccess` doesn't mean a user should call `Tables.columns` on it; +`Tables.rows` will also work, providing a valid `Row` iterator. Hence, users should call +`Tables.rows` or `Tables.columns` depending on what is most natural for them to *consume* +instead of worrying about what and how the input produces. +""" +function columnaccess end + columnaccess(x::T) where {T} = columnaccess(T) columnaccess(::Type{T}) where {T} = false + +""" + Tables.schema(x) => Union{Nothing, Tables.Schema} + +Attempt to retrieve the schema of the object returned by `Tables.rows` or `Tables.columns`. +If the `Row` iterator or `Columns` object can't determine its schema, `nothing` will be returned. +Otherwise, a `Tables.Schema` object is returned, with the column names and types available for use.
+""" +function schema end + schema(x) = nothing -materializer(x) = columntable + +""" + Tables.materializer(x) => Callable + +For a table input, return the "sink" function or "materializing" function that can take a +Tables.jl-compatible table input and make an instance of the table type. This enables "transform" +workflows that take table inputs, apply transformations, potentially converting the table to +a different form, and end with producing a table of the same type as the original input. The +default materializer is `Tables.columntable`, which converts any table input into a `NamedTuple` +of `Vector`s. +""" +function materializer end + +materializer(x::T) where {T} = materializer(T) +materializer(::Type{T}) where {T} = columntable + +""" + Tables.columns(x) => Columns-compatible object + +Accesses data of input table source `x` by returning a [`Columns`](@ref)-compatible +object, which allows retrieving entire columns by name or index. A retrieved column +is an object that is indexable and has a known length, i.e. supports +`length(col)` and `col[i]` for any `i = 1:length(col)`. Note that +even if the input table source is row-oriented by nature, an efficient generic +definition of `Tables.columns` is defined in Tables.jl to build a `Columns`- +compatible object from the input rows. + +The [`Tables.Schema`](@ref) of a `Columns` object can be queried via `Tables.schema(columns)`, +which may return `nothing` if the schema is unknown. +Column names can be queried by calling `Tables.columnnames(columns)`. And individual columns +can be accessed by calling `Tables.getcolumn(columns, i::Int)` or `Tables.getcolumn(columns, nm::Symbol)` +with a column index or name, respectively. +""" +function columns end + +""" + Tables.rows(x) => Row iterator + +Accesses data of input table source `x` row-by-row by returning a [`Row`](@ref) iterator.
+Note that even if the input table source is column-oriented by nature, an efficient generic +definition of `Tables.rows` is defined in Tables.jl to return an iterator of row views into +the columns of the input. + +The [`Tables.Schema`](@ref) of a `Row` iterator can be queried via `Tables.schema(rows)`, +which may return `nothing` if the schema is unknown. +Column names can be queried by calling `Tables.columnnames(row)` on an individual row. +And row values can be accessed by calling `Tables.getcolumn(row, i::Int)` or +`Tables.getcolumn(row, nm::Symbol)` with a column index or name, respectively. +""" +function rows end # Schema implementation """ @@ -173,7 +388,7 @@ include("operations.jl") include("matrix.jl") "Return the column index (1-based) of a `colname` in a table with a known schema; returns 0 if `colname` doesn't exist in table" -columnindex(table, colname) = columnindex(schema(table).names, colname) +columnindex(table, colname) = columnindex(schema(table), colname) "Return the column type of a `colname` in a table with a known schema; returns Union{} if `colname` doesn't exist in table" columntype(table, colname) = columntype(schema(table), colname) diff --git a/src/fallbacks.jl b/src/fallbacks.jl index ca2945e..946bfe5 100644 --- a/src/fallbacks.jl +++ b/src/fallbacks.jl @@ -1,23 +1,30 @@ ## generic `Tables.rows` and `Tables.columns` fallbacks ## if a table provides Tables.rows or Tables.columns, -## we'll provide a default implementation of the dual +## we'll provide a default implementation of the other -# generic row iteration of columns +# for Columns objects, we define a generic RowIterator wrapper to turn any Columns into a Rows + +# get the number of rows in the incoming table function rowcount(cols) - props = propertynames(cols) - isempty(props) && return 0 - return length(getproperty(cols, props[1])) + names = columnnames(cols) + isempty(names) && return 0 + return length(getcolumn(cols, names[1])) end -struct ColumnsRow{T} +# a lazy row
view into a Columns object +struct ColumnsRow{T} <: AbstractRow columns::T # a `Columns` object - row::Int + row::Int # row number end -Base.getproperty(c::ColumnsRow, ::Type{T}, col::Int, nm::Symbol) where {T} = getproperty(getfield(c, 1), T, col, nm)[getfield(c, 2)] -Base.getproperty(c::ColumnsRow, nm::Int) = getproperty(getfield(c, 1), nm)[getfield(c, 2)] -Base.getproperty(c::ColumnsRow, nm::Symbol) = getproperty(getfield(c, 1), nm)[getfield(c, 2)] -Base.propertynames(c::ColumnsRow) = propertynames(getfield(c, 1)) +getcolumns(c::ColumnsRow) = getfield(c, :columns) +getrow(c::ColumnsRow) = getfield(c, :row) + +# AbstractRow interface +Base.@propagate_inbounds getcolumn(c::ColumnsRow, ::Type{T}, col::Int, nm::Symbol) where {T} = getcolumn(getcolumns(c), T, col, nm)[getrow(c)] +Base.@propagate_inbounds getcolumn(c::ColumnsRow, i::Int) = getcolumn(getcolumns(c), i)[getrow(c)] +Base.@propagate_inbounds getcolumn(c::ColumnsRow, nm::Symbol) = getcolumn(getcolumns(c), nm)[getrow(c)] +columnnames(c::ColumnsRow) = columnnames(getcolumns(c)) @generated function Base.isless(c::ColumnsRow{T}, d::ColumnsRow{T}) where {T <: NamedTuple{names}} where names exprs = Expr[] @@ -46,16 +53,19 @@ end Expr(:block, exprs...) 
end +# RowIterator wraps a Columns object and provides row iteration via lazy row views struct RowIterator{T} columns::T len::Int end + Base.eltype(x::RowIterator{T}) where {T} = ColumnsRow{T} Base.length(x::RowIterator) = x.len istable(::Type{<:RowIterator}) = true rowaccess(::Type{<:RowIterator}) = true rows(x::RowIterator) = x -columnaccess(::Type{<:RowIterator{T}}) where T = columnaccess(T) + +columnaccess(::Type{<:RowIterator}) = true columns(x::RowIterator) = x.columns materializer(x::RowIterator) = materializer(x.columns) schema(x::RowIterator) = schema(x.columns) @@ -65,21 +75,29 @@ function Base.iterate(rows::RowIterator, st=1) return ColumnsRow(rows.columns, st), st + 1 end +# this is our generic Tables.rows fallback definition function rows(x::T) where {T} + # because this method is being called, we know `x` didn't define it's own Tables.rows + # first check if it supports column access, and if so, wrap it in a RowIterator if columnaccess(T) cols = columns(x) return RowIterator(cols, Int(rowcount(cols))) + # otherwise, if the input is at least iterable, we'll wrap it in an IteratorWrapper + # which will iterate the input, validating that it supports the Row interface + # and unwrapping any DataValues that are encountered elseif IteratorInterfaceExtensions.isiterable(x) return nondatavaluerows(x) end throw(ArgumentError("no default `Tables.rows` implementation for type: $T")) end -# build columns from rows +# for Rows objects, we define a "collect"-like routine to build up columns from iterated rows + """ - Tables.allocatecolumn(::Type{T}, len) => returns a column type (usually AbstractVector) w/ size to hold `len` elements + Tables.allocatecolumn(::Type{T}, len) => returns a column type (usually `AbstractVector`) with size to hold `len` elements - Custom column types can override with an appropriate "scalar" element type that should dispatch to their column allocator. 
+Custom column types can override with an appropriate "scalar" element type that should dispatch to their column allocator. +Alternatively, and more generally, custom scalars can overload `DataAPI.defaultarray` to signal the default array type. """ allocatecolumn(T, len) = DataAPI.defaultarray(T, 1)(undef, len) @@ -131,11 +149,20 @@ function __buildcolumns(rowitr, st, sch, columns, rownbr, updated) row, st = state rownbr += 1 eachcolumns(add_or_widen!, sch, row, columns, rownbr, updated, Base.IteratorSize(rowitr)) + # little explanation here: we just called add_or_widen! for each column value of our row + # note that when a column's type is widened, `updated` is set w/ the new set of columns + # we then check if our current `columns` isn't the same object as our `updated` ref + # if it isn't, we're going to call __buildcolumns again, passing our new updated ref as + # columns, which allows __buildcolumns to specialize (i.e. recompile) based on the new types + # of updated. So a new __buildcolumns will be compiled for each widening event. columns !== updated[] && return __buildcolumns(rowitr, st, sch, updated[], rownbr, updated) end return updated end +# for the schema-less case, we do one extra step of initializing each column as an `EmptyVector` +# and doing an initial widening for each column in _buildcolumns, before passing the widened +# set of columns on to __buildcolumns struct EmptyVector <: AbstractVector{Union{}} len::Int end @@ -153,14 +180,24 @@ end state = iterate(rowitr) state === nothing && return NamedTuple() row, st = state - names = Tuple(propertynames(row)) + names = Tuple(columnnames(row)) len = Base.haslength(T) ? 
length(rowitr) : 0 sch = Schema(names, nothing) columns = Tuple(EmptyVector(len) for _ = 1:length(names)) return NamedTuple{Base.map(Symbol, names)}(_buildcolumns(rowitr, row, st, sch, columns, Ref{Any}(columns))[]) end -struct CopiedColumns{T} +""" + Tables.CopiedColumns + +For some sinks, there's a concern about whether they can safely "own" columns from the input. +To be safe, they should always copy input columns, to avoid unintended mutation. +When we've called `buildcolumns`, however, Tables.jl essentially built/owns the columns, +and it's happy to pass ownership to the sink. Thus, any built columns will be wrapped +in a `CopiedColumns` struct to signal to the sink that essentially "a copy has already been made" +and they're safe to assume ownership. +""" +struct CopiedColumns{T} <: AbstractColumns x::T end @@ -170,15 +207,25 @@ columnaccess(::Type{<:CopiedColumns}) = true columns(x::CopiedColumns) = x schema(x::CopiedColumns) = schema(source(x)) materializer(x::CopiedColumns) = materializer(source(x)) -Base.propertynames(x::CopiedColumns) = propertynames(source(x)) -Base.getproperty(x::CopiedColumns, nm::Symbol) = getproperty(source(x), nm) +getcolumn(x::CopiedColumns, ::Type{T}, col::Int, nm::Symbol) where {T} = getcolumn(source(x), T, col, nm) +getcolumn(x::CopiedColumns, i::Int) = getcolumn(source(x), i) +getcolumn(x::CopiedColumns, nm::Symbol) = getcolumn(source(x), nm) +columnnames(x::CopiedColumns) = columnnames(source(x)) + +# here's our generic fallback Tables.columns definition @inline function columns(x::T) where {T} + # because this method is being called, we know `x` didn't define it's own Tables.columns method + # first check if it explicitly supports row access, and if so, build up the desired columns if rowaccess(T) r = rows(x) return CopiedColumns(buildcolumns(schema(r), r)) + # though not widely supported, if a source supports the TableTraits column interface, use it elseif TableTraits.supports_get_columns_copy_using_missing(x) return 
CopiedColumns(TableTraits.get_columns_copy_using_missing(x)) + # otherwise, if the source is at least iterable, we'll wrap it in an IteratorWrapper and + # build columns from that, which will check if the source correctly iterates valid Row objects + # and unwraps DataValues for us elseif IteratorInterfaceExtensions.isiterable(x) iw = nondatavaluerows(x) return CopiedColumns(buildcolumns(schema(iw), iw)) diff --git a/src/matrix.jl b/src/matrix.jl index 88539df..e23caa2 100644 --- a/src/matrix.jl +++ b/src/matrix.jl @@ -1,10 +1,9 @@ istable(::Type{<:AbstractMatrix}) = false -istable(::AbstractMatrix) = false rows(m::T) where {T <: AbstractMatrix} = throw(ArgumentError("a '$T' is not a table; see `?Tables.table` for ways to treat an AbstractMatrix as a table")) columns(m::T) where {T <: AbstractMatrix} = throw(ArgumentError("a '$T' is not a table; see `?Tables.table` for ways to treat an AbstractMatrix as a table")) -struct MatrixTable{T <: AbstractMatrix} +struct MatrixTable{T <: AbstractMatrix} <: AbstractColumns names::Vector{Symbol} lookup::Dict{Symbol, Int} matrix::T @@ -14,16 +13,18 @@ istable(::Type{<:MatrixTable}) = true names(m::MatrixTable) = getfield(m, :names) # row interface -struct MatrixRow{T} +struct MatrixRow{T} <: AbstractRow row::Int source::MatrixTable{T} end -Base.getproperty(m::MatrixRow, ::Type, col::Int, nm::Symbol) = +getcolumn(m::MatrixRow, ::Type, col::Int, nm::Symbol) = getfield(getfield(m, :source), :matrix)[getfield(m, :row), col] -Base.getproperty(m::MatrixRow, nm::Symbol) = +getcolumn(m::MatrixRow, i::Int) = + getfield(getfield(m, :source), :matrix)[getfield(m, :row), i] +getcolumn(m::MatrixRow, nm::Symbol) = getfield(getfield(m, :source), :matrix)[getfield(m, :row), getfield(getfield(m, :source), :lookup)[nm]] -Base.propertynames(m::MatrixRow) = names(getfield(m, :source)) +columnnames(m::MatrixRow) = names(getfield(m, :source)) rowaccess(::Type{<:MatrixTable}) = true schema(m::MatrixTable{T}) where {T} = Schema(Tuple(names(m)), 
NTuple{size(getfield(m, :matrix), 2), eltype(T)}) @@ -31,20 +32,18 @@ rows(m::MatrixTable) = m Base.eltype(m::MatrixTable{T}) where {T} = MatrixRow{T} Base.length(m::MatrixTable) = size(getfield(m, :matrix), 1) -function Base.iterate(m::MatrixTable, st=1) - st > length(m) && return nothing - return MatrixRow(st, m), st + 1 -end +Base.iterate(m::MatrixTable, st=1) = st > length(m) ? nothing : (MatrixRow(st, m), st + 1) # column interface columnaccess(::Type{<:MatrixTable}) = true columns(m::MatrixTable) = m -Base.getproperty(m::MatrixTable, ::Type{T}, col::Int, nm::Symbol) where {T} = getfield(m, :matrix)[:, col] -Base.getproperty(m::MatrixTable, nm::Symbol) = getfield(m, :matrix)[:, getfield(m, :lookup)[nm]] -Base.propertynames(m::MatrixTable) = names(m) +getcolumn(m::MatrixTable, ::Type{T}, col::Int, nm::Symbol) where {T} = getfield(m, :matrix)[:, col] +getcolumn(m::MatrixTable, nm::Symbol) = getfield(m, :matrix)[:, getfield(m, :lookup)[nm]] +getcolumn(m::MatrixTable, i::Int) = getfield(m, :matrix)[:, i] +columnnames(m::MatrixTable) = names(m) """ -Tables.table(m::AbstractMatrix; [header::Vector{Symbol}]) + Tables.table(m::AbstractMatrix; [header::Vector{Symbol}]) Wrap an `AbstractMatrix` (`Matrix`, `Adjoint`, etc.) in a `MatrixTable`, which satisfies the Tables.jl interface. This allows accesing the matrix via `Tables.rows` and @@ -58,7 +57,7 @@ function table(m::AbstractMatrix; header::Vector{Symbol}=[Symbol("Column$i") for end """ -Tables.matrix(table; transpose::Bool=false) + Tables.matrix(table; transpose::Bool=false) Materialize any table source input as a `Matrix`. If the table column types are not homogenous, they will be promoted to a common type in the materialized `Matrix`. 
Note that column names are diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 55bcd5a..7d44b3d 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -1,64 +1,89 @@ # Vector of NamedTuples -const RowTable{T} = Vector{T} where {T <: NamedTuple} +const RowTable{T} = AbstractVector{T} where {T <: NamedTuple} # interface implementation istable(::Type{<:RowTable}) = true rowaccess(::Type{<:RowTable}) = true -# a Vector of NamedTuple iterates `Row`s itself +# an AbstractVector of NamedTuple iterates `Row`s itself rows(x::RowTable) = x -schema(x::Vector{NamedTuple{names, types}}) where {names, types} = Schema(names, types) +schema(x::AbstractVector{NamedTuple{names, types}}) where {names, types} = Schema(names, types) materializer(x::RowTable) = rowtable # struct to transform `Row`s into NamedTuples -struct NamedTupleIterator{S, T} +struct NamedTupleIterator{schema, T} x::T end -Base.IteratorEltype(::Type{NamedTupleIterator{S, T}}) where {S, T} = S === Nothing ? Base.EltypeUnknown() : Base.HasEltype() -Base.eltype(::Type{NamedTupleIterator{Schema{names, T}, S}}) where {names, T, S} = NamedTuple{Base.map(Symbol, names), T} -Base.IteratorSize(::Type{NamedTupleIterator{S, T}}) where {S, T} = Base.IteratorSize(T) + +""" + Tables.namedtupleiterator(x) + +Pass any table input source and return a `NamedTuple` iterator +""" +function namedtupleiterator(x) + r = rows(x) + sch = schema(r) + return NamedTupleIterator{typeof(sch), typeof(r)}(r) +end + +namedtupleiterator(::Type{T}, x) where {T <: NamedTuple} = x +namedtupleiterator(T, x) = namedtupleiterator(x) + +Base.IteratorEltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = Base.HasEltype() +Base.IteratorEltype(::Type{NamedTupleIterator{Nothing, T}}) where {T} = Base.EltypeUnknown() +Base.eltype(::Type{NamedTupleIterator{Schema{names, types}, T}}) where {names, types, T} = NamedTuple{Base.map(Symbol, names), types} +Base.IteratorSize(::Type{NamedTupleIterator{sch, T}}) where {sch, T} 
= Base.IteratorSize(T) Base.length(nt::NamedTupleIterator) = length(nt.x) Base.size(nt::NamedTupleIterator) = (length(nt.x),) -function Base.iterate(rows::NamedTupleIterator{Schema{names, T}}, st=()) where {names, T} +@inline function Base.iterate(rows::NamedTupleIterator{Schema{names, T}}, st=()) where {names, T} if @generated - vals = Tuple(:(getproperty(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T)) + vals = Any[ :(getcolumn(row, $(fieldtype(T, i)), $i, $(quot(names[i])))) for i = 1:fieldcount(T) ] + ret = Expr(:new, :(NamedTuple{names, T}), vals...) return quote x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - return $(NamedTuple{Base.map(Symbol, names), T})(($(vals...),)), (st,) + return $ret, (st,) end else x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - return NamedTuple{Base.map(Symbol, names), T}(Tuple(getproperty(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,) + return NamedTuple{Base.map(Symbol, names), T}(Tuple(getcolumn(row, fieldtype(T, i), i, names[i]) for i = 1:fieldcount(T))), (st,) end end -# unknown schema case -function Base.iterate(rows::NamedTupleIterator{Nothing, T}, st=()) where {T} - x = iterate(rows.x, st...) +function Base.iterate(rows::NamedTupleIterator{Nothing}) + x = iterate(rows.x) x === nothing && return nothing row, st = x - names = Tuple(propertynames(row)) - return NamedTuple{Base.map(Symbol, names)}(Tuple(getproperty(row, nm) for nm in names)), (st,) + names = Tuple(columnnames(row)) + return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), (Val(names), (st,)) end -namedtupleiterator(::Type{T}, rows::S) where {T <: NamedTuple, S} = rows -namedtupleiterator(::Type{T}, rows::S) where {T, S} = NamedTupleIterator{typeof(schema(rows)), S}(rows) +function Base.iterate(rows::NamedTupleIterator{Nothing}, state::Tuple{Val{names}, T}) where {names, T} + x = iterate(rows.x, state[2]...) 
+ x === nothing && return nothing + row, st = x + return NamedTuple{names}(Tuple(getcolumn(row, nm) for nm in names)), (Val(names), (st,)) +end # sink function +""" + Tables.rowtable(x) => Vector{NamedTuple} + +Take any input table source, and produce a `Vector` of `NamedTuple`s, +also known as a "row table". A "row table" is a kind of default +table type of sorts, since it satisfies the Tables.jl row interface +naturally. +""" +function rowtable end + function rowtable(itr::T) where {T} r = rows(itr) return collect(namedtupleiterator(eltype(r), r)) end -function rowtable(rt::RowTable, itr::T) where {T} - r = rows(itr) - return append!(rt, namedtupleiterator(eltype(r), r)) -end - # NamedTuple of arrays of matching dimensionality const ColumnTable = NamedTuple{names, T} where {names, T <: NTuple{N, AbstractArray{S, D} where S}} where {N, D} rowcount(c::ColumnTable) = length(c) == 0 ? 0 : length(c[1]) @@ -68,27 +93,39 @@ istable(::Type{<:ColumnTable}) = true columnaccess(::Type{<:ColumnTable}) = true # a NamedTuple of AbstractVectors is itself a `Columns` object columns(x::ColumnTable) = x -schema(x::T) where {T <: ColumnTable} = Schema(names(T), _types(T)) -materializer(x::ColumnTable) = columntable _eltype(::Type{A}) where {A <: AbstractVector{T}} where {T} = T -Base.@pure function _types(::Type{NT}) where {NT <: NamedTuple{names, T}} where {names, T <: NTuple{N, AbstractVector{S} where S}} where {N} +Base.@pure function _eltypes(::Type{NT}) where {NT <: NamedTuple{names, T}} where {names, T <: NTuple{N, AbstractVector{S} where S}} where {N} return Tuple{Any[ _eltype(fieldtype(NT, i)) for i = 1:fieldcount(NT) ]...} end +schema(x::T) where {T <: ColumnTable} = Schema(names(T), _eltypes(T)) +materializer(x::ColumnTable) = columntable + getarray(x::AbstractArray) = x getarray(x) = collect(x) +""" + Tables.columntable(x) => NamedTuple of Vectors + +Takes any input table source `x` and returns a `NamedTuple` of `Vector`s, +also known as a "column table". 
A "column table" is a kind of default +table type of sorts, since it satisfies the Tables.jl column interface +naturally. +""" +function columntable end + function columntable(sch::Schema{names, types}, cols) where {names, types} if @generated - vals = Tuple(:(getarray(getproperty(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types)) + vals = Tuple(:(getarray(getcolumn(cols, $(fieldtype(types, i)), $i, $(quot(names[i]))))) for i = 1:fieldcount(types)) return :(NamedTuple{Base.map(Symbol, names)}(($(vals...),))) else - return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getproperty(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types))) + return NamedTuple{Base.map(Symbol, names)}(Tuple(getarray(getcolumn(cols, fieldtype(types, i), i, names[i])) for i = 1:fieldcount(types))) end end + # unknown schema case -columntable(::Nothing, cols) = NamedTuple{Tuple(Base.map(Symbol, propertynames(cols)))}(Tuple(getarray(col) for col in eachcolumn(cols))) +columntable(::Nothing, cols) = NamedTuple{Tuple(Base.map(Symbol, columnnames(cols)))}(Tuple(getarray(col) for col in eachcolumn(cols))) function columntable(itr::T) where {T} cols = columns(itr) @@ -96,18 +133,3 @@ function columntable(itr::T) where {T} return columntable(schema(cols), cols) end columntable(x::ColumnTable) = x - -function ctappend(ct1::NamedTuple{N1, T1}, ct2::NamedTuple{N2, T2}) where {N1, T1, N2, T2} - if @generated - appends = Expr(:block, Any[:(append!(ct1[$(quot(nm))], ct2[$(quot(nm))])) for nm in N1]...) 
- return quote - $appends - return ct1 - end - else - foreach(nm->append!(ct1[nm], ct2[nm]), N1) - return ct1 - end -end - -columntable(ct::ColumnTable, itr) = ctappend(ct, columntable(itr)) diff --git a/src/operations.jl b/src/operations.jl index 9a558cc..9d794fb 100644 --- a/src/operations.jl +++ b/src/operations.jl @@ -1,18 +1,36 @@ -struct TransformsRow{T, F} +struct TransformsRow{T, F} <: AbstractRow row::T funcs::F end -Base.getproperty(row::TransformsRow, nm::Symbol) = (getfunc(row, getfield(row, 2), nm))(getproperty(getfield(row, 1), nm)) -Base.propertynames(row::TransformsRow) = propertynames(getfield(row, 1)) +getrow(r::TransformsRow) = getfield(r, :row) +getfuncs(r::TransformsRow) = getfield(r, :funcs) + +getcolumn(row::TransformsRow, nm::Symbol) = (getfunc(row, getfuncs(row), nm))(getcolumn(getrow(row), nm)) +getcolumn(row::TransformsRow, i::Int) = (getfunc(row, getfuncs(row), i))(getcolumn(getrow(row), i)) +columnnames(row::TransformsRow) = columnnames(getrow(row)) struct Transforms{C, T, F} source::T funcs::F # NamedTuple of columnname=>transform function end -Base.propertynames(t::Transforms{true}) = propertynames(getfield(t, 1)) -Base.getproperty(t::Transforms{true}, nm::Symbol) = Base.map(getfunc(t, getfield(t, 2), nm), getproperty(getfield(t, 1), nm)) +columnnames(t::Transforms{true}) = columnnames(getfield(t, 1)) +getcolumn(t::Transforms{true}, nm::Symbol) = Base.map(getfunc(t, getfield(t, 2), nm), getcolumn(getfield(t, 1), nm)) +getcolumn(t::Transforms{true}, i::Int) = Base.map(getfunc(t, getfield(t, 2), i), getcolumn(getfield(t, 1), i)) +# for backwards compat +Base.propertynames(t::Transforms{true}) = columnnames(t) +Base.getproperty(t::Transforms{true}, nm::Symbol) = getcolumn(t, nm) + +""" + Tables.transform(source, funcs) => Tables.Transforms + source |> Tables.transform(funcs) => Tables.Transform + +***EXPERIMENTAL - May be moved or removed in a future release*** +Given any Tables.jl-compatible source, apply a series of transformation 
functions, for the columns specified in `funcs`. +The tranform functions can be a NamedTuple or Dict mapping column name (`String` or `Symbol` or `Integer` index) to Function. +""" +function transform end transform(funcs) = x->transform(x, funcs) transform(; kw...) = transform(kw.data) @@ -22,10 +40,15 @@ function transform(src::T, funcs::F) where {T, F} return Transforms{C, typeof(x), F}(x, funcs) end -getfunc(row, nt::NamedTuple, nm) = get(nt, nm, identity) -getfunc(row, d::Dict{String, <:Base.Callable}, nm) = get(d, String(nm), identity) -getfunc(row, d::Dict{Symbol, <:Base.Callable}, nm) = get(d, nm, identity) -getfunc(row, d::Dict{Int, <:Base.Callable}, nm) = get(d, findfirst(isequal(nm), propertynames(row)), identity) +getfunc(row, nt::NamedTuple, nm::Symbol) = get(nt, nm, identity) +getfunc(row, d::Dict{String, <:Base.Callable}, nm::Symbol) = get(d, String(nm), identity) +getfunc(row, d::Dict{Symbol, <:Base.Callable}, nm::Symbol) = get(d, nm, identity) +getfunc(row, d::Dict{Int, <:Base.Callable}, nm::Symbol) = get(d, findfirst(isequal(nm), columnnames(row)), identity) + +getfunc(row, nt::NamedTuple, i::Int) = get(nt, columnnames(row)[i], identity) +getfunc(row, d::Dict{String, <:Base.Callable}, i::Int) = get(d, String(columnnames(row)[i]), identity) +getfunc(row, d::Dict{Symbol, <:Base.Callable}, i::Int) = get(d, columnnames(row)[i], identity) +getfunc(row, d::Dict{Int, <:Base.Callable}, i::Int) = get(d, i, identity) istable(::Type{<:Transforms}) = true rowaccess(::Type{Transforms{C, T, F}}) where {C, T, F} = !C @@ -52,6 +75,15 @@ struct Select{T, columnaccess, names} source::T end +""" + Tables.select(source, columns...) => Tables.Select + source |> Tables.select(columns...) 
=> Tables.Select + +***EXPERIMENTAL - May be moved or removed in a future release*** +Create a lazy wrapper that satisfies the Tables.jl interface and keeps only the columns given by the columns arguments, which can be `String`s, `Symbol`s, or `Integer`s +""" +function select end + select(names::Symbol...) = x->select(x, names...) select(names::String...) = x->select(x, Base.map(Symbol, names)...) select(inds::Integer...) = x->select(x, Base.map(Int, inds)...) @@ -90,10 +122,14 @@ function schema(s::Select{T, columnaccess, names}) where {T, columnaccess, names end # columns: make Select property-accessible -Base.getproperty(s::Select{T, true, names}, nm::Symbol) where {T, names} = getproperty(getfield(s, 1), nm) -Base.propertynames(s::Select{T, true, names}) where {T, names} = namesubset(propertynames(getfield(s, 1)), names) +getcolumn(s::Select{T, true, names}, nm::Symbol) where {T, names} = getcolumn(getfield(s, 1), nm) +getcolumn(s::Select{T, true, names}, i::Int) where {T, names} = getcolumn(getfield(s, 1), i) +columnnames(s::Select{T, true, names}) where {T, names} = namesubset(columnnames(getfield(s, 1)), names) columnaccess(::Type{Select{T, C, names}}) where {T, C, names} = C columns(s::Select{T, true, names}) where {T, names} = s +# for backwards compat +Base.propertynames(s::Select{T, true, names}) where {T, names} = columnnames(s) +Base.getproperty(s::Select{T, true, names}, nm::Symbol) where {T, names} = getcolumn(s, nm) # rows: implement Iterator interface Base.IteratorSize(::Type{Select{T, false, names}}) where {T, names} = Base.IteratorSize(T) @@ -104,18 +140,20 @@ rowaccess(::Type{Select{T, columnaccess, names}}) where {T, columnaccess, names} rows(s::Select{T, false, names}) where {T, names} = s # we need to iterate a "row view" in case the underlying source has unknown schema -# to ensure each iterated row only has `names` propertynames -struct SelectRow{T, names} +# to ensure each iterated row only has `names` columnnames +struct SelectRow{T, 
names} <: AbstractRow row::T end -Base.getproperty(row::SelectRow, nm::Symbol) = getproperty(getfield(row, 1), nm) +getcolumn(row::SelectRow, nm::Symbol) = getcolumn(getfield(row, 1), nm) +getcolumn(row::SelectRow, i::Int) = getcolumn(getfield(row, 1), i) +getcolumn(row::SelectRow, ::Type{T}, i::Int, nm::Symbol) where {T} = getcolumn(getfield(row, 1), T, i, nm) getprops(row, nms::NTuple{N, Symbol}) where {N} = nms -getprops(row, inds::NTuple{N, Int}) where {N} = ntuple(i->propertynames(getfield(row, 1))[inds[i]], N) +getprops(row, inds::NTuple{N, Int}) where {N} = ntuple(i->columnnames(getfield(row, 1))[inds[i]], N) getprops(row, ::Tuple{}) = () -Base.propertynames(row::SelectRow{T, names}) where {T, names} = getprops(row, names) +columnnames(row::SelectRow{T, names}) where {T, names} = getprops(row, names) @inline function Base.iterate(s::Select{T, false, names}) where {T, names} state = iterate(getfield(s, 1)) diff --git a/src/tofromdatavalues.jl b/src/tofromdatavalues.jl index 8167d6e..bc29ec4 100644 --- a/src/tofromdatavalues.jl +++ b/src/tofromdatavalues.jl @@ -13,6 +13,14 @@ struct IteratorWrapper{S} x::S end +""" + Tables.nondatavaluerows(x) + +Takes any Queryverse-compatible `NamedTuple` iterator source and +converts to a Tables.jl-compatible `Row` iterator. Will automatically +unwrap any `DataValue`s, replacing `NA` with `missing`. +Useful for translating Query.jl results back to non-`DataValue`-based tables. 
+""" nondatavaluerows(x) = IteratorWrapper(IteratorInterfaceExtensions.getiterator(x)) Tables.istable(::Type{<:IteratorWrapper}) = true Tables.rowaccess(::Type{<:IteratorWrapper}) = true @@ -31,13 +39,13 @@ Base.IteratorSize(::Type{IteratorWrapper{S}}) where {S} = Base.IteratorSize(S) Base.length(rows::IteratorWrapper) = length(rows.x) Base.size(rows::IteratorWrapper) = size(rows.x) -@noinline invalidtable(::T, ::S) where {T, S} = throw(ArgumentError("'$T' iterates '$S' values, which don't satisfy the Tables.jl Row-iterator interface")) +@noinline invalidtable(::T, ::S) where {T, S} = throw(ArgumentError("'$T' iterates '$S' values, which doesn't satisfy the Tables.jl Row-iterator interface")) @inline function Base.iterate(rows::IteratorWrapper) x = iterate(rows.x) x === nothing && return nothing row, st = x - propertynames(row) === () && invalidtable(rows.x, row) + columnnames(row) === () && invalidtable(rows.x, row) return IteratorRow(row), st end @@ -48,58 +56,61 @@ end return IteratorRow(row), st end -struct IteratorRow{T} +struct IteratorRow{T} <: AbstractRow row::T end +getrow(r::IteratorRow) = getfield(r, :row) + unwrap(::Type{T}, x) where {T} = convert(T, x) unwrap(::Type{Any}, x) = x.hasvalue ? x.value : missing -function Base.getproperty(d::IteratorRow, ::Type{T}, col::Int, nm) where {T} - x = getproperty(getfield(d, 1), T, col, nm) - TT = typeof(x) - TTT = DataValueInterfaces.nondatavaluetype(TT) - return TT == TTT ? x : unwrap(TTT, x) -end -function Base.getproperty(d::IteratorRow, nm::Symbol) - x = getproperty(getfield(d, 1), nm) - TT = typeof(x) - TTT = DataValueInterfaces.nondatavaluetype(TT) - return TT == TTT ? x : unwrap(TTT, x) -end -function Base.getproperty(d::IteratorRow, nm::Int) - x = getproperty(getfield(d, 1), nm) - TT = typeof(x) - TTT = DataValueInterfaces.nondatavaluetype(TT) - return TT == TTT ? 
x : unwrap(TTT, x) -end -Base.propertynames(d::IteratorRow) = propertynames(getfield(d, 1)) +nondv(T) = DataValueInterfaces.nondatavaluetype(T) +undatavalue(x::T) where {T} = T == nondv(T) ? x : unwrap(nondv(T), x) + +getcolumn(r::IteratorRow, ::Type{T}, col::Int, nm::Symbol) where {T} = undatavalue(getcolumn(getrow(r), T, col, nm)) +getcolumn(r::IteratorRow, nm::Symbol) = undatavalue(getcolumn(getrow(r), nm)) +getcolumn(r::IteratorRow, i::Int) = undatavalue(getcolumn(getrow(r), i)) +columnnames(r::IteratorRow) = columnnames(getrow(r)) # DataValueRowIterator wraps a Row iterator and will wrap `Union{T, Missing}` typed fields in DataValues -struct DataValueRowIterator{NT, S} +struct DataValueRowIterator{NT, sch, S} x::S end +""" + Tables.datavaluerows(x) => NamedTuple iterator + +Takes any table input `x` and returns a `NamedTuple` iterator +that will replace missing values with `DataValue`-wrapped values; +this allows any table type to satisfy the TableTraits.jl +Queryverse integration interface by defining: + +``` +IteratorInterfaceExtensions.getiterator(x::MyTable) = Tables.datavaluerows(x) +``` +""" function datavaluerows(x) r = Tables.rows(x) s = Tables.schema(r) s === nothing && error("Schemaless sources cannot be passed to datavaluerows.") - return DataValueRowIterator{datavaluenamedtuple(s), typeof(r)}(r) + return DataValueRowIterator{datavaluenamedtuple(s), typeof(s), typeof(r)}(r) end -Base.eltype(rows::DataValueRowIterator{NT, S}) where {NT, S} = NT -Base.IteratorSize(::Type{DataValueRowIterator{NT, S}}) where {NT, S} = Base.IteratorSize(S) +Base.eltype(rows::DataValueRowIterator{NT}) where {NT} = NT +Base.IteratorSize(::Type{DataValueRowIterator{NT, sch, S}}) where {NT, sch, S} = Base.IteratorSize(S) Base.length(rows::DataValueRowIterator) = length(rows.x) Base.size(rows::DataValueRowIterator) = size(rows.x) -function Base.iterate(rows::DataValueRowIterator{NT, S}, st=()) where {NT <: NamedTuple{names}, S} where {names} +function 
Base.iterate(rows::DataValueRowIterator{NamedTuple{names, dtypes}, Schema{names, rtypes}, S}, st=()) where {names, dtypes, rtypes, S} if @generated - vals = Tuple(:(convert($(fieldtype(NT, i)), getproperty(row, $(DataValueInterfaces.nondatavaluetype(fieldtype(NT, i))), $i, $(Meta.QuoteNode(names[i]))))) for i = 1:fieldcount(NT)) + vals = Any[ :(convert($(fieldtype(dtypes, i)), getcolumn(row, $(fieldtype(rtypes, i)), $i, $(Meta.QuoteNode(names[i]))))) for i = 1:length(names) ] + ret = Expr(:new, :(NamedTuple{names, dtypes}), vals...) q = quote x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - return $NT(($(vals...),)), (st,) + return $ret, (st,) end # @show q return q @@ -107,6 +118,6 @@ function Base.iterate(rows::DataValueRowIterator{NT, S}, st=()) where {NT <: Nam x = iterate(rows.x, st...) x === nothing && return nothing row, st = x - return NT(Tuple(convert(fieldtype(NT, i), getproperty(row, DataValueInterfaces.nondatavaluetype(fieldtype(NT, i)), i, names[i])) for i = 1:fieldcount(NT))), (st,) + return NamedTuple{names, dtypes}(Tuple(convert(fieldtype(dtypes, i), getcolumn(row, fieldtype(rtypes, i), i, names[i])) for i = 1:length(names))), (st,) end end diff --git a/src/utils.jl b/src/utils.jl index 6d68628..8fbe343 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -22,23 +22,19 @@ Base.@pure function runlength(::Type{T}) where {T <: Tuple} return rle end -# generic fallback from getproperty w/ type information to basic symbol lookup -Base.getproperty(x, ::Type{T}, i::Int, nm) where {T} = getproperty(x, nm) -Base.getproperty(x::NamedTuple{names, types}, ::Type{T}, i::Int, nm::Symbol) where {names, types, T} = Core.getfield(x, i) - """ Tables.eachcolumn(f, sch, row, args...) 
Tables.eachcolumn(Tables.columns(x)) - The first definition takes a function `f`, table schema `sch`, a `row` type (that satisfies the `Tables.Row` interface), and any other `args...`; - it generates calls to get the value for each column in the row (`getproperty(row, nm)`) and then calls `f(val, col, name, args...)`, where `f` is the - user-provided function, `val` is a row's column value, `col` is the column index as an `Int`, and `name` is the row's column name as a `Symbol`. +The first definition takes a function `f`, table schema `sch`, a `row` type (that satisfies the `Row` interface), and any other `args...`; +it generates calls to get the value for each column in the row (`Tables.getcolumn(row, nm)`) and then calls `f(val, col, name, args...)`, where `f` is the +user-provided function, `val` is a row's column value, `col` is the column index as an `Int`, and `name` is the row's column name as a `Symbol`. - While the first definition applies to a `Row` object, the last definition simply returns a property-iterator over a `Columns` object. - For example, one could "collect" every column of a `Columns` object by doing: - ```julia - vectors = [collect(col) for col in Tables.eachcolumn(Tables.columns(x))] - ``` +While the first definition applies to a `Row` object, the last definition simply returns an AbstractColumn iterator for a `Columns` object. +For example, one could "collect" every column of a `Columns` object by doing: +```julia +vectors = [collect(col) for col in Tables.eachcolumn(Tables.columns(x))] +``` """ function eachcolumn end @@ -51,7 +47,7 @@ quot(x::Int) = x block = Expr(:block, Expr(:meta, :inline)) for i = 1:length(names) push!(block.args, quote - f(getproperty(row, $(fieldtype(types, i)), $i, $(quot(names[i]))), $i, $(quot(names[i])), args...) + f(getcolumn(row, $(fieldtype(types, i)), $i, $(quot(names[i]))), $i, $(quot(names[i])), args...) 
end) end return block @@ -63,7 +59,7 @@ quot(x::Int) = x for (T, len) in rle push!(block.args, quote for j = 0:$(len-1) - @inbounds f(getproperty(row, $T, $i + j, names[$i + j]), $i + j, names[$i + j], args...) + @inbounds f(getcolumn(row, $T, $i + j, names[$i + j]), $i + j, names[$i + j], args...) end end) i += len @@ -73,7 +69,7 @@ quot(x::Int) = x b = quote $(Expr(:meta, :inline)) for (i, nm) in enumerate(names) - f(getproperty(row, fieldtype(types, i), i, nm), i, nm, args...) + f(getcolumn(row, fieldtype(types, i), i, nm), i, nm, args...) end return end @@ -82,7 +78,7 @@ quot(x::Int) = x return b else for (i, nm) in enumerate(names) - f(getproperty(row, fieldtype(types, i), i, nm), i, nm, args...) + f(getcolumn(row, fieldtype(types, i), i, nm), i, nm, args...) end return end @@ -94,7 +90,7 @@ end block = Expr(:block, Expr(:meta, :inline)) for i = 1:length(names) push!(block.args, quote - f(getproperty(row, $(quot(names[i]))), $i, $(quot(names[i])), args...) + f(getcolumn(row, $(quot(names[i]))), $i, $(quot(names[i])), args...) end) end return block @@ -102,7 +98,7 @@ end b = quote $(Expr(:meta, :inline)) for (i, nm) in enumerate(names) - f(getproperty(row, nm), i, nm, args...) + f(getcolumn(row, nm), i, nm, args...) end return end @@ -110,13 +106,13 @@ end end else for (i, nm) in enumerate(names) - f(getproperty(row, nm), i, nm, args...) + f(getcolumn(row, nm), i, nm, args...) end return end end -# this are specialized `eachcolumn`s where we also want +# these are specialized `eachcolumn`s where we also want # the indexing of `columns` to be constant propagated, so it needs to be returned from the generated function @inline function eachcolumns(f::Base.Callable, sch::Schema{names, types}, row, columns, args...) 
where {names, types} if @generated @@ -124,7 +120,7 @@ end block = Expr(:block, Expr(:meta, :inline)) for i = 1:length(names) push!(block.args, quote - f(getproperty(row, $(fieldtype(types, i)), $i, $(quot(names[i]))), $i, $(quot(names[i])), columns[$i], args...) + f(getcolumn(row, $(fieldtype(types, i)), $i, $(quot(names[i]))), $i, $(quot(names[i])), columns[$i], args...) end) end return block @@ -136,7 +132,7 @@ end for (T, len) in rle push!(block.args, quote for j = 0:$(len-1) - @inbounds f(getproperty(row, $T, $i + j, names[$i + j]), $i + j, names[$i + j], columns[$i + j], args...) + @inbounds f(getcolumn(row, $T, $i + j, names[$i + j]), $i + j, names[$i + j], columns[$i + j], args...) end end) i += len @@ -146,7 +142,7 @@ end b = quote $(Expr(:meta, :inline)) for (i, nm) in enumerate(names) - f(getproperty(row, fieldtype(types, i), i, nm), i, nm, columns[i], args...) + f(getcolumn(row, fieldtype(types, i), i, nm), i, nm, columns[i], args...) end return end @@ -155,7 +151,7 @@ end return b else for (i, nm) in enumerate(names) - f(getproperty(row, fieldtype(types, i), i, nm), i, nm, columns[i], args...) + f(getcolumn(row, fieldtype(types, i), i, nm), i, nm, columns[i], args...) end return end @@ -167,7 +163,7 @@ end block = Expr(:block, Expr(:meta, :inline)) for i = 1:length(names) push!(block.args, quote - f(getproperty(row, $(quot(names[i]))), $i, $(quot(names[i])), columns[$i], args...) + f(getcolumn(row, $(quot(names[i]))), $i, $(quot(names[i])), columns[$i], args...) end) end return block @@ -175,7 +171,7 @@ end b = quote $(Expr(:meta, :inline)) for (i, nm) in enumerate(names) - f(getproperty(row, nm), i, nm, columns[i], args...) + f(getcolumn(row, nm), i, nm, columns[i], args...) end return end @@ -183,7 +179,7 @@ end end else for (i, nm) in enumerate(names) - f(getproperty(row, nm), i, nm, columns[i], args...) + f(getcolumn(row, nm), i, nm, columns[i], args...) 
end return end @@ -194,16 +190,18 @@ struct EachColumn{T} source::T end -Base.length(e::EachColumn) = length(propertynames(e.source)) +Base.length(e::EachColumn) = length(columnnames(e.source)) Base.IteratorEltype(::Type{<:EachColumn}) = Base.EltypeUnknown() -function Base.iterate(e::EachColumn, (idx, props)=(1, propertynames(e.source))) +function Base.iterate(e::EachColumn, (idx, props)=(1, columnnames(e.source))) idx > length(props) && return nothing - return getproperty(e.source, props[idx]), (idx + 1, props) + return getcolumn(e.source, props[idx]), (idx + 1, props) end eachcolumn(c) = EachColumn(c) +Base.@pure columnindex(::Schema{names, types}, name::Symbol) where {names, types} = columnindex(names, name) + "given names and a Symbol `name`, compute the index (1-based) of the name in names" Base.@pure function columnindex(names::Tuple{Vararg{Symbol}}, name::Symbol) i = 1 @@ -217,10 +215,10 @@ end Base.@pure columntype(::Schema{names, types}, name::Symbol) where {names, types} = columntype(names, types, name) "given tuple type and a Symbol `name`, compute the type of the name in the tuples types" -Base.@pure function columntype(names::Tuple{Vararg{Symbol}}, ::Type{T}, name::Symbol) where {T <: Tuple} +Base.@pure function columntype(names::Tuple{Vararg{Symbol}}, ::Type{types}, name::Symbol) where {types <: Tuple} i = 1 for nm in names - nm === name && return fieldtype(T, i) + nm === name && return fieldtype(types, i) i += 1 end return Union{} diff --git a/test/runtests.jl b/test/runtests.jl index fda6e18..2a49a7f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,6 +2,8 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx @testset "utils.jl" begin + @test getproperty((1, 2), 1) == 1 + NT = NamedTuple{(), Tuple{}} @test Tables.names(NT) === () @test Tables.types(NT) === Tuple{} @@ -28,8 +30,6 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx @test Tables.columntype(Tables.names(NT), 
Tables.types(NT), :i) == Union{} nt = (a=1, b=2, c=3) - @test getproperty(nt, Int, 1, :a) === 1 - NT = typeof(nt) output = [0, 0, 0] Tables.eachcolumn(Tables.Schema(Tables.names(NT), Tables.types(NT)), nt, output) do val, col, nm, out @@ -74,6 +74,8 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx @test Tables.schema(rows) == Tables.Schema((:a, :b), (Int, Int)) row = first(rows) @test row.a == 1 + @test Tables.getcolumn(row, :a) == 1 + @test Tables.getcolumn(row, 1) == 1 @test Tables.istable(rows) @test Tables.rowaccess(rows) @test Tables.rows(rows) === rows @@ -95,6 +97,8 @@ using Test, Tables, TableTraits, DataValues, QueryOperators, IteratorInterfaceEx c = Tables.CopiedColumns(nt) @test Tables.columns(c) === c @test Tables.materializer(c) == Tables.materializer(nt) + @test Tables.getcolumn(c, :a) == [1,2,3] + @test Tables.getcolumn(c, 1) == [1,2,3] @test_throws ArgumentError Tables.columntable([1,2,3]) @@ -136,14 +140,6 @@ end @test Tables.columntable(rtf) == nt @test Tables.buildcolumns(nothing, rtf) == nt - # append - nt2 = columntable(nt, rt) - @test Tables.rowcount(nt2) == 6 - @test Tables.schema(nt2) == Tables.Schema((:a, :b, :c), Tuple{Int, Float64, String}) - @test nt2 == (a = [1, 2, 3, 1, 2, 3], b = [4.0, 5.0, 6.0, 4.0, 5.0, 6.0], c = ["7", "8", "9", "7", "8", "9"]) - rt2 = rowtable(rt, nt) - @test length(rt2) == 9 - rt = [(a=1, b=4.0, c="7"), (a=2.0, b=missing, c="8"), (a=3, b=6.0, c="9")] @test Tables.istable(typeof(rt)) @test Tables.rowaccess(typeof(rt)) @@ -231,9 +227,13 @@ end @test Tables.columnaccess(typeof(mattbl)) @test Tables.columns(mattbl) === mattbl @test mattbl.Column1 == [1,2,3] + @test Tables.getcolumn(mattbl, :Column1) == [1,2,3] + @test Tables.getcolumn(mattbl, 1) == [1,2,3] matrow = first(mattbl) @test eltype(mattbl) == typeof(matrow) @test matrow.Column1 == 1 + @test Tables.getcolumn(matrow, :Column1) == 1 + @test Tables.getcolumn(matrow, 1) == 1 @test propertynames(mattbl) == propertynames(matrow) 
== [:Column1, :Column2, :Column3] end @@ -372,6 +372,8 @@ tran = ctable |> Tables.transform(C=Symbol) @test Tables.columns(tran) === tran @test IteratorInterfaceExtensions.isiterable(tran) @test typeof(IteratorInterfaceExtensions.getiterator(tran)) <: Tables.DataValueRowIterator +@test isequal(Tables.getcolumn(tran, :A), [1,missing,3]) +@test isequal(Tables.getcolumn(tran, 1), [1,missing,3]) tran2 = rtable |> Tables.transform(C=Symbol) @test Tables.istable(typeof(tran2)) @@ -385,6 +387,8 @@ trow = first(tran2) @test trow.A === 1 @test trow.B === 1.0 @test trow.C == :hey +@test Tables.getcolumn(trow, 1) == 1 +@test Tables.getcolumn(trow, :A) == 1 ctable2 = Tables.columntable(tran2) @test isequal(ctable2.A, ctable.A) @test ctable2.C == map(Symbol, ctable.C) @@ -455,6 +459,8 @@ sel = Tables.select(ctable) @test Tables.columnaccess(typeof(sel)) @test Tables.columns(sel) === sel @test propertynames(sel) == () +@test isequal(Tables.getcolumn(sel, 1), [1, missing, 3]) +@test isequal(Tables.getcolumn(sel, :A), [1, missing, 3]) @test Tables.columntable(sel) == NamedTuple() @test Tables.rowtable(sel) == NamedTuple{(), Tuple{}}[] @@ -514,6 +520,8 @@ sel = rtable |> Tables.select(1) @test isequal(Tables.rowtable(sel), [(A=1,), (A=missing,), (A=3,)]) srow = first(sel) @test propertynames(srow) == (:A,) +@test Tables.getcolumn(srow, 1) == 1 +@test Tables.getcolumn(srow, :A) == 1 table = ctable |> Tables.select(:A) |> Tables.columntable @test length(table) == 1 @@ -621,3 +629,115 @@ end # DataValue{Any} @test isequal(Tables.columntable(Tables.nondatavaluerows([(a=DataValue{Any}(), b=DataValue{Int}())])), (a = Any[missing], b = Union{Missing, Int64}[missing])) end + +@testset "AbstractDict" begin + + d = Dict(:a => 1, :b => missing, :c => "7") + n = (a=1, b=missing, c="7") + drt = [d, d, d] + rt = [n, n, n] + dct = Dict(:a => [1, 1, 1], :b => [missing, missing, missing], :c => ["7", "7", "7"]) + ct = (a = [1, 1, 1], b = [missing, missing, missing], c = ["7", "7", "7"]) + @test 
Tables.istable(drt) + @test Tables.rowaccess(drt) + @test Tables.rows(drt) === drt + @test Tables.schema(drt) === nothing + @test isequal(Tables.rowtable(drt), rt) + @test isequal(Tables.columntable(drt), ct) + + @test Tables.istable(dct) + @test Tables.columnaccess(dct) + @test Tables.columns(dct) === dct + @test Tables.schema(dct) == Tables.Schema((:a, :b, :c), Tuple{Int, Missing, String}) + @test isequal(Tables.rowtable(dct), rt) + @test isequal(Tables.columntable(dct), ct) + + # a Dict w/ scalar values isn't a table + @test_throws Exception Tables.columns(d) + @test_throws Exception Tables.rows(d) +end + +struct Row <: Tables.AbstractRow + a::Int + b::Union{Float64, Missing} + c::String +end + +Tables.getcolumn(r::Row, i::Int) = getfield(r, i) +Tables.getcolumn(r::Row, nm::Symbol) = getfield(r, nm) +Tables.getcolumn(r::Row, ::Type{T}, i::Int, nm::Symbol) where {T} = getfield(r, i) +Tables.columnnames(r::Row) = fieldnames(Row) + +@testset "AbstractRow" begin + + row = Row(1, missing, "hey") + row2 = Row(2, 3.14, "ho") + + @test Base.IteratorSize(typeof(row)) == Base.HasLength() + @test length(row) == 3 + @test firstindex(row) == 1 + @test lastindex(row) == 3 + @test isequal((row[1], row[2], row[3]), (1, missing, "hey")) + @test isequal((row[:a], row[:b], row[:c]), (1, missing, "hey")) + @test isequal((row.a, row.b, row.c), (1, missing, "hey")) + @test isequal((getproperty(row, 1), getproperty(row, 2), getproperty(row, 3)), (1, missing, "hey")) + @test propertynames(row) == (:a, :b, :c) + @test keys(row) == (:a, :b, :c) + @test isequal(values(row), [1, missing, "hey"]) + @test haskey(row, :a) + @test haskey(row, 1) + @test get(row, 1, 0) == get(row, :a, 0) == 1 + @test get(() -> 0, row, 1) == get(() -> 0, row, :a) == 1 + @test isequal(collect(row), [1, missing, "hey"]) + show(row) + + art = [row, row2] + ct = (a=[1, 2], b=[missing, 3.14], c=["hey", "ho"]) + @test Tables.istable(art) + @test Tables.rowaccess(art) + @test Tables.rows(art) === art + @test 
Tables.schema(art) === nothing + @test isequal(Tables.columntable(art), ct) + +end + +struct Columns <: Tables.AbstractColumns + a::Vector{Int} + b::Vector{Union{Float64, Missing}} + c::Vector{String} +end + +Tables.getcolumn(r::Columns, i::Int) = getfield(r, i) +Tables.getcolumn(r::Columns, nm::Symbol) = getfield(r, nm) +Tables.getcolumn(r::Columns, ::Type{T}, i::Int, nm::Symbol) where {T} = getfield(r, i) +Tables.columnnames(r::Columns) = fieldnames(Columns) + +@testset "AbstractColumns" begin + + col = Columns([1, 2], [missing, 3.14], ["hey", "ho"]) + + @test Base.IteratorSize(typeof(col)) == Base.HasLength() + @test length(col) == 3 + @test firstindex(col) == 1 + @test lastindex(col) == 3 + @test isequal((col[1], col[2], col[3]), ([1,2], [missing,3.14], ["hey","ho"])) + @test isequal((col[:a], col[:b], col[:c]), ([1,2], [missing,3.14], ["hey","ho"])) + @test isequal((col.a, col.b, col.c), ([1,2], [missing,3.14], ["hey","ho"])) + @test isequal((getproperty(col, 1), getproperty(col, 2), getproperty(col, 3)), ([1,2], [missing,3.14], ["hey","ho"])) + @test propertynames(col) == (:a, :b, :c) + @test keys(col) == (:a, :b, :c) + @test isequal(values(col), [[1,2], [missing,3.14], ["hey","ho"]]) + @test haskey(col, :a) + @test haskey(col, 1) + @test get(col, 1, 0) == get(col, :a, 0) == [1,2] + @test get(() -> 0, col, 1) == get(() -> 0, col, :a) == [1,2] + @test isequal(collect(col), [[1,2], [missing,3.14], ["hey","ho"]]) + show(col) + + ct = (a=[1, 2], b=[missing, 3.14], c=["hey", "ho"]) + @test Tables.istable(col) + @test Tables.columnaccess(col) + @test Tables.columns(col) === col + @test Tables.schema(col) === nothing + @test isequal(Tables.columntable(col), ct) +end