datafusion-contrib · wjones127 · Jan 30, 2022 · Feb 19, 2022 · Feb 20, 2022
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -28,11 +28,25 @@ edition = "2021"
 rust-version = "1.57"
 
 [dependencies]
-tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] }
+tokio = { version = "1.0", features = [
+    "macros",
+    "rt",
+    "rt-multi-thread",
+    "sync",
+] }
 rand = "0.7"
-pyo3 = { version = "0.14", features = ["extension-module", "abi3", "abi3-py36"] }
-datafusion = { version = "6.0.0", features = ["pyarrow"] }
+pyo3 = { version = "0.15", features = [
+    "extension-module",
+    "abi3",
+    "abi3-py36",
+] }
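+# Previously the crates.io release: datafusion = { version = "6.0.0", features = ["pyarrow"] }
+# NOTE(review): the git dependency below names no rev/tag/branch; consider pinning one.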
+datafusion = { git = "https://github.com/apache/arrow-datafusion.git", features = [
+    "pyarrow",
+] }
 uuid = { version = "0.8", features = ["v4"] }
+async-trait = "0.1.41"
+futures = "0.3"
 
 [lib]
 name = "_internal"

diff --git a/datafusion/tests/test_pyarrow_dataset.py b/datafusion/tests/test_pyarrow_dataset.py
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from datetime import date, timedelta
+from tempfile import mkdtemp
+
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pytest
+
+from datafusion import ExecutionContext
+
+
+@pytest.fixture
+def ctx():
+    return ExecutionContext()
+
+
+@pytest.fixture
+def table():
+    table = pa.table({
+        'z': pa.array([x / 3 for x in range(8)]),
+        'x': pa.array(['a'] * 3 + ['b'] * 5),
+        'y': pa.array([date(2020, 1, 1) + timedelta(days=x) for x in range(8)]),
+    })
+    return table
+
+
+@pytest.fixture
+def dataset(ctx, table):
+    tmp_dir = mkdtemp()
+
+    part = ds.partitioning(
+        pa.schema([('x', pa.string()), ('y', pa.date32())]),
+        flavor="hive",
+    )
+
+    ds.write_dataset(table, tmp_dir, partitioning=part, format="parquet")
+
+    dataset = ds.dataset(tmp_dir, partitioning=part)
+    ctx.register_dataset("ds", dataset)
+    return dataset
+
+
+def test_catalog(ctx, table, dataset):
+    catalog_table = ctx.catalog().database().table("ds")
+    assert catalog_table.kind == "physical"
+    assert catalog_table.schema == table.schema
+
+
+def test_scan_full(ctx, table, dataset):
+    result = ctx.sql("SELECT * FROM ds").collect()
+    assert pa.Table.from_batches(result) == table
+
+
+def test_dataset_filter(ctx: ExecutionContext, table: pa.Table, dataset):
+    result = ctx.sql("SELECT * FROM ds WHERE y BETWEEN 2020-01-02 AND 2020-01-06 AND x = 'b'").collect()
+    assert result.record_count() == 3
+
+
+def test_dataset_project(ctx: ExecutionContext, table: pa.Table, dataset):
+    result = ctx.sql("SELECT z, y FROM ds").collect()
+    assert result.col_names() == ['z', 'y']
diff --git a/src/context.rs b/src/context.rs
@@ -31,6 +31,7 @@ use datafusion::prelude::CsvReadOptions;
 
 use crate::catalog::PyCatalog;
 use crate::dataframe::PyDataFrame;
+use crate::dataset::PyArrowDatasetTable;
 use crate::errors::DataFusionError;
 use crate::udf::PyScalarUDF;
 use crate::utils::wait_for_future;
@@ -60,10 +61,7 @@ impl PyExecutionContext {
         Ok(PyDataFrame::new(df))
     }
 
-    fn create_dataframe(
-        &mut self,
-        partitions: Vec<Vec<RecordBatch>>,
-    ) -> PyResult<PyDataFrame> {
+    fn create_dataframe(&mut self, partitions: Vec<Vec<RecordBatch>>) -> PyResult<PyDataFrame> {
         let table = MemTable::try_new(partitions[0][0].schema(), partitions)
             .map_err(DataFusionError::from)?;
 
@@ -143,6 +141,13 @@ impl PyExecutionContext {
         Ok(())
     }
 
+    /// Register a `PyArrowDatasetTable` with the context under `name`.
+    ///
+    /// An error returned by `register_table` is converted through the crate's
+    /// `DataFusionError` and returned as the `PyResult` error.
+    fn register_dataset(&mut self, name: &str, dataset: PyArrowDatasetTable) -> PyResult<()> {
+        let provider = Arc::new(dataset);
+        match self.ctx.register_table(name, provider) {
+            Ok(_) => Ok(()),
+            Err(err) => Err(DataFusionError::from(err).into()),
+        }
+    }
+
     fn register_udf(&mut self, udf: PyScalarUDF) -> PyResult<()> {
         self.ctx.register_udf(udf.function);
         Ok(())

diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -19,8 +19,10 @@ use std::sync::Arc;
 
 use pyo3::prelude::*;
 
+use datafusion::arrow::array::StringArray;
 use datafusion::arrow::datatypes::Schema;
 use datafusion::arrow::pyarrow::PyArrowConvert;
+use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::arrow::util::pretty;
 use datafusion::dataframe::DataFrame;
 use datafusion::logical_plan::JoinType;
@@ -100,6 +102,33 @@ impl PyDataFrame {
         Ok(pretty::print_batches(&batches)?)
     }
 
+    /// Print the plan of this DataFrame to stdout.
+    ///
+    /// `verbose` and `analyze` are forwarded unchanged to `DataFrame::explain`;
+    /// both default to `false`. For every row of the explain result whose plan
+    /// type and plan are both non-null, the plan type is printed on one line
+    /// followed by the plan itself. Nothing is printed if the result is empty.
+    #[args(verbose = false, analyze = false)]
+    fn explain(&self, verbose: bool, analyze: bool, py: Python) -> PyResult<()> {
+        let df = self.df.explain(verbose, analyze)?;
+        let batches = wait_for_future(py, df.collect())?;
+
+        // Take the schema from the first batch; an empty result has nothing to
+        // print, and indexing `batches[0]` would panic in that case.
+        let schema = match batches.first() {
+            Some(first) => first.schema(),
+            None => return Ok(()),
+        };
+        let batch = RecordBatch::concat(&schema, &batches)?;
+
+        // Column 0 holds the plan type and column 1 the plan text; both are
+        // expected to be string arrays.
+        let plan_types = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .expect("Plan types is not a String anymore");
+        let plans = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .expect("Plan is not a String anymore");
+
+        for (plan_type, plan) in plan_types.iter().zip(plans.iter()) {
+            // Skip rows where either value is null.
+            if let (Some(plan_type), Some(plan)) = (plan_type, plan) {
+                println!("{}", plan_type);
+                println!("{}", plan);
+            }
+        }
+
+        Ok(())
+    }
+
     fn join(
         &self,
         right: PyDataFrame,