From 8476d3f48001d4bab24129a46b5f9878ad73c4b3 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 25 Apr 2021 14:02:03 +0000 Subject: [PATCH] WIP. --- Cargo.toml | 9 - README.md | 8 +- datafusion-examples/examples/csv_sql.rs | 2 +- datafusion-examples/examples/dataframe.rs | 2 +- datafusion-examples/examples/flight_client.rs | 2 +- datafusion-examples/examples/flight_server.rs | 2 +- datafusion-examples/examples/parquet_sql.rs | 2 +- datafusion/Cargo.toml | 4 +- datafusion/benches/aggregate_query_sql.rs | 2 +- datafusion/benches/filter_query_sql.rs | 2 +- datafusion/benches/math_query_sql.rs | 2 +- datafusion/benches/sort_limit_query_sql.rs | 7 +- datafusion/src/catalog/information_schema.rs | 212 ++++--- datafusion/src/dataframe.rs | 2 +- datafusion/src/datasource/csv.rs | 23 +- datafusion/src/datasource/datasource.rs | 4 +- datafusion/src/datasource/empty.rs | 4 +- datafusion/src/datasource/memory.rs | 56 +- datafusion/src/datasource/parquet.rs | 22 +- datafusion/src/error.rs | 16 +- datafusion/src/execution/context.rs | 153 +++-- datafusion/src/execution/dataframe_impl.rs | 6 +- datafusion/src/lib.rs | 25 +- datafusion/src/logical_plan/builder.rs | 10 +- datafusion/src/logical_plan/dfschema.rs | 6 +- datafusion/src/logical_plan/display.rs | 6 +- datafusion/src/logical_plan/expr.rs | 18 +- datafusion/src/logical_plan/plan.rs | 12 +- datafusion/src/optimizer/constant_folding.rs | 4 +- datafusion/src/optimizer/filter_push_down.rs | 11 +- .../src/optimizer/hash_build_probe_order.rs | 5 +- .../src/optimizer/projection_push_down.rs | 6 +- datafusion/src/optimizer/utils.rs | 4 +- .../src/physical_optimizer/repartition.rs | 14 +- datafusion/src/physical_plan/aggregates.rs | 2 +- .../src/physical_plan/array_expressions.rs | 100 +--- .../src/physical_plan/coalesce_batches.rs | 20 +- datafusion/src/physical_plan/common.rs | 7 +- datafusion/src/physical_plan/cross_join.rs | 13 +- .../src/physical_plan/crypto_expressions.rs | 16 +- datafusion/src/physical_plan/csv.rs | 79 ++- .../src/physical_plan/datetime_expressions.rs | 159 ++---- .../src/physical_plan/distinct_expressions.rs | 139 ++--- datafusion/src/physical_plan/empty.rs | 11 +- datafusion/src/physical_plan/explain.rs | 19 +- .../src/physical_plan/expressions/average.rs | 33 +- .../src/physical_plan/expressions/binary.rs | 511 ++++++----------- .../src/physical_plan/expressions/case.rs | 253 ++------- .../src/physical_plan/expressions/cast.rs | 96 +--- .../src/physical_plan/expressions/coercion.rs | 8 +- .../src/physical_plan/expressions/column.rs | 2 +- .../src/physical_plan/expressions/count.rs | 33 +- .../src/physical_plan/expressions/in_list.rs | 23 +- .../physical_plan/expressions/is_not_null.rs | 14 +- .../src/physical_plan/expressions/is_null.rs | 14 +- .../src/physical_plan/expressions/literal.rs | 6 +- .../src/physical_plan/expressions/min_max.rs | 131 ++--- .../src/physical_plan/expressions/mod.rs | 14 +- .../src/physical_plan/expressions/negative.rs | 25 +- .../src/physical_plan/expressions/not.rs | 10 +- .../src/physical_plan/expressions/nullif.rs | 90 +-- .../src/physical_plan/expressions/sum.rs | 39 +- .../src/physical_plan/expressions/try_cast.rs | 38 +- datafusion/src/physical_plan/filter.rs | 13 +- datafusion/src/physical_plan/functions.rs | 106 ++-- datafusion/src/physical_plan/group_scalar.rs | 18 +- .../src/physical_plan/hash_aggregate.rs | 213 +++---- datafusion/src/physical_plan/hash_join.rs | 229 +++----- datafusion/src/physical_plan/hash_utils.rs | 2 +- datafusion/src/physical_plan/limit.rs | 18 +- 
.../src/physical_plan/math_expressions.rs | 45 +- datafusion/src/physical_plan/memory.rs | 7 +- datafusion/src/physical_plan/merge.rs | 11 +- datafusion/src/physical_plan/mod.rs | 14 +- datafusion/src/physical_plan/parquet.rs | 428 ++++---------- datafusion/src/physical_plan/planner.rs | 22 +- datafusion/src/physical_plan/projection.rs | 9 +- .../src/physical_plan/regex_expressions.rs | 26 +- datafusion/src/physical_plan/repartition.rs | 44 +- datafusion/src/physical_plan/sort.rs | 73 +-- .../src/physical_plan/string_expressions.rs | 83 ++- datafusion/src/physical_plan/type_coercion.rs | 4 +- datafusion/src/physical_plan/udaf.rs | 2 +- datafusion/src/physical_plan/udf.rs | 2 +- .../src/physical_plan/unicode_expressions.rs | 116 ++-- datafusion/src/physical_plan/union.rs | 5 +- datafusion/src/scalar.rs | 537 +++++------------- datafusion/src/sql/planner.rs | 6 +- datafusion/src/test/exec.rs | 9 +- datafusion/src/test/mod.rs | 138 +++-- datafusion/tests/custom_sources.rs | 8 +- datafusion/tests/dataframe.rs | 4 +- datafusion/tests/provider_filter_pushdown.rs | 6 +- datafusion/tests/sql.rs | 24 +- datafusion/tests/user_defined_plan.rs | 2 +- 95 files changed, 1886 insertions(+), 2906 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 351523d74c36a..ebb3051f3ea05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,13 +18,4 @@ [workspace] members = [ "datafusion", - "datafusion-cli", - "datafusion-examples", - "benchmarks", - "ballista/rust/client", - "ballista/rust/core", - "ballista/rust/executor", - "ballista/rust/scheduler", ] - -exclude = ["python"] diff --git a/README.md b/README.md index f72c73bb80372..ec271b221798f 100644 --- a/README.md +++ b/README.md @@ -69,8 +69,8 @@ Run a SQL query against data stored in a CSV: ```rust use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; +use arrow2::util::pretty::print_batches; +use arrow2::record_batch::RecordBatch; #[tokio::main] async fn main() -> datafusion::error::Result<()> { @@ -92,8 +92,8 @@ Use the DataFrame API to process data stored in a CSV: ```rust use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; +use arrow2::util::pretty::print_batches; +use arrow2::record_batch::RecordBatch; #[tokio::main] async fn main() -> datafusion::error::Result<()> { diff --git a/datafusion-examples/examples/csv_sql.rs b/datafusion-examples/examples/csv_sql.rs index 76c87960d71d3..95a9afb035eda 100644 --- a/datafusion-examples/examples/csv_sql.rs +++ b/datafusion-examples/examples/csv_sql.rs @@ -27,7 +27,7 @@ async fn main() -> Result<()> { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = datafusion::arrow::util::test_util::arrow_test_data(); + let testdata = datafusion::crate::test::arrow_test_data(); // register csv file with the execution context ctx.register_csv( diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs index dcf6bc32be6b2..60147748d77a1 100644 --- a/datafusion-examples/examples/dataframe.rs +++ b/datafusion-examples/examples/dataframe.rs @@ -27,7 +27,7 @@ async fn main() -> Result<()> { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = datafusion::arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::crate::test::parquet_test_data(); let filename = &format!("{}/alltypes_plain.parquet", testdata); diff --git a/datafusion-examples/examples/flight_client.rs 
b/datafusion-examples/examples/flight_client.rs index 53347826ff89c..634652c6d9cb2 100644 --- a/datafusion-examples/examples/flight_client.rs +++ b/datafusion-examples/examples/flight_client.rs @@ -31,7 +31,7 @@ use arrow_flight::{FlightDescriptor, Ticket}; /// This example is run along-side the example `flight_server`. #[tokio::main] async fn main() -> Result<(), Box> { - let testdata = datafusion::arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::crate::test::parquet_test_data(); // Create Flight client let mut client = FlightServiceClient::connect("http://localhost:50051").await?; diff --git a/datafusion-examples/examples/flight_server.rs b/datafusion-examples/examples/flight_server.rs index 8496bcb18914f..06efb04f76e06 100644 --- a/datafusion-examples/examples/flight_server.rs +++ b/datafusion-examples/examples/flight_server.rs @@ -87,7 +87,7 @@ impl FlightService for FlightServiceImpl { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = datafusion::arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::crate::test::parquet_test_data(); // register parquet file with the execution context ctx.register_parquet( diff --git a/datafusion-examples/examples/parquet_sql.rs b/datafusion-examples/examples/parquet_sql.rs index f679b22ceb904..2a3becf9913f7 100644 --- a/datafusion-examples/examples/parquet_sql.rs +++ b/datafusion-examples/examples/parquet_sql.rs @@ -27,7 +27,7 @@ async fn main() -> Result<()> { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = datafusion::arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::crate::test::parquet_test_data(); // register parquet file with the execution context ctx.register_parquet( diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index a127076135f12..878446889762b 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -38,7 +38,6 @@ path = "src/lib.rs" [features] default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] -simd = ["arrow/simd"] crypto_expressions = ["md-5", "sha2"] regex_expressions = ["regex", "lazy_static"] unicode_expressions = ["unicode-segmentation"] @@ -46,8 +45,7 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98", features = ["arrow"] } +arrow2 = { git = "https://github.com/jorgecarleitao/arrow2", rev = "b645c6320a119b017fe147ea9edc59201284d4fa" } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" diff --git a/datafusion/benches/aggregate_query_sql.rs b/datafusion/benches/aggregate_query_sql.rs index 8f1a97e198d3b..6f10b03ad4784 100644 --- a/datafusion/benches/aggregate_query_sql.rs +++ b/datafusion/benches/aggregate_query_sql.rs @@ -26,7 +26,7 @@ use tokio::runtime::Runtime; extern crate arrow; extern crate datafusion; -use arrow::{ +use arrow2::{ array::Float32Array, array::Float64Array, array::StringArray, diff --git a/datafusion/benches/filter_query_sql.rs b/datafusion/benches/filter_query_sql.rs index 8600bdc88c6af..c5637b1441fb2 100644 --- a/datafusion/benches/filter_query_sql.rs +++ b/datafusion/benches/filter_query_sql.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::{ +use arrow2::{ array::{Float32Array, Float64Array}, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, diff --git a/datafusion/benches/math_query_sql.rs b/datafusion/benches/math_query_sql.rs index 1aaa2d3403cfd..71fc864a5439d 100644 --- a/datafusion/benches/math_query_sql.rs +++ b/datafusion/benches/math_query_sql.rs @@ -26,7 +26,7 @@ use tokio::runtime::Runtime; extern crate arrow; extern crate datafusion; -use arrow::{ +use arrow2::{ array::{Float32Array, Float64Array}, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, diff --git a/datafusion/benches/sort_limit_query_sql.rs b/datafusion/benches/sort_limit_query_sql.rs index be065f32e0090..50b71e3c0179f 100644 --- a/datafusion/benches/sort_limit_query_sql.rs +++ b/datafusion/benches/sort_limit_query_sql.rs @@ -21,10 +21,7 @@ use criterion::Criterion; use std::sync::{Arc, Mutex}; -extern crate arrow; -extern crate datafusion; - -use arrow::datatypes::{DataType, Field, Schema}; +use arrow2::datatypes::{DataType, Field, Schema}; use datafusion::datasource::{CsvFile, CsvReadOptions, MemTable}; use datafusion::execution::context::ExecutionContext; @@ -57,7 +54,7 @@ fn create_context() -> Arc> { Field::new("c13", DataType::Utf8, false), ])); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); // create CSV data source let csv = CsvFile::try_new( diff --git a/datafusion/src/catalog/information_schema.rs b/datafusion/src/catalog/information_schema.rs index fd7fcb4b901a6..6b588ac1398a6 100644 --- a/datafusion/src/catalog/information_schema.rs +++ b/datafusion/src/catalog/information_schema.rs @@ -21,8 +21,8 @@ use std::{any, sync::Arc}; -use arrow::{ - array::{StringBuilder, UInt64Builder}, +use arrow2::{ + array::*, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; @@ -189,23 +189,23 @@ impl SchemaProvider for InformationSchemaProvider { /// /// Columns are based on https://www.postgresql.org/docs/current/infoschema-columns.html struct InformationSchemaTablesBuilder { - catalog_names: StringBuilder, - schema_names: StringBuilder, - table_names: StringBuilder, - table_types: StringBuilder, + catalog_names: Utf8Primitive, + schema_names: Utf8Primitive, + table_names: Utf8Primitive, + table_types: Utf8Primitive, } impl InformationSchemaTablesBuilder { fn new() -> Self { - // StringBuilder requires providing an initial capacity, so + // Utf8Primitive requires providing an initial capacity, so // pick 10 here arbitrarily as this is not performance // critical code and the number of tables is unavailable here. let default_capacity = 10; Self { - catalog_names: StringBuilder::new(default_capacity), - schema_names: StringBuilder::new(default_capacity), - table_names: StringBuilder::new(default_capacity), - table_types: StringBuilder::new(default_capacity), + catalog_names: Utf8Primitive::with_capacity(default_capacity), + schema_names: Utf8Primitive::with_capacity(default_capacity), + table_names: Utf8Primitive::with_capacity(default_capacity), + table_types: Utf8Primitive::with_capacity(default_capacity), } } @@ -217,20 +217,24 @@ impl InformationSchemaTablesBuilder { table_type: TableType, ) { // Note: append_value is actually infallable. 
- self.catalog_names - .append_value(catalog_name.as_ref()) - .unwrap(); - self.schema_names - .append_value(schema_name.as_ref()) - .unwrap(); - self.table_names.append_value(table_name.as_ref()).unwrap(); - self.table_types - .append_value(match table_type { - TableType::Base => "BASE TABLE", - TableType::View => "VIEW", - TableType::Temporary => "LOCAL TEMPORARY", - }) - .unwrap(); + self.catalog_names.push(Some(&catalog_name.as_ref())); + self.schema_names.push(Some(&schema_name.as_ref())); + self.table_names.push(Some(&table_name.as_ref())); + self.table_types.push(Some(&match table_type { + TableType::Base => "BASE TABLE", + TableType::View => "VIEW", + TableType::Temporary => "LOCAL TEMPORARY", + })); + } + + fn add_system_table( + &mut self, + catalog_name: impl AsRef, + schema_name: impl AsRef, + table_name: impl AsRef, + ) { + // Note: append_value is actually infallable. + self.catalog_names.push(Some(&catalog_name.as_ref())); } } @@ -254,10 +258,10 @@ impl From for MemTable { let batch = RecordBatch::try_new( schema.clone(), vec![ - Arc::new(catalog_names.finish()), - Arc::new(schema_names.finish()), - Arc::new(table_names.finish()), - Arc::new(table_types.finish()), + Arc::new(catalog_names.to()), + Arc::new(schema_names.to()), + Arc::new(table_names.to()), + Arc::new(table_types.to()), ], ) .unwrap(); @@ -270,45 +274,45 @@ impl From for MemTable { /// /// Columns are based on https://www.postgresql.org/docs/current/infoschema-columns.html struct InformationSchemaColumnsBuilder { - catalog_names: StringBuilder, - schema_names: StringBuilder, - table_names: StringBuilder, - column_names: StringBuilder, - ordinal_positions: UInt64Builder, - column_defaults: StringBuilder, - is_nullables: StringBuilder, - data_types: StringBuilder, - character_maximum_lengths: UInt64Builder, - character_octet_lengths: UInt64Builder, - numeric_precisions: UInt64Builder, - numeric_precision_radixes: UInt64Builder, - numeric_scales: UInt64Builder, - datetime_precisions: UInt64Builder, - interval_types: StringBuilder, + catalog_names: Utf8Primitive, + schema_names: Utf8Primitive, + table_names: Utf8Primitive, + column_names: Utf8Primitive, + ordinal_positions: Primitive, + column_defaults: Utf8Primitive, + is_nullables: Utf8Primitive, + data_types: Utf8Primitive, + character_maximum_lengths: Primitive, + character_octet_lengths: Primitive, + numeric_precisions: Primitive, + numeric_precision_radixes: Primitive, + numeric_scales: Primitive, + datetime_precisions: Primitive, + interval_types: Utf8Primitive, } impl InformationSchemaColumnsBuilder { fn new() -> Self { - // StringBuilder requires providing an initial capacity, so + // Utf8Primitive requires providing an initial capacity, so // pick 10 here arbitrarily as this is not performance // critical code and the number of tables is unavailable here. 
let default_capacity = 10; Self { - catalog_names: StringBuilder::new(default_capacity), - schema_names: StringBuilder::new(default_capacity), - table_names: StringBuilder::new(default_capacity), - column_names: StringBuilder::new(default_capacity), - ordinal_positions: UInt64Builder::new(default_capacity), - column_defaults: StringBuilder::new(default_capacity), - is_nullables: StringBuilder::new(default_capacity), - data_types: StringBuilder::new(default_capacity), - character_maximum_lengths: UInt64Builder::new(default_capacity), - character_octet_lengths: UInt64Builder::new(default_capacity), - numeric_precisions: UInt64Builder::new(default_capacity), - numeric_precision_radixes: UInt64Builder::new(default_capacity), - numeric_scales: UInt64Builder::new(default_capacity), - datetime_precisions: UInt64Builder::new(default_capacity), - interval_types: StringBuilder::new(default_capacity), + catalog_names: Utf8Primitive::::with_capacity(default_capacity), + schema_names: Utf8Primitive::::with_capacity(default_capacity), + table_names: Utf8Primitive::::with_capacity(default_capacity), + column_names: Utf8Primitive::::with_capacity(default_capacity), + ordinal_positions: Primitive::::with_capacity(default_capacity), + column_defaults: Utf8Primitive::::with_capacity(default_capacity), + is_nullables: Utf8Primitive::::with_capacity(default_capacity), + data_types: Utf8Primitive::::with_capacity(default_capacity), + character_maximum_lengths: Primitive::::with_capacity(default_capacity), + character_octet_lengths: Primitive::::with_capacity(default_capacity), + numeric_precisions: Primitive::::with_capacity(default_capacity), + numeric_precision_radixes: Primitive::::with_capacity(default_capacity), + numeric_scales: Primitive::::with_capacity(default_capacity), + datetime_precisions: Primitive::::with_capacity(default_capacity), + interval_types: Utf8Primitive::::with_capacity(default_capacity), } } @@ -326,33 +330,24 @@ impl InformationSchemaColumnsBuilder { use DataType::*; // Note: append_value is actually infallable. - self.catalog_names - .append_value(catalog_name.as_ref()) - .unwrap(); - self.schema_names - .append_value(schema_name.as_ref()) - .unwrap(); - self.table_names.append_value(table_name.as_ref()).unwrap(); - - self.column_names - .append_value(column_name.as_ref()) - .unwrap(); - - self.ordinal_positions - .append_value(column_position as u64) - .unwrap(); + self.catalog_names.push(Some(&catalog_name.as_ref())); + self.schema_names.push(Some(&schema_name.as_ref())); + self.table_names.push(Some(&table_name.as_ref())); + + self.column_names.push(Some(&column_name.as_ref())); + + self.ordinal_positions.push(Some(&(column_position as u64))); // DataFusion does not support column default values, so null - self.column_defaults.append_null().unwrap(); + self.column_defaults.push(None); // "YES if the column is possibly nullable, NO if it is known not nullable. 
" let nullable_str = if is_nullable { "YES" } else { "NO" }; - self.is_nullables.append_value(nullable_str).unwrap(); + self.is_nullables.push(Some(&nullable_str)); // "System supplied type" --> Use debug format of the datatype self.data_types - .append_value(format!("{:?}", data_type)) - .unwrap(); + .push(Some(&format!("{:?}", data_type).as_ref())); // "If data_type identifies a character or bit string type, the // declared maximum length; null for all other data types or @@ -360,9 +355,7 @@ impl InformationSchemaColumnsBuilder { // // Arrow has no equivalent of VARCHAR(20), so we leave this as Null let max_chars = None; - self.character_maximum_lengths - .append_option(max_chars) - .unwrap(); + self.character_maximum_lengths.push(max_chars); // "Maximum length, in bytes, for binary data, character data, // or text and image data." @@ -371,9 +364,7 @@ impl InformationSchemaColumnsBuilder { LargeBinary | LargeUtf8 => Some(i64::MAX as u64), _ => None, }; - self.character_octet_lengths - .append_option(char_len) - .unwrap(); + self.character_octet_lengths.push(char_len.as_ref()); // numeric_precision: "If data_type identifies a numeric type, this column // contains the (declared or implicit) precision of the type @@ -414,16 +405,12 @@ impl InformationSchemaColumnsBuilder { _ => (None, None, None), }; - self.numeric_precisions - .append_option(numeric_precision) - .unwrap(); - self.numeric_precision_radixes - .append_option(numeric_radix) - .unwrap(); - self.numeric_scales.append_option(numeric_scale).unwrap(); + self.numeric_precisions.push(numeric_precision.as_ref()); + self.numeric_precision_radixes.push(numeric_radix.as_ref()); + self.numeric_scales.push(numeric_scale.as_ref()); - self.datetime_precisions.append_option(None).unwrap(); - self.interval_types.append_null().unwrap(); + self.datetime_precisions.push(None); + self.interval_types.push(None); } } @@ -464,26 +451,33 @@ impl From for MemTable { mut datetime_precisions, mut interval_types, } = value; + let ordinal_positions: UInt64Array = ordinal_positions.into(); + let character_maximum_lengths: UInt64Array = character_maximum_lengths.into(); + let character_octet_lengths: UInt64Array = character_octet_lengths.into(); + let numeric_precisions: UInt64Array = numeric_precisions.into(); + let numeric_precision_radixes: UInt64Array = numeric_precision_radixes.into(); + let numeric_scales: UInt64Array = numeric_scales.into(); + let datetime_precisions: UInt64Array = datetime_precisions.into(); let schema = Arc::new(schema); let batch = RecordBatch::try_new( schema.clone(), vec![ - Arc::new(catalog_names.finish()), - Arc::new(schema_names.finish()), - Arc::new(table_names.finish()), - Arc::new(column_names.finish()), - Arc::new(ordinal_positions.finish()), - Arc::new(column_defaults.finish()), - Arc::new(is_nullables.finish()), - Arc::new(data_types.finish()), - Arc::new(character_maximum_lengths.finish()), - Arc::new(character_octet_lengths.finish()), - Arc::new(numeric_precisions.finish()), - Arc::new(numeric_precision_radixes.finish()), - Arc::new(numeric_scales.finish()), - Arc::new(datetime_precisions.finish()), - Arc::new(interval_types.finish()), + Arc::new(catalog_names.to()), + Arc::new(schema_names.to()), + Arc::new(table_names.to()), + Arc::new(column_names.to()), + Arc::new(ordinal_positions), + Arc::new(column_defaults.to()), + Arc::new(is_nullables.to()), + Arc::new(data_types.to()), + Arc::new(character_maximum_lengths), + Arc::new(character_octet_lengths), + Arc::new(numeric_precisions), + 
Arc::new(numeric_precision_radixes), + Arc::new(numeric_scales), + Arc::new(datetime_precisions), + Arc::new(interval_types.to()), ], ) .unwrap(); diff --git a/datafusion/src/dataframe.rs b/datafusion/src/dataframe.rs index 9c7c2ef96d6be..c244b2d1d71ea 100644 --- a/datafusion/src/dataframe.rs +++ b/datafusion/src/dataframe.rs @@ -17,7 +17,7 @@ //! DataFrame API for building and executing query plans. -use crate::arrow::record_batch::RecordBatch; +use crate::arrow2::record_batch::RecordBatch; use crate::error::Result; use crate::logical_plan::{ DFSchema, Expr, FunctionRegistry, JoinType, LogicalPlan, Partitioning, diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 33cbeb12ca6bd..3e8f5726392f3 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -25,7 +25,7 @@ //! use datafusion::datasource::TableProvider; //! use datafusion::datasource::csv::{CsvFile, CsvReadOptions}; //! -//! let testdata = arrow::util::test_util::arrow_test_data(); +//! let testdata = crate::test::arrow_test_data(); //! let csvdata = CsvFile::try_new( //! &format!("{}/csv/aggregate_test_100.csv", testdata), //! CsvReadOptions::new().delimiter(b'|'), @@ -33,12 +33,14 @@ //! let schema = csvdata.schema(); //! ``` -use arrow::datatypes::SchemaRef; use std::any::Any; use std::io::{Read, Seek}; use std::string::String; use std::sync::{Arc, Mutex}; +use arrow2::datatypes::Schema; +use arrow2::io::csv::read as csv_read; + use crate::datasource::datasource::Statistics; use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; @@ -47,6 +49,7 @@ use crate::physical_plan::csv::CsvExec; pub use crate::physical_plan::csv::CsvReadOptions; use crate::physical_plan::{common, ExecutionPlan}; +type SchemaRef = Arc; enum Source { /// Path to a single CSV file or a directory containing one of more CSV files Path(String), @@ -119,21 +122,25 @@ impl CsvFile { /// Attempt to initialize a `CsvRead` from a reader impls `Seek`. The schema can be inferred automatically. 
pub fn try_new_from_reader_infer_schema( - mut reader: R, + reader: R, options: CsvReadOptions, ) -> Result { + let mut reader = csv_read::ReaderBuilder::new() + .delimiter(options.delimiter) + .from_reader(reader); let schema = Arc::new(match options.schema { Some(s) => s.clone(), None => { - let (schema, _) = arrow::csv::reader::infer_file_schema( + let schema = csv_read::infer_schema( &mut reader, - options.delimiter, Some(options.schema_infer_max_records), options.has_header, + &csv_read::infer, )?; schema } }); + let reader = reader.into_inner(); Ok(Self { source: Source::Reader(Mutex::new(Some(Box::new(reader)))), @@ -228,9 +235,11 @@ mod tests { use super::*; use crate::prelude::*; + use arrow2::array::*; + #[tokio::test] async fn csv_file_from_reader() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let buf = std::fs::read(path).unwrap(); @@ -249,7 +258,7 @@ mod tests { batches[0] .column(0) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap() .value(0), 5 diff --git a/datafusion/src/datasource/datasource.rs b/datafusion/src/datasource/datasource.rs index 0349a49e491ba..0c8065ae6f074 100644 --- a/datafusion/src/datasource/datasource.rs +++ b/datafusion/src/datasource/datasource.rs @@ -23,7 +23,9 @@ use std::sync::Arc; use crate::error::Result; use crate::logical_plan::Expr; use crate::physical_plan::ExecutionPlan; -use crate::{arrow::datatypes::SchemaRef, scalar::ScalarValue}; +use crate::{arrow2::datatypes::Schema, scalar::ScalarValue}; + +type SchemaRef = Arc; /// This table statistics are estimates. /// It can not be used directly in the precise compute diff --git a/datafusion/src/datasource/empty.rs b/datafusion/src/datasource/empty.rs index e6140cdb8de69..e0033f29df2e1 100644 --- a/datafusion/src/datasource/empty.rs +++ b/datafusion/src/datasource/empty.rs @@ -20,7 +20,9 @@ use std::any::Any; use std::sync::Arc; -use arrow::datatypes::*; +use arrow2::datatypes::*; + +type SchemaRef = Arc; use crate::datasource::datasource::Statistics; use crate::datasource::TableProvider; diff --git a/datafusion/src/datasource/memory.rs b/datafusion/src/datasource/memory.rs index af40480870287..02488aee112b1 100644 --- a/datafusion/src/datasource/memory.rs +++ b/datafusion/src/datasource/memory.rs @@ -24,8 +24,10 @@ use log::debug; use std::any::Any; use std::sync::Arc; -use arrow::datatypes::{Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::{Field, Schema}; +use arrow2::record_batch::RecordBatch; + +type SchemaRef = Arc; use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; @@ -91,7 +93,7 @@ impl MemTable { if partitions .iter() .flatten() - .all(|batches| schema.contains(&batches.schema())) + .all(|batches| schema.as_ref() == batches.schema().as_ref()) { let statistics = calculate_statistics(&schema, &partitions); debug!("MemTable statistics: {:?}", statistics); @@ -221,8 +223,8 @@ impl TableProvider for MemTable { #[cfg(test)] mod tests { use super::*; - use arrow::array::Int32Array; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow2::array::Int32Array; + use arrow2::datatypes::{DataType, Field, Schema}; use futures::StreamExt; use std::collections::HashMap; @@ -238,10 +240,10 @@ mod tests { let batch = RecordBatch::try_new( schema.clone(), vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![4, 5, 6])), 
- Arc::new(Int32Array::from(vec![7, 8, 9])), - Arc::new(Int32Array::from(vec![None, None, Some(9)])), + Arc::new(Int32Array::from_slice(&[1, 2, 3])), + Arc::new(Int32Array::from_slice(&[4, 5, 6])), + Arc::new(Int32Array::from_slice(&[7, 8, 9])), + Arc::new(Int32Array::from(&[None, None, Some(9)])), ], )?; @@ -301,9 +303,9 @@ mod tests { let batch = RecordBatch::try_new( schema.clone(), vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![4, 5, 6])), - Arc::new(Int32Array::from(vec![7, 8, 9])), + Arc::new(Int32Array::from_slice(&[1, 2, 3])), + Arc::new(Int32Array::from_slice(&[4, 5, 6])), + Arc::new(Int32Array::from_slice(&[7, 8, 9])), ], )?; @@ -329,9 +331,9 @@ mod tests { let batch = RecordBatch::try_new( schema.clone(), vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![4, 5, 6])), - Arc::new(Int32Array::from(vec![7, 8, 9])), + Arc::new(Int32Array::from_slice(&[1, 2, 3])), + Arc::new(Int32Array::from_slice(&[4, 5, 6])), + Arc::new(Int32Array::from_slice(&[7, 8, 9])), ], )?; @@ -366,9 +368,9 @@ mod tests { let batch = RecordBatch::try_new( schema1, vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![4, 5, 6])), - Arc::new(Int32Array::from(vec![7, 8, 9])), + Arc::new(Int32Array::from_slice(&[1, 2, 3])), + Arc::new(Int32Array::from_slice(&[4, 5, 6])), + Arc::new(Int32Array::from_slice(&[7, 8, 9])), ], )?; @@ -399,8 +401,8 @@ mod tests { let batch = RecordBatch::try_new( schema1, vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![7, 5, 9])), + Arc::new(Int32Array::from_slice(&[1, 2, 3])), + Arc::new(Int32Array::from_slice(&[7, 5, 9])), ], )?; @@ -420,7 +422,7 @@ mod tests { let mut metadata = HashMap::new(); metadata.insert("foo".to_string(), "bar".to_string()); - let schema1 = Schema::new_with_metadata( + let schema1 = Schema::new_from( vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), @@ -442,18 +444,18 @@ mod tests { let batch1 = RecordBatch::try_new( Arc::new(schema1), vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![4, 5, 6])), - Arc::new(Int32Array::from(vec![7, 8, 9])), + Arc::new(Int32Array::from_slice(&[1, 2, 3])), + Arc::new(Int32Array::from_slice(&[4, 5, 6])), + Arc::new(Int32Array::from_slice(&[7, 8, 9])), ], )?; let batch2 = RecordBatch::try_new( Arc::new(schema2), vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![4, 5, 6])), - Arc::new(Int32Array::from(vec![7, 8, 9])), + Arc::new(Int32Array::from_slice(&[1, 2, 3])), + Arc::new(Int32Array::from_slice(&[4, 5, 6])), + Arc::new(Int32Array::from_slice(&[7, 8, 9])), ], )?; diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index 30e47df5f6491..7bea9042458b8 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::string::String; use std::sync::Arc; -use arrow::datatypes::*; +use arrow2::datatypes::*; use crate::datasource::datasource::Statistics; use crate::datasource::TableProvider; @@ -32,6 +32,8 @@ use crate::physical_plan::ExecutionPlan; use super::datasource::TableProviderFilterPushDown; +type SchemaRef = Arc; + /// Table-based representation of a `ParquetFile`. pub struct ParquetTable { path: String, @@ -43,7 +45,7 @@ pub struct ParquetTable { impl ParquetTable { /// Attempt to initialize a new `ParquetTable` from a file path. 
pub fn try_new(path: &str, max_concurrency: usize) -> Result { - let parquet_exec = ParquetExec::try_from_path(path, None, None, 0, 1, None)?; + let parquet_exec = ParquetExec::try_from_path(path, None, None, 1, None)?; let schema = parquet_exec.schema(); Ok(Self { path: path.to_string(), @@ -90,9 +92,6 @@ impl TableProvider for ParquetTable { &self.path, projection.clone(), predicate, - limit - .map(|l| std::cmp::min(l, batch_size)) - .unwrap_or(batch_size), self.max_concurrency, limit, )?)) @@ -106,11 +105,8 @@ impl TableProvider for ParquetTable { #[cfg(test)] mod tests { use super::*; - use arrow::array::{ - BinaryArray, BooleanArray, Float32Array, Float64Array, Int32Array, - TimestampNanosecondArray, - }; - use arrow::record_batch::RecordBatch; + use arrow2::array::*; + use arrow2::record_batch::RecordBatch; use futures::StreamExt; #[tokio::test] @@ -234,7 +230,7 @@ mod tests { let array = batch .column(0) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); let mut values: Vec = vec![]; for i in 0..batch.num_rows() { @@ -312,7 +308,7 @@ mod tests { let array = batch .column(0) .as_any() - .downcast_ref::() + .downcast_ref::>() .unwrap(); let mut values: Vec<&str> = vec![]; for i in 0..batch.num_rows() { @@ -328,7 +324,7 @@ mod tests { } fn load_table(name: &str) -> Result> { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = crate::test::parquet_test_data(); let filename = format!("{}/{}", testdata, name); let table = ParquetTable::try_new(&filename, 2)?; Ok(Arc::new(table)) diff --git a/datafusion/src/error.rs b/datafusion/src/error.rs index 903faeabf6954..b7b656733fde0 100644 --- a/datafusion/src/error.rs +++ b/datafusion/src/error.rs @@ -22,8 +22,7 @@ use std::fmt::{Display, Formatter}; use std::io; use std::result; -use arrow::error::ArrowError; -use parquet::errors::ParquetError; +use arrow2::error::ArrowError; use sqlparser::parser::ParserError; /// Result type for operations that could result in an [DataFusionError] @@ -35,8 +34,6 @@ pub type Result = result::Result; pub enum DataFusionError { /// Error returned by arrow. ArrowError(ArrowError), - /// Wraps an error from the Parquet crate - ParquetError(ParquetError), /// Error associated to I/O operations and associated traits. IoError(io::Error), /// Error returned when SQL is syntactically incorrect. @@ -59,7 +56,7 @@ pub enum DataFusionError { } impl DataFusionError { - /// Wraps this [DataFusionError] as an [arrow::error::ArrowError]. + /// Wraps this [DataFusionError] as an [arrow2::error::ArrowError]. 
pub fn into_arrow_external_error(self) -> ArrowError { ArrowError::from_external_error(Box::new(self)) } @@ -77,12 +74,6 @@ impl From for DataFusionError { } } -impl From for DataFusionError { - fn from(e: ParquetError) -> Self { - DataFusionError::ParquetError(e) - } -} - impl From for DataFusionError { fn from(e: ParserError) -> Self { DataFusionError::SQL(e) @@ -93,9 +84,6 @@ impl Display for DataFusionError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match *self { DataFusionError::ArrowError(ref desc) => write!(f, "Arrow error: {}", desc), - DataFusionError::ParquetError(ref desc) => { - write!(f, "Parquet error: {}", desc) - } DataFusionError::IoError(ref desc) => write!(f, "IO error: {}", desc), DataFusionError::SQL(ref desc) => { write!(f, "SQL error: {:?}", desc) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 272e75acba6fd..b7b51ca3b71bd 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -39,7 +39,9 @@ use std::{ use futures::{StreamExt, TryStreamExt}; use tokio::task::{self, JoinHandle}; -use arrow::csv; +use arrow2::error::{ArrowError, Result as ArrowResult}; +use arrow2::io::csv::write as csv_write; +use arrow2::io::parquet::write; use crate::catalog::{ catalog::{CatalogProvider, MemoryCatalogProvider}, @@ -75,8 +77,6 @@ use crate::sql::{ use crate::variable::{VarProvider, VarType}; use crate::{dataframe::DataFrame, physical_plan::udaf::AggregateUDF}; use chrono::{DateTime, Utc}; -use parquet::arrow::ArrowWriter; -use parquet::file::properties::WriterProperties; /// ExecutionContext is the main interface for executing queries with DataFusion. The context /// provides the following functionality: @@ -500,12 +500,21 @@ impl ExecutionContext { let plan = plan.clone(); let filename = format!("part-{}.csv", i); let path = fs_path.join(&filename); - let file = fs::File::create(path)?; - let mut writer = csv::Writer::new(file); + + let mut writer = csv_write::WriterBuilder::new() + .from_path(path) + .map_err(ArrowError::from)?; + + csv_write::write_header(&mut writer, plan.schema().as_ref())?; + + let options = csv_write::SerializeOptions::default(); + let stream = plan.execute(i).await?; let handle: JoinHandle> = task::spawn(async move { stream - .map(|batch| writer.write(&batch?)) + .map(|batch| { + csv_write::write_batch(&mut writer, &batch?, &options) + }) .try_collect() .await .map_err(DataFusionError::from) @@ -527,7 +536,6 @@ impl ExecutionContext { &self, plan: Arc, path: String, - writer_properties: Option, ) -> Result<()> { // create directory to contain the Parquet files (one per partition) let fs_path = Path::new(&path); @@ -536,22 +544,45 @@ impl ExecutionContext { let mut tasks = vec![]; for i in 0..plan.output_partitioning().partition_count() { let plan = plan.clone(); + let schema = plan.schema(); let filename = format!("part-{}.parquet", i); let path = fs_path.join(&filename); - let file = fs::File::create(path)?; - let mut writer = ArrowWriter::try_new( - file.try_clone().unwrap(), - plan.schema(), - writer_properties.clone(), - )?; + + let mut file = fs::File::create(path)?; let stream = plan.execute(i).await?; + + let compression = write::CompressionCodec::Uncompressed; + let handle: JoinHandle> = task::spawn(async move { - stream - .map(|batch| writer.write(&batch?)) - .try_collect() - .await - .map_err(DataFusionError::from)?; - writer.close().map_err(DataFusionError::from).map(|_| ()) + let parquet_types = schema + .fields() + .iter() + 
.map(write::to_parquet_type) + .collect::>>()?; + + // do not do this. + let batches = + crate::physical_plan::common::collect(stream).await?; + + let groups = batches.iter().map(|batch| { + Ok(batch.columns().iter().zip(parquet_types.iter()).map( + |(array, type_)| { + Ok(std::iter::once(write::array_to_page( + array.as_ref(), + type_, + compression, + ))) + }, + )) + }); + + Ok(write::write_file( + &mut file, + groups, + schema.as_ref(), + compression, + None, + )?) }); tasks.push(handle); } @@ -905,20 +936,18 @@ mod tests { logical_plan::create_udaf, physical_plan::expressions::AvgAccumulator, }; - use arrow::array::{ - Array, ArrayRef, BinaryArray, DictionaryArray, Float64Array, Int32Array, - Int64Array, LargeBinaryArray, LargeStringArray, StringArray, - TimestampNanosecondArray, - }; - use arrow::compute::add; - use arrow::datatypes::*; - use arrow::record_batch::RecordBatch; + use arrow2::array::*; + use arrow2::datatypes::*; + use arrow2::record_batch::RecordBatch; use std::fs::File; use std::thread::{self, JoinHandle}; use std::{io::prelude::*, sync::Mutex}; use tempfile::TempDir; use test::*; + type ArrayRef = Arc; + type SchemaRef = Arc; + #[tokio::test] async fn parallel_projection() -> Result<()> { let partition_count = 4; @@ -1150,9 +1179,9 @@ mod tests { let partitions = vec![vec![RecordBatch::try_new( schema.clone(), vec![ - Arc::new(Int32Array::from(vec![1, 10, 10, 100])), - Arc::new(Int32Array::from(vec![2, 12, 12, 120])), - Arc::new(Int32Array::from(vec![3, 12, 12, 120])), + Arc::new(Int32Array::from_slice(&[1, 10, 10, 100])), + Arc::new(Int32Array::from_slice(&[2, 12, 12, 120])), + Arc::new(Int32Array::from_slice(&[3, 12, 12, 120])), ], )?]]; @@ -1710,13 +1739,13 @@ mod tests { // C, 1 // A, 1 - let str_array: LargeStringArray = vec!["A", "B", "A", "A", "C", "A"] + let str_array: Utf8Array = vec!["A", "B", "A", "A", "C", "A"] .into_iter() .map(Some) .collect(); let str_array = Arc::new(str_array); - let val_array: Int64Array = vec![1, 2, 2, 4, 1, 1].into(); + let val_array = Int64Array::from_slice(&[1, 2, 2, 4, 1, 1]); let val_array = Arc::new(val_array); let schema = Arc::new(Schema::new(vec![ @@ -1750,7 +1779,7 @@ mod tests { #[tokio::test] async fn group_by_dictionary() { - async fn run_test_case() { + async fn run_test_case() { let mut ctx = ExecutionContext::new(); // input data looks like: @@ -1761,11 +1790,11 @@ mod tests { // C, 1 // A, 1 - let dict_array: DictionaryArray = - vec!["A", "B", "A", "A", "C", "A"].into_iter().collect(); + let dict_array: DictionaryPrimitive, _> = + vec!["A", "B", "A", "A", "C", "A"].iter().collect(); let dict_array = Arc::new(dict_array); - let val_array: Int64Array = vec![1, 2, 2, 4, 1, 1].into(); + let val_array = Int64Array::from_slice(&[1, 2, 2, 4, 1, 1]); let val_array = Arc::new(val_array); let schema = Arc::new(Schema::new(vec![ @@ -1834,14 +1863,14 @@ mod tests { assert_batches_sorted_eq!(expected, &results); } - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; } async fn run_count_distinct_integers_aggregated_scenario( @@ -2046,7 +2075,7 @@ mod tests { vec![test::make_partition(4)], vec![test::make_partition(5)], ]; - let schema = 
partitions[0][0].schema(); + let schema = partitions[0][0].schema().clone(); let provider = Arc::new(MemTable::try_new(schema, partitions).unwrap()); ctx.register_table("t", provider).unwrap(); @@ -2384,8 +2413,8 @@ mod tests { let batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![ - Arc::new(Int32Array::from(vec![1, 10, 10, 100])), - Arc::new(Int32Array::from(vec![2, 12, 12, 120])), + Arc::new(Int32Array::from_slice(&[1, 10, 10, 100])), + Arc::new(Int32Array::from_slice(&[2, 12, 12, 120])), ], )?; @@ -2403,7 +2432,7 @@ mod tests { .as_any() .downcast_ref::() .expect("cast failed"); - Ok(Arc::new(add(l, r)?) as ArrayRef) + Ok(Arc::new(add::add(l, r)?) as ArrayRef) }; let myfunc = make_scalar_function(myfunc); @@ -2483,11 +2512,11 @@ mod tests { let batch1 = RecordBatch::try_new( Arc::new(schema.clone()), - vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + vec![Arc::new(Int32Array::from_slice(&[1, 2, 3]))], )?; let batch2 = RecordBatch::try_new( Arc::new(schema.clone()), - vec![Arc::new(Int32Array::from(vec![4, 5]))], + vec![Arc::new(Int32Array::from_slice(&[4, 5]))], )?; let mut ctx = ExecutionContext::new(); @@ -2520,11 +2549,11 @@ mod tests { let batch1 = RecordBatch::try_new( Arc::new(schema.clone()), - vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + vec![Arc::new(Int32Array::from_slice(&[1, 2, 3]))], )?; let batch2 = RecordBatch::try_new( Arc::new(schema.clone()), - vec![Arc::new(Int32Array::from(vec![4, 5]))], + vec![Arc::new(Int32Array::from_slice(&[4, 5]))], )?; let mut ctx = ExecutionContext::new(); @@ -2986,16 +3015,16 @@ mod tests { let batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![ - Arc::new(Int32Array::from(vec![1])), - Arc::new(Float64Array::from(vec![1.0])), - Arc::new(StringArray::from(vec![Some("foo")])), - Arc::new(LargeStringArray::from(vec![Some("bar")])), - Arc::new(BinaryArray::from(vec![b"foo" as &[u8]])), - Arc::new(LargeBinaryArray::from(vec![b"foo" as &[u8]])), - Arc::new(TimestampNanosecondArray::from_opt_vec( - vec![Some(123)], - None, - )), + Arc::new(Int32Array::from_slice(&[1])), + Arc::new(Float64Array::from_slice(&[1.0])), + Arc::new(Utf8Array::::from(vec![Some("foo")])), + Arc::new(Utf8Array::::from(vec![Some("bar")])), + Arc::new(BinaryArray::::from_slice(&[b"foo" as &[u8]])), + Arc::new(BinaryArray::::from_slice(&[b"foo" as &[u8]])), + Arc::new( + Primitive::::from(vec![Some(123)]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, None)), + ), ], ) .unwrap(); diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index fdc75f92f2e75..59af408b38592 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -19,7 +19,7 @@ use std::sync::{Arc, Mutex}; -use crate::arrow::record_batch::RecordBatch; +use crate::arrow2::record_batch::RecordBatch; use crate::error::Result; use crate::execution::context::{ExecutionContext, ExecutionContextState}; use crate::logical_plan::{ @@ -184,7 +184,7 @@ mod tests { use crate::{assert_batches_sorted_eq, execution::context::ExecutionContext}; use crate::{datasource::csv::CsvReadOptions, physical_plan::ColumnarValue}; use crate::{physical_plan::functions::ScalarFunctionImplementation, test}; - use arrow::datatypes::DataType; + use arrow2::datatypes::DataType; #[test] fn select_columns() -> Result<()> { @@ -369,7 +369,7 @@ mod tests { fn register_aggregate_csv(ctx: &mut ExecutionContext) -> Result<()> { let schema = test::aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let 
testdata = crate::test::arrow_test_data(); ctx.register_csv( "aggregate_test_100", &format!("{}/csv/aggregate_test_100.csv", testdata), diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index b6f64feb70d2a..ced5a9be6ba3c 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -39,7 +39,7 @@ //! ```rust //! # use datafusion::prelude::*; //! # use datafusion::error::Result; -//! # use arrow::record_batch::RecordBatch; +//! # use arrow2::record_batch::RecordBatch; //! //! # #[tokio::main] //! # async fn main() -> Result<()> { @@ -57,7 +57,7 @@ //! let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?; +//! let pretty_results = arrow2::util::pretty::pretty_format_batches(&results)?; //! //! let expected = vec![ //! "+---+--------+", @@ -77,7 +77,7 @@ //! ``` //! # use datafusion::prelude::*; //! # use datafusion::error::Result; -//! # use arrow::record_batch::RecordBatch; +//! # use arrow2::record_batch::RecordBatch; //! //! # #[tokio::main] //! # async fn main() -> Result<()> { @@ -92,7 +92,7 @@ //! let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?; +//! let pretty_results = arrow2::util::pretty::pretty_format_batches(&results)?; //! //! let expected = vec![ //! "+---+--------+", @@ -130,7 +130,7 @@ //! ### Logical plan //! //! Logical planning yields [`logical plans`](logical_plan::LogicalPlan) and [`logical expressions`](logical_plan::Expr). -//! These are [`Schema`](arrow::datatypes::Schema)-aware traits that represent statements whose result is independent of how it should physically be executed. +//! These are [`Schema`](arrow2::datatypes::Schema)-aware traits that represent statements whose result is independent of how it should physically be executed. //! //! A [`LogicalPlan`](logical_plan::LogicalPlan) is a Direct Asyclic graph of other [`LogicalPlan`s](logical_plan::LogicalPlan) and each node contains logical expressions ([`Expr`s](logical_plan::Expr)). //! All of these are located in [`logical_plan`](logical_plan). @@ -152,12 +152,12 @@ //! Broadly speaking, //! //! * an [`ExecutionPlan`](physical_plan::ExecutionPlan) receives a partition number and asyncronosly returns -//! an iterator over [`RecordBatch`](arrow::record_batch::RecordBatch) -//! (a node-specific struct that implements [`RecordBatchReader`](arrow::record_batch::RecordBatchReader)) -//! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow::record_batch::RecordBatch) -//! and returns an [`Array`](arrow::array::Array) -//! * an [`AggregateExpr`](physical_plan::AggregateExpr) receives [`RecordBatch`es](arrow::record_batch::RecordBatch) -//! and returns a [`RecordBatch`](arrow::record_batch::RecordBatch) of a single row(*) +//! an iterator over [`RecordBatch`](arrow2::record_batch::RecordBatch) +//! (a node-specific struct that implements [`RecordBatchReader`](arrow2::record_batch::RecordBatchReader)) +//! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow2::record_batch::RecordBatch) +//! and returns an [`Array`](arrow2::array::Array) +//! * an [`AggregateExpr`](physical_plan::AggregateExpr) receives [`RecordBatch`es](arrow2::record_batch::RecordBatch) +//! and returns a [`RecordBatch`](arrow2::record_batch::RecordBatch) of a single row(*) //! //! 
(*) Technically, it aggregates the results on each partition and then merges the results into a single partition. //! @@ -200,8 +200,7 @@ pub mod sql; pub mod variable; // re-export dependencies from arrow-rs to minimise version maintenance for crate users -pub use arrow; -pub use parquet; +pub use arrow2; #[cfg(test)] pub mod test; diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 2e69814d2634e..e987fc9c335c2 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -19,11 +19,13 @@ use std::{collections::HashMap, sync::Arc}; -use arrow::{ - datatypes::{Schema, SchemaRef}, +use arrow2::{ + datatypes::Schema, record_batch::RecordBatch, }; +type SchemaRef = Arc; + use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; use crate::{ @@ -44,7 +46,7 @@ use std::collections::HashSet; /// # use datafusion::prelude::*; /// # use datafusion::logical_plan::LogicalPlanBuilder; /// # use datafusion::error::Result; -/// # use arrow::datatypes::{Schema, DataType, Field}; +/// # use arrow2::datatypes::{Schema, DataType, Field}; /// # /// # fn main() -> Result<()> { /// # @@ -416,7 +418,7 @@ fn validate_unique_names<'a>( #[cfg(test)] mod tests { - use arrow::datatypes::{DataType, Field}; + use arrow2::datatypes::{DataType, Field}; use super::super::{lit, sum}; use super::*; diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index 9adb22b43d075..651eecb9aa185 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -24,9 +24,11 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::datatypes::{DataType, Field, Schema}; use std::fmt::{Display, Formatter}; +type SchemaRef = Arc; + /// A reference-counted reference to a `DFSchema`. pub type DFSchemaRef = Arc; @@ -356,7 +358,7 @@ impl DFField { #[cfg(test)] mod tests { use super::*; - use arrow::datatypes::DataType; + use arrow2::datatypes::DataType; #[test] fn from_unqualified_field() { diff --git a/datafusion/src/logical_plan/display.rs b/datafusion/src/logical_plan/display.rs index f285534fdf1b6..8fe96ecf8aeec 100644 --- a/datafusion/src/logical_plan/display.rs +++ b/datafusion/src/logical_plan/display.rs @@ -17,7 +17,7 @@ //! This module provides logic for displaying LogicalPlans in various styles use super::{LogicalPlan, PlanVisitor}; -use arrow::datatypes::Schema; +use arrow2::datatypes::Schema; use std::fmt; /// Formats plans with a single line per node. For example: @@ -81,7 +81,7 @@ impl<'a, 'b> PlanVisitor for IndentVisitor<'a, 'b> { /// `foo:Utf8;N` if `foo` is nullable. 
/// /// ``` -/// use arrow::datatypes::{Field, Schema, DataType}; +/// use arrow2::datatypes::{Field, Schema, DataType}; /// # use datafusion::logical_plan::display_schema; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), @@ -238,7 +238,7 @@ impl<'a, 'b> PlanVisitor for GraphvizVisitor<'a, 'b> { #[cfg(test)] mod tests { - use arrow::datatypes::{DataType, Field}; + use arrow2::datatypes::{DataType, Field}; use super::*; diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 3365bf2603234..45b3541883c26 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -24,7 +24,7 @@ use std::fmt; use std::sync::Arc; use aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}; -use arrow::{compute::can_cast_types, datatypes::DataType}; +use arrow2::{compute::cast::can_cast_types, datatypes::DataType}; use crate::error::{DataFusionError, Result}; use crate::logical_plan::{DFField, DFSchema}; @@ -39,7 +39,7 @@ use std::collections::HashSet; /// represent logical expressions such as `A + 1`, or `CAST(c1 AS /// int)`. /// -/// An `Expr` can compute its [DataType](arrow::datatypes::DataType) +/// An `Expr` can compute its [DataType](arrow2::datatypes::DataType) /// and nullability, and has functions for building up complex /// expressions. /// @@ -211,11 +211,11 @@ pub enum Expr { } impl Expr { - /// Returns the [arrow::datatypes::DataType] of the expression based on [arrow::datatypes::Schema]. + /// Returns the [arrow2::datatypes::DataType] of the expression based on [arrow2::datatypes::Schema]. /// /// # Errors /// - /// This function errors when it is not possible to compute its [arrow::datatypes::DataType]. + /// This function errors when it is not possible to compute its [arrow2::datatypes::DataType]. /// This happens when e.g. the expression refers to a column that does not exist in the schema, or when /// the expression is incorrectly typed (e.g. `[utf8] + [bool]`). pub fn get_type(&self, schema: &DFSchema) -> Result { @@ -280,7 +280,7 @@ impl Expr { } } - /// Returns the nullability of the expression based on [arrow::datatypes::Schema]. + /// Returns the nullability of the expression based on [arrow2::datatypes::Schema]. /// /// # Errors /// @@ -336,14 +336,14 @@ impl Expr { } } - /// Returns the name of this expression based on [arrow::datatypes::Schema]. + /// Returns the name of this expression based on [arrow2::datatypes::Schema]. /// /// This represents how a column with this expression is named when no alias is chosen pub fn name(&self, input_schema: &DFSchema) -> Result { create_name(self, input_schema) } - /// Returns a [arrow::datatypes::Field] compatible with this expression. + /// Returns a [arrow2::datatypes::Field] compatible with this expression. pub fn to_field(&self, input_schema: &DFSchema) -> Result { Ok(DFField::new( None, //TODO qualifier @@ -353,12 +353,12 @@ impl Expr { )) } - /// Wraps this expression in a cast to a target [arrow::datatypes::DataType]. + /// Wraps this expression in a cast to a target [arrow2::datatypes::DataType]. /// /// # Errors /// /// This function errors when it is impossible to cast the - /// expression to the target [arrow::datatypes::DataType]. + /// expression to the target [arrow2::datatypes::DataType]. 
pub fn cast_to(self, cast_to_type: &DataType, schema: &DFSchema) -> Result { let this_type = self.get_type(schema)?; if this_type == *cast_to_type { diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 8b9aac9ea73b9..82fcd3d3a55e9 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -23,7 +23,7 @@ use std::{ sync::Arc, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::datatypes::{DataType, Field, Schema}; use crate::datasource::TableProvider; use crate::sql::parser::FileType; @@ -36,6 +36,8 @@ use super::{ }; use crate::logical_plan::dfschema::DFSchemaRef; +type SchemaRef = Arc; + /// Join type #[derive(Debug, Clone, Copy)] pub enum JoinType { @@ -468,7 +470,7 @@ impl LogicalPlan { /// ``` /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; + /// use arrow2::datatypes::{Field, Schema, DataType}; /// use datafusion::logical_plan::{lit, col, LogicalPlanBuilder}; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), @@ -509,7 +511,7 @@ impl LogicalPlan { /// ``` /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; + /// use arrow2::datatypes::{Field, Schema, DataType}; /// use datafusion::logical_plan::{lit, col, LogicalPlanBuilder}; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), @@ -549,7 +551,7 @@ impl LogicalPlan { /// structure, and one with additional details such as schema. /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; + /// use arrow2::datatypes::{Field, Schema, DataType}; /// use datafusion::logical_plan::{lit, col, LogicalPlanBuilder}; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), @@ -608,7 +610,7 @@ impl LogicalPlan { /// Projection: #id /// ``` /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; + /// use arrow2::datatypes::{Field, Schema, DataType}; /// use datafusion::logical_plan::{lit, col, LogicalPlanBuilder}; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index 51bf0ce1b5054..f85ef44929d10 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -20,7 +20,7 @@ use std::sync::Arc; -use arrow::datatypes::DataType; +use arrow2::datatypes::DataType; use crate::error::Result; use crate::execution::context::ExecutionProps; @@ -232,7 +232,7 @@ mod tests { col, lit, max, min, DFField, DFSchema, LogicalPlanBuilder, }; - use arrow::datatypes::*; + use arrow2::datatypes::*; use chrono::{DateTime, Utc}; fn test_table_scan() -> Result { diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 4c248e2b6483d..9e73ea9102c8d 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -453,7 +453,10 @@ mod tests { use crate::physical_plan::ExecutionPlan; use crate::test::*; use crate::{logical_plan::col, prelude::JoinType}; - use arrow::datatypes::SchemaRef; + use arrow2::datatypes::Schema; + use std::sync::Arc; + + type SchemaRef = Arc; fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let rule = FilterPushDown::new(); @@ -958,10 +961,10 @@ mod tests { impl TableProvider for PushDownProvider { fn schema(&self) -> SchemaRef { - Arc::new(arrow::datatypes::Schema::new(vec![ - arrow::datatypes::Field::new( + 
Arc::new(arrow2::datatypes::Schema::new(vec![ + arrow2::datatypes::Field::new( "a", - arrow::datatypes::DataType::Int32, + arrow2::datatypes::DataType::Int32, true, ), ])) diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index 168c4a17edfd0..72a86ed667493 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -224,6 +224,9 @@ mod tests { logical_plan::{DFSchema, Expr}, test::*, }; + use arrow2::datatypes::Schema; + + type SchemaRef = Arc; struct TestTableProvider { num_rows: usize, @@ -233,7 +236,7 @@ mod tests { fn as_any(&self) -> &dyn std::any::Any { unimplemented!() } - fn schema(&self) -> arrow::datatypes::SchemaRef { + fn schema(&self) -> SchemaRef { unimplemented!() } diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 21c9caba3316d..ac49ac9edea74 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -23,8 +23,8 @@ use crate::execution::context::ExecutionProps; use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, LogicalPlan, ToDFSchema}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; -use arrow::datatypes::Schema; -use arrow::error::Result as ArrowResult; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; use std::{collections::HashSet, sync::Arc}; use utils::optimize_explain; @@ -324,7 +324,7 @@ mod tests { use crate::logical_plan::{col, lit}; use crate::logical_plan::{max, min, Expr, LogicalPlanBuilder}; use crate::test::*; - use arrow::datatypes::DataType; + use arrow2::datatypes::DataType; #[test] fn aggregate_no_group_by() -> Result<()> { diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 9288c65ac4dac..d2c962d38bceb 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -19,7 +19,7 @@ use std::{collections::HashSet, sync::Arc}; -use arrow::datatypes::Schema; +use arrow2::datatypes::Schema; use super::optimizer::OptimizerRule; use crate::execution::context::ExecutionProps; @@ -419,7 +419,7 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { mod tests { use super::*; use crate::logical_plan::{col, LogicalPlanBuilder}; - use arrow::datatypes::DataType; + use arrow2::datatypes::DataType; use std::collections::HashSet; #[test] diff --git a/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs index fee4b3e11e5d2..1518db37a84a4 100644 --- a/datafusion/src/physical_optimizer/repartition.rs +++ b/datafusion/src/physical_optimizer/repartition.rs @@ -106,7 +106,7 @@ impl PhysicalOptimizerRule for Repartition { } #[cfg(test)] mod tests { - use arrow::datatypes::Schema; + use arrow2::datatypes::Schema; use super::*; use crate::datasource::datasource::Statistics; @@ -119,14 +119,12 @@ mod tests { vec![], Arc::new(ParquetExec::new( vec![ParquetPartition { - filenames: vec!["x".to_string()], + filename: "x".to_string(), statistics: Statistics::default(), }], - Schema::empty(), - None, + Arc::new(Schema::empty()), None, 2048, - None, )), )?; @@ -155,14 +153,12 @@ mod tests { vec![], Arc::new(ParquetExec::new( vec![ParquetPartition { - filenames: vec!["x".to_string()], + filename: "x".to_string(), statistics: Statistics::default(), }], - Schema::empty(), - None, + Arc::new(Schema::empty()), None, 2048, - None, )), )?), )?; diff 
--git a/datafusion/src/physical_plan/aggregates.rs b/datafusion/src/physical_plan/aggregates.rs index 9417c7c8f05a5..94266fc1255d9 100644 --- a/datafusion/src/physical_plan/aggregates.rs +++ b/datafusion/src/physical_plan/aggregates.rs @@ -34,7 +34,7 @@ use super::{ use crate::error::{DataFusionError, Result}; use crate::physical_plan::distinct_expressions; use crate::physical_plan::expressions; -use arrow::datatypes::{DataType, Schema, TimeUnit}; +use arrow2::datatypes::{DataType, Schema, TimeUnit}; use expressions::{avg_return_type, sum_return_type}; use std::{fmt, str::FromStr, sync::Arc}; diff --git a/datafusion/src/physical_plan/array_expressions.rs b/datafusion/src/physical_plan/array_expressions.rs index a7e03b70e5d21..ebc3722cb5f4a 100644 --- a/datafusion/src/physical_plan/array_expressions.rs +++ b/datafusion/src/physical_plan/array_expressions.rs @@ -18,75 +18,30 @@ //! Array expressions use crate::error::{DataFusionError, Result}; -use arrow::array::*; -use arrow::datatypes::DataType; +use arrow2::array::*; +use arrow2::compute::concat; use std::sync::Arc; use super::ColumnarValue; -macro_rules! downcast_vec { - ($ARGS:expr, $ARRAY_TYPE:ident) => {{ - $ARGS - .iter() - .map(|e| match e.as_any().downcast_ref::<$ARRAY_TYPE>() { - Some(array) => Ok(array), - _ => Err(DataFusionError::Internal("failed to downcast".to_string())), - }) - }}; -} +type ArrayRef = Arc; -macro_rules! array { - ($ARGS:expr, $ARRAY_TYPE:ident, $BUILDER_TYPE:ident) => {{ - // downcast all arguments to their common format - let args = - downcast_vec!($ARGS, $ARRAY_TYPE).collect::>>()?; +fn array_array(arrays: &[&dyn Array]) -> Result { + assert!(arrays.len() > 0); + let first = arrays[0]; + assert!(arrays.iter().all(|x| x.len() == first.len())); + assert!(arrays.iter().all(|x| x.data_type() == first.data_type())); - let mut builder = FixedSizeListBuilder::<$BUILDER_TYPE>::new( - <$BUILDER_TYPE>::new(args[0].len()), - args.len() as i32, - ); - // for each entry in the array - for index in 0..args[0].len() { - for arg in &args { - if arg.is_null(index) { - builder.values().append_null()?; - } else { - builder.values().append_value(arg.value(index))?; - } - } - builder.append(true)?; - } - Ok(Arc::new(builder.finish())) - }}; -} + let size = arrays.len(); + let length = first.len(); -fn array_array(args: &[&dyn Array]) -> Result { - // do not accept 0 arguments. 
- if args.is_empty() { - return Err(DataFusionError::Internal( - "array requires at least one argument".to_string(), - )); - } - - match args[0].data_type() { - DataType::Utf8 => array!(args, StringArray, StringBuilder), - DataType::LargeUtf8 => array!(args, LargeStringArray, LargeStringBuilder), - DataType::Boolean => array!(args, BooleanArray, BooleanBuilder), - DataType::Float32 => array!(args, Float32Array, Float32Builder), - DataType::Float64 => array!(args, Float64Array, Float64Builder), - DataType::Int8 => array!(args, Int8Array, Int8Builder), - DataType::Int16 => array!(args, Int16Array, Int16Builder), - DataType::Int32 => array!(args, Int32Array, Int32Builder), - DataType::Int64 => array!(args, Int64Array, Int64Builder), - DataType::UInt8 => array!(args, UInt8Array, UInt8Builder), - DataType::UInt16 => array!(args, UInt16Array, UInt16Builder), - DataType::UInt32 => array!(args, UInt32Array, UInt32Builder), - DataType::UInt64 => array!(args, UInt64Array, UInt64Builder), - data_type => Err(DataFusionError::NotImplemented(format!( - "Array is not implemented for type '{:?}'.", - data_type - ))), - } + let values = concat::concatenate(arrays)?; + let data_type = FixedSizeListArray::default_datatype(first.data_type().clone(), size); + Ok(FixedSizeListArray::from_data( + data_type, + values.into(), + None, + )) } /// put values in an array. @@ -104,24 +59,5 @@ pub fn array(values: &[ColumnarValue]) -> Result { }) .collect::>()?; - Ok(ColumnarValue::Array(array_array(&arrays)?)) + Ok(ColumnarValue::Array(array_array(&arrays).map(Arc::new)?)) } - -/// Currently supported types by the array function. -/// The order of these types correspond to the order on which coercion applies -/// This should thus be from least informative to most informative -pub static SUPPORTED_ARRAY_TYPES: &[DataType] = &[ - DataType::Boolean, - DataType::UInt8, - DataType::UInt16, - DataType::UInt32, - DataType::UInt64, - DataType::Int8, - DataType::Int16, - DataType::Int32, - DataType::Int64, - DataType::Float32, - DataType::Float64, - DataType::Utf8, - DataType::LargeUtf8, -]; diff --git a/datafusion/src/physical_plan/coalesce_batches.rs b/datafusion/src/physical_plan/coalesce_batches.rs index e25412d9d6b8b..bc7a199fd7d3e 100644 --- a/datafusion/src/physical_plan/coalesce_batches.rs +++ b/datafusion/src/physical_plan/coalesce_batches.rs @@ -29,10 +29,11 @@ use crate::physical_plan::{ SendableRecordBatchStream, }; -use arrow::compute::kernels::concat::concat; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::compute::concat::concatenate; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use async_trait::async_trait; use futures::stream::{Stream, StreamExt}; use log::debug; @@ -239,12 +240,13 @@ pub fn concat_batches( } let mut arrays = Vec::with_capacity(schema.fields().len()); for i in 0..schema.fields().len() { - let array = concat( + let array = concatenate( &batches .iter() .map(|batch| batch.column(i).as_ref()) .collect::>(), - )?; + )? 
+ .into(); arrays.push(array); } debug!( @@ -259,8 +261,8 @@ pub fn concat_batches( mod tests { use super::*; use crate::physical_plan::{memory::MemoryExec, repartition::RepartitionExec}; - use arrow::array::UInt32Array; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow2::array::UInt32Array; + use arrow2::datatypes::{DataType, Field, Schema}; #[tokio::test(flavor = "multi_thread")] async fn test_concat_batches() -> Result<()> { @@ -299,7 +301,7 @@ mod tests { fn create_batch(schema: &Arc) -> RecordBatch { RecordBatch::try_new( schema.clone(), - vec![Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]))], + vec![Arc::new(UInt32Array::from_slice(&[1, 2, 3, 4, 5, 6, 7, 8]))], ) .unwrap() } diff --git a/datafusion/src/physical_plan/common.rs b/datafusion/src/physical_plan/common.rs index f1ed3742340b0..761eab48fbc2d 100644 --- a/datafusion/src/physical_plan/common.rs +++ b/datafusion/src/physical_plan/common.rs @@ -25,9 +25,10 @@ use std::task::{Context, Poll}; use super::{RecordBatchStream, SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use futures::{Stream, TryStreamExt}; /// Stream of record batches diff --git a/datafusion/src/physical_plan/cross_join.rs b/datafusion/src/physical_plan/cross_join.rs index f6f5da4cf8db9..06f209858a857 100644 --- a/datafusion/src/physical_plan/cross_join.rs +++ b/datafusion/src/physical_plan/cross_join.rs @@ -21,9 +21,10 @@ use futures::{lock::Mutex, StreamExt}; use std::{any::Any, sync::Arc, task::Poll}; -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use crate::physical_plan::memory::MemoryStream; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use futures::{Stream, TryStreamExt}; @@ -36,11 +37,13 @@ use async_trait::async_trait; use std::time::Instant; use super::{ - coalesce_batches::concat_batches, memory::MemoryStream, DisplayFormatType, - ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, + coalesce_batches::concat_batches, DisplayFormatType, ExecutionPlan, Partitioning, + RecordBatchStream, SendableRecordBatchStream, }; use log::debug; +type SchemaRef = Arc; + /// Data of the left side type JoinLeftData = RecordBatch; diff --git a/datafusion/src/physical_plan/crypto_expressions.rs b/datafusion/src/physical_plan/crypto_expressions.rs index 8ad876b24d0ce..07a68d30207ce 100644 --- a/datafusion/src/physical_plan/crypto_expressions.rs +++ b/datafusion/src/physical_plan/crypto_expressions.rs @@ -28,8 +28,8 @@ use crate::{ error::{DataFusionError, Result}, scalar::ScalarValue, }; -use arrow::{ - array::{Array, BinaryArray, GenericStringArray, StringOffsetSizeTrait}, +use arrow2::{ + array::{Array, BinaryArray, Offset, Utf8Array}, datatypes::DataType, }; @@ -60,15 +60,15 @@ fn sha_process(input: &str) -> SHA2DigestOutput { /// # Errors /// This function errors when: /// * the number of arguments is not 1 -/// * the first argument is not castable to a `GenericStringArray` +/// * the first argument is not castable to a `Utf8Array` fn unary_binary_function( args: &[&dyn Array], op: F, name: &str, -) -> Result +) -> Result> where R: AsRef<[u8]>, - T: StringOffsetSizeTrait, + T: Offset, F: Fn(&str) -> R, { if 
args.len() != 1 { @@ -81,7 +81,7 @@ where let array = args[0] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal("failed to downcast to string".to_string()) })?; @@ -137,9 +137,7 @@ where } } -fn md5_array( - args: &[&dyn Array], -) -> Result> { +fn md5_array(args: &[&dyn Array]) -> Result> { unary_string_function::(args, md5_process, "md5") } diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 96b24cc33201f..203442150bdab 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -19,10 +19,12 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{common, DisplayFormatType, ExecutionPlan, Partitioning}; -use arrow::csv; -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; + +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; +use arrow2::io::csv::read as csv_read; +use arrow2::record_batch::RecordBatch; + use futures::Stream; use std::any::Any; use std::fs::File; @@ -32,6 +34,8 @@ use std::sync::Arc; use std::sync::Mutex; use std::task::{Context, Poll}; +type SchemaRef = Arc; + use super::{RecordBatchStream, SendableRecordBatchStream}; use async_trait::async_trait; @@ -200,6 +204,15 @@ pub struct CsvExec { limit: Option, } +fn infer_schema_from_files( + filenames: &[String], + delimiter: u8, + max_records: &Option, + has_header: bool, +) -> Result { + todo!() +} + impl CsvExec { /// Create a new execution plan for reading a set of CSV files pub fn try_new( @@ -331,12 +344,12 @@ impl CsvExec { filenames: &[String], options: &CsvReadOptions, ) -> Result { - Ok(csv::infer_schema_from_files( + infer_schema_from_files( filenames, options.delimiter, - Some(options.schema_infer_max_records), + &Some(options.schema_infer_max_records), options.has_header, - )?) 
+ ) } } @@ -437,8 +450,11 @@ impl ExecutionPlan for CsvExec { /// Iterator over batches struct CsvStream { - /// Arrow CSV reader - reader: csv::Reader, + reader: csv_read::Reader, + schema: SchemaRef, + batch_size: usize, + projection: Option>, + limit: Option, } impl CsvStream { /// Create an iterator for a CSV file @@ -468,20 +484,19 @@ impl CsvStream { batch_size: usize, limit: Option, ) -> Result> { - let start_line = if has_header { 1 } else { 0 }; - let bounds = limit.map(|x| (0, x + start_line)); + let reader = csv_read::ReaderBuilder::new() + .delimiter(delimiter.unwrap_or(b","[0])) + .has_headers(has_header) + .from_reader(reader); - let reader = csv::Reader::new( + let projection = projection.clone(); + Ok(Self { reader, schema, - has_header, - delimiter, batch_size, - bounds, - projection.clone(), - ); - - Ok(Self { reader }) + projection, + limit, + }) } } @@ -492,14 +507,30 @@ impl Stream for CsvStream { mut self: Pin<&mut Self>, _: &mut Context<'_>, ) -> Poll> { - Poll::Ready(self.reader.next()) + let batch_size = self.batch_size; + let maybe_rows = csv_read::read_rows(&mut self.reader, 0, batch_size); + let maybe_batch = maybe_rows.and_then(|rows| { + if rows.is_empty() { + Ok(None) + } else { + csv_read::parse( + &rows, + self.schema.fields(), + self.projection.as_ref().map(|x| x.as_ref()), + 0, + &csv_read::DefaultParser::default(), + ) + .map(Some) + } + }); + Poll::Ready(maybe_batch.transpose()) } } impl RecordBatchStream for CsvStream { /// Get the schema fn schema(&self) -> SchemaRef { - self.reader.schema() + self.schema.clone() } } @@ -512,7 +543,7 @@ mod tests { #[tokio::test] async fn csv_exec_with_projection() -> Result<()> { let schema = aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let csv = CsvExec::try_new( @@ -540,7 +571,7 @@ mod tests { #[tokio::test] async fn csv_exec_without_projection() -> Result<()> { let schema = aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let csv = CsvExec::try_new( @@ -568,7 +599,7 @@ mod tests { #[tokio::test] async fn csv_exec_with_reader() -> Result<()> { let schema = aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let buf = std::fs::read(path).unwrap(); diff --git a/datafusion/src/physical_plan/datetime_expressions.rs b/datafusion/src/physical_plan/datetime_expressions.rs index ec52e6bc4d528..c63e99dfc7429 100644 --- a/datafusion/src/physical_plan/datetime_expressions.rs +++ b/datafusion/src/physical_plan/datetime_expressions.rs @@ -21,24 +21,23 @@ use std::sync::Arc; use super::ColumnarValue; use crate::{ error::{DataFusionError, Result}, - scalar::{ScalarType, ScalarValue}, + scalar::ScalarValue, }; -use arrow::{ - array::{Array, ArrayRef, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait}, - datatypes::{ArrowPrimitiveType, DataType, TimestampNanosecondType}, +use arrow2::{ + array::*, + compute::cast, + datatypes::{DataType, TimeUnit}, + types::NativeType, }; -use arrow::{ - array::{ - Date32Array, Date64Array, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, 
TimestampSecondArray, - }, - compute::kernels::temporal, - datatypes::TimeUnit, - temporal_conversions::timestamp_ns_to_datetime, -}; -use chrono::prelude::*; +use arrow2::{compute::temporal, temporal_conversions::timestamp_ns_to_datetime}; +use chrono::prelude::{DateTime, Local, NaiveDateTime, Utc}; +use chrono::Datelike; use chrono::Duration; use chrono::LocalResult; +use chrono::TimeZone; +use chrono::Timelike; + +type ArrayRef = Arc; #[inline] /// Accepts a string in RFC3339 / ISO8601 standard format and some @@ -185,17 +184,18 @@ fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result /// # Errors /// This function errors iff: /// * the number of arguments is not 1 or -/// * the first argument is not castable to a `GenericStringArray` or +/// * the first argument is not castable to a `Utf8Array` or /// * the function `op` errors pub(crate) fn unary_string_to_primitive_function<'a, T, O, F>( args: &[&'a dyn Array], op: F, name: &str, + data_type: DataType, ) -> Result> where - O: ArrowPrimitiveType, - T: StringOffsetSizeTrait, - F: Fn(&'a str) -> Result, + O: NativeType, + T: Offset, + F: Fn(&'a str) -> Result, { if args.len() != 1 { return Err(DataFusionError::Internal(format!( @@ -207,13 +207,17 @@ where let array = args[0] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal("failed to downcast to string".to_string()) })?; // first map is the iterator, second is for the `Option<_>` - array.iter().map(|x| x.map(|x| op(x)).transpose()).collect() + array + .iter() + .map(|x| x.map(|x| op(x)).transpose()) + .collect::>>() + .map(|x| x.to(data_type)) } // given an function that maps a `&str` to a arrow native type, @@ -223,19 +227,31 @@ fn handle<'a, O, F, S>( args: &'a [ColumnarValue], op: F, name: &str, + data_type: DataType, ) -> Result where - O: ArrowPrimitiveType, - S: ScalarType, - F: Fn(&'a str) -> Result, + O: NativeType, + ScalarValue: From>, + S: NativeType, + F: Fn(&'a str) -> Result, { match &args[0] { ColumnarValue::Array(a) => match a.data_type() { DataType::Utf8 => Ok(ColumnarValue::Array(Arc::new( - unary_string_to_primitive_function::(&[a.as_ref()], op, name)?, + unary_string_to_primitive_function::( + &[a.as_ref()], + op, + name, + data_type, + )?, ))), DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new( - unary_string_to_primitive_function::(&[a.as_ref()], op, name)?, + unary_string_to_primitive_function::( + &[a.as_ref()], + op, + name, + data_type, + )?, ))), other => Err(DataFusionError::Internal(format!( "Unsupported data type {:?} for function {}", @@ -245,11 +261,11 @@ where ColumnarValue::Scalar(scalar) => match scalar { ScalarValue::Utf8(a) => { let result = a.as_ref().map(|x| (op)(x)).transpose()?; - Ok(ColumnarValue::Scalar(S::scalar(result))) + Ok(ColumnarValue::Scalar(result.into())) } ScalarValue::LargeUtf8(a) => { let result = a.as_ref().map(|x| (op)(x)).transpose()?; - Ok(ColumnarValue::Scalar(S::scalar(result))) + Ok(ColumnarValue::Scalar(result.into())) } other => Err(DataFusionError::Internal(format!( "Unsupported data type {:?} for function {}", @@ -261,10 +277,11 @@ where /// to_timestamp SQL function pub fn to_timestamp(args: &[ColumnarValue]) -> Result { - handle::( + handle::( args, string_to_timestamp_nanos, "to_timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, None), ) } @@ -337,12 +354,12 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result { )); }; - let f = |x: Option| x.map(|x| date_trunc_single(granularity, x)).transpose(); + let f = |x: Option<&i64>| x.map(|x| 
date_trunc_single(granularity, *x)).transpose(); Ok(match array { ColumnarValue::Scalar(scalar) => { if let ScalarValue::TimestampNanosecond(v) = scalar { - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond((f)(*v)?)) + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond((f)(v.as_ref())?)) } else { return Err(DataFusionError::Execution( "array of `date_trunc` must be non-null scalar Utf8".to_string(), @@ -352,67 +369,19 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result { ColumnarValue::Array(array) => { let array = array .as_any() - .downcast_ref::() + .downcast_ref::>() .unwrap(); let array = array .iter() .map(f) - .collect::>()?; + .collect::>>()? + .to(DataType::Int64); ColumnarValue::Array(Arc::new(array)) } }) } -macro_rules! extract_date_part { - ($ARRAY: expr, $FN:expr) => { - match $ARRAY.data_type() { - DataType::Date32 => { - let array = $ARRAY.as_any().downcast_ref::().unwrap(); - Ok($FN(array)?) - } - DataType::Date64 => { - let array = $ARRAY.as_any().downcast_ref::().unwrap(); - Ok($FN(array)?) - } - DataType::Timestamp(time_unit, None) => match time_unit { - TimeUnit::Second => { - let array = $ARRAY - .as_any() - .downcast_ref::() - .unwrap(); - Ok($FN(array)?) - } - TimeUnit::Millisecond => { - let array = $ARRAY - .as_any() - .downcast_ref::() - .unwrap(); - Ok($FN(array)?) - } - TimeUnit::Microsecond => { - let array = $ARRAY - .as_any() - .downcast_ref::() - .unwrap(); - Ok($FN(array)?) - } - TimeUnit::Nanosecond => { - let array = $ARRAY - .as_any() - .downcast_ref::() - .unwrap(); - Ok($FN(array)?) - } - }, - datatype => Err(DataFusionError::Internal(format!( - "Extract does not support datatype {:?}", - datatype - ))), - } - }; -} - /// DATE_PART SQL function pub fn date_part(args: &[ColumnarValue]) -> Result { if args.len() != 2 { @@ -438,8 +407,9 @@ pub fn date_part(args: &[ColumnarValue]) -> Result { }; let arr = match date_part.to_lowercase().as_str() { - "hour" => extract_date_part!(array, temporal::hour), - "year" => extract_date_part!(array, temporal::year), + "hour" => Ok(temporal::hour(array.as_ref()) + .map(|x| cast::primitive_to_primitive::(&x, &DataType::Int32))?), + "year" => Ok(temporal::year(array.as_ref())?), _ => Err(DataFusionError::Execution(format!( "Date part '{}' not supported", date_part @@ -460,7 +430,8 @@ pub fn date_part(args: &[ColumnarValue]) -> Result { mod tests { use std::sync::Arc; - use arrow::array::{ArrayRef, Int64Array, StringBuilder}; + use arrow2::array::*; + use arrow2::datatypes::*; use super::*; @@ -468,18 +439,15 @@ mod tests { fn to_timestamp_arrays_and_nulls() -> Result<()> { // ensure that arrow array implementation is wired up and handles nulls correctly - let mut string_builder = StringBuilder::new(2); - let mut ts_builder = TimestampNanosecondArray::builder(2); + let string_array = + Utf8Array::::from(&vec![Some("2020-09-08T13:42:29.190855Z"), None]); - string_builder.append_value("2020-09-08T13:42:29.190855Z")?; - ts_builder.append_value(1599572549190855000)?; + let ts_array = Primitive::::from(&[Some(1599572549190855000), None]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, None)); - string_builder.append_null()?; - ts_builder.append_null()?; - let expected_timestamps = &ts_builder.finish() as &dyn Array; + let expected_timestamps = &ts_array as &dyn Array; - let string_array = - ColumnarValue::Array(Arc::new(string_builder.finish()) as ArrayRef); + let string_array = ColumnarValue::Array(Arc::new(string_array) as ArrayRef); let parsed_timestamps = to_timestamp(&[string_array]) .expect("that 
to_timestamp parsed values without error"); if let ColumnarValue::Array(parsed_array) = parsed_timestamps { @@ -554,9 +522,8 @@ mod tests { // pass the wrong type of input array to to_timestamp and test // that we get an error. - let mut builder = Int64Array::builder(1); - builder.append_value(1)?; - let int64array = ColumnarValue::Array(Arc::new(builder.finish())); + let array = Int64Array::from_slice(&[1]); + let int64array = ColumnarValue::Array(Arc::new(array)); let expected_err = "Internal error: Unsupported data type Int64 for function to_timestamp"; diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index f3513c2950e4d..b8605ed94e1d2 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -18,23 +18,24 @@ //! Implementations for DISTINCT expressions, e.g. `COUNT(DISTINCT c)` use std::any::Any; +use std::collections::HashSet; use std::convert::TryFrom; use std::fmt::Debug; -use std::hash::Hash; use std::sync::Arc; -use arrow::datatypes::{DataType, Field}; - use ahash::RandomState; -use std::collections::HashSet; + +use arrow2::array::Array; +use arrow2::datatypes::{DataType, Field}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::group_scalar::GroupByScalar; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -struct DistinctScalarValues(Vec); +type ArrayRef = Arc; + +type DistinctScalarValues = Vec; fn format_state_name(name: &str, state_name: &str) -> String { format!("{}[{}]", name, state_name) @@ -137,12 +138,12 @@ impl Accumulator for DistinctCountAccumulator { fn update(&mut self, values: &[ScalarValue]) -> Result<()> { // If a row has a NULL, it is not included in the final count. if !values.iter().any(|v| v.is_null()) { - self.values.insert(DistinctScalarValues( + self.values.insert( values .iter() .map(GroupByScalar::try_from) .collect::>>()?, - )); + ); } Ok(()) @@ -167,38 +168,28 @@ impl Accumulator for DistinctCountAccumulator { (0..col_values[0].len()).try_for_each(|row_index| { let row_values = col_values .iter() - .map(|col| col[row_index].clone()) - .collect::>(); + .map(|col| ScalarValue::try_from_array(col, row_index)) + .collect::>>()?; self.update(&row_values) }) } fn state(&self) -> Result> { - let mut cols_out = self - .state_data_types + self.values .iter() - .map(|state_data_type| { - ScalarValue::List(Some(Vec::new()), state_data_type.clone()) - }) - .collect::>(); - - let mut cols_vec = cols_out - .iter_mut() - .map(|c| match c { - ScalarValue::List(Some(ref mut v), _) => v, - _ => unreachable!(), + .map(|distinct_values| { + // create an array with all distinct values + let arrays = distinct_values + .iter() + .map(ScalarValue::from) + .map(|x| x.to_array()) + .collect::>(); + let arrays = arrays.iter().map(|x| x.as_ref()).collect::>(); + Ok(arrow2::compute::concat::concatenate(&arrays).map(|x| x.into())?) 
}) - .collect::>(); - - self.values.iter().for_each(|distinct_values| { - distinct_values.0.iter().enumerate().for_each( - |(col_index, distinct_value)| { - cols_vec[col_index].push(ScalarValue::from(distinct_value)); - }, - ) - }); - - Ok(cols_out) + .zip(self.state_data_types.iter()) + .map(|(x, type_)| x.map(|x| ScalarValue::List(Some(x), type_.clone()))) + .collect() } fn evaluate(&self) -> Result { @@ -214,42 +205,12 @@ impl Accumulator for DistinctCountAccumulator { #[cfg(test)] mod tests { - use super::*; - - use arrow::array::{ - ArrayRef, BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, - Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, - }; - use arrow::array::{Int32Builder, ListBuilder, UInt64Builder}; - use arrow::datatypes::DataType; - - macro_rules! build_list { - ($LISTS:expr, $BUILDER_TYPE:ident) => {{ - let mut builder = ListBuilder::new($BUILDER_TYPE::new(0)); - for list in $LISTS.iter() { - match list { - Some(values) => { - for value in values.iter() { - match value { - Some(v) => builder.values().append_value((*v).into())?, - None => builder.values().append_null()?, - } - } - - builder.append(true)?; - } - None => { - builder.append(false)?; - } - } - } + use std::iter::FromIterator; - let array = Arc::new(builder.finish()) as ArrayRef; + use super::*; - Ok(array) as Result - }}; - } + use arrow2::array::*; + use arrow2::datatypes::DataType; macro_rules! state_to_vec { ($LIST:expr, $DATA_TYPE:ident, $PRIM_TY:ty) => {{ @@ -333,7 +294,7 @@ mod tests { let agg = DistinctCount::new( arrays .iter() - .map(|a| a.as_any().downcast_ref::().unwrap()) + .map(|a| a.as_any().downcast_ref::>().unwrap()) .map(|a| a.values().data_type().clone()) .collect::>(), vec![], @@ -516,13 +477,14 @@ mod tests { Ok((state_vec, count)) }; - let zero_count_values = BooleanArray::from(Vec::::new()); + let zero_count_values = BooleanArray::from_slice(&[]); - let one_count_values = BooleanArray::from(vec![false, false]); + let one_count_values = BooleanArray::from_slice(&[false, false]); let one_count_values_with_null = BooleanArray::from(vec![Some(true), Some(true), None, None]); - let two_count_values = BooleanArray::from(vec![true, false, true, false, true]); + let two_count_values = + BooleanArray::from_slice(&[true, false, true, false, true]); let two_count_values_with_null = BooleanArray::from(vec![ Some(true), Some(false), @@ -583,8 +545,8 @@ mod tests { #[test] fn count_distinct_update_batch_multiple_columns() -> Result<()> { - let array_int8: ArrayRef = Arc::new(Int8Array::from(vec![1, 1, 2])); - let array_int16: ArrayRef = Arc::new(Int16Array::from(vec![3, 3, 4])); + let array_int8: ArrayRef = Arc::new(Int8Array::from_slice(&[1, 1, 2])); + let array_int16: ArrayRef = Arc::new(Int16Array::from_slice(&[3, 3, 4])); let arrays = vec![array_int8, array_int16]; let (states, result) = run_update_batch(&arrays)?; @@ -673,23 +635,20 @@ mod tests { #[test] fn count_distinct_merge_batch() -> Result<()> { - let state_in1 = build_list!( - vec![ - Some(vec![Some(-1_i32), Some(-1_i32), Some(-2_i32), Some(-2_i32)]), - Some(vec![Some(-2_i32), Some(-3_i32)]), - ], - Int32Builder - )?; - - let state_in2 = build_list!( - vec![ - Some(vec![Some(5_u64), Some(6_u64), Some(5_u64), Some(7_u64)]), - Some(vec![Some(5_u64), Some(7_u64)]), - ], - UInt64Builder - )?; - - let (states, result) = run_merge_batch(&[state_in1, state_in2])?; + let state_in1 = ListPrimitive::, i32>::from_iter(vec![ + Some(vec![Some(-1_i32), Some(-1_i32), Some(-2_i32), Some(-2_i32)]), + 
Some(vec![Some(-2_i32), Some(-3_i32)]), + ]) + .to(ListArray::default_datatype(DataType::Int32)); + + let state_in2 = ListPrimitive::, u64>::from_iter(vec![ + Some(vec![Some(5_u64), Some(6_u64), Some(5_u64), Some(7_u64)]), + Some(vec![Some(5_u64), Some(7_u64)]), + ]) + .to(ListArray::default_datatype(DataType::UInt64)); + + let (states, result) = + run_merge_batch(&[Arc::new(state_in1), Arc::new(state_in2)])?; let state_out_vec1 = state_to_vec!(&states[0], Int32, i32).unwrap(); let state_out_vec2 = state_to_vec!(&states[1], UInt64, u64).unwrap(); diff --git a/datafusion/src/physical_plan/empty.rs b/datafusion/src/physical_plan/empty.rs index 391a695f45014..1cb57b716d907 100644 --- a/datafusion/src/physical_plan/empty.rs +++ b/datafusion/src/physical_plan/empty.rs @@ -24,14 +24,17 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ memory::MemoryStream, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, }; -use arrow::array::NullArray; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; + +use arrow2::array::NullArray; +use arrow2::datatypes::{DataType, Field, Schema}; +use arrow2::record_batch::RecordBatch; use super::SendableRecordBatchStream; use async_trait::async_trait; +type SchemaRef = Arc; + /// Execution plan for empty relation (produces no rows) #[derive(Debug)] pub struct EmptyExec { @@ -109,7 +112,7 @@ impl ExecutionPlan for EmptyExec { DataType::Null, true, )])), - vec![Arc::new(NullArray::new(1))], + vec![Arc::new(NullArray::from_data(1))], )?] } else { vec![] diff --git a/datafusion/src/physical_plan/explain.rs b/datafusion/src/physical_plan/explain.rs index 3c5ef1af32366..801e72a24ea39 100644 --- a/datafusion/src/physical_plan/explain.rs +++ b/datafusion/src/physical_plan/explain.rs @@ -26,7 +26,9 @@ use crate::{ physical_plan::Partitioning, physical_plan::{common::SizedRecordBatchStream, DisplayFormatType, ExecutionPlan}, }; -use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch}; +use arrow2::{array::*, datatypes::Schema, record_batch::RecordBatch}; + +type SchemaRef = Arc; use super::SendableRecordBatchStream; use async_trait::async_trait; @@ -100,20 +102,19 @@ impl ExecutionPlan for ExplainExec { ))); } - let mut type_builder = StringBuilder::new(self.stringified_plans.len()); - let mut plan_builder = StringBuilder::new(self.stringified_plans.len()); + let mut type_builder = + Utf8Primitive::::with_capacity(self.stringified_plans.len()); + let mut plan_builder = + Utf8Primitive::::with_capacity(self.stringified_plans.len()); for p in &self.stringified_plans { - type_builder.append_value(&String::from(&p.plan_type))?; - plan_builder.append_value(&*p.plan)?; + type_builder.push(Some(&String::from(&p.plan_type).as_ref())); + plan_builder.push(Some(&p.plan.as_ref().as_ref())); } let record_batch = RecordBatch::try_new( self.schema.clone(), - vec![ - Arc::new(type_builder.finish()), - Arc::new(plan_builder.finish()), - ], + vec![Arc::new(type_builder.to()), Arc::new(plan_builder.to())], )?; Ok(Box::pin(SizedRecordBatchStream::new( diff --git a/datafusion/src/physical_plan/expressions/average.rs b/datafusion/src/physical_plan/expressions/average.rs index 6a6332042188f..ca94d344ef95a 100644 --- a/datafusion/src/physical_plan/expressions/average.rs +++ b/datafusion/src/physical_plan/expressions/average.rs @@ -24,13 +24,15 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use 
crate::scalar::ScalarValue; -use arrow::compute; -use arrow::datatypes::DataType; -use arrow::{ - array::{ArrayRef, UInt64Array}, +use arrow2::compute; +use arrow2::datatypes::DataType; +use arrow2::{ + array::{Array, UInt64Array}, datatypes::Field, }; +type ArrayRef = Arc; + use super::{format_state_name, sum}; /// AVG aggregate expression @@ -150,7 +152,7 @@ impl Accumulator for AvgAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let values = &values[0]; - self.count += (values.len() - values.data().null_count()) as u64; + self.count += (values.len() - values.null_count()) as u64; self.sum = sum::sum(&self.sum, &sum::sum_batch(values)?)?; Ok(()) } @@ -172,7 +174,7 @@ impl Accumulator for AvgAccumulator { fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { let counts = states[0].as_any().downcast_ref::().unwrap(); // counts are summed - self.count += compute::sum(counts).unwrap_or(0); + self.count += compute::aggregate::sum(counts).unwrap_or(0); // sums are summed self.sum = sum::sum(&self.sum, &sum::sum_batch(&states[1])?)?; @@ -196,12 +198,12 @@ mod tests { use super::*; use crate::physical_plan::expressions::col; use crate::{error::Result, generic_test_op}; - use arrow::record_batch::RecordBatch; - use arrow::{array::*, datatypes::*}; + use arrow2::record_batch::RecordBatch; + use arrow2::{array::*, datatypes::*}; #[test] fn avg_i32() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let a: ArrayRef = Arc::new(Int32Array::from_slice(&[1, 2, 3, 4, 5])); generic_test_op!( a, DataType::Int32, @@ -243,8 +245,7 @@ mod tests { #[test] fn avg_u32() -> Result<()> { - let a: ArrayRef = - Arc::new(UInt32Array::from(vec![1_u32, 2_u32, 3_u32, 4_u32, 5_u32])); + let a: ArrayRef = Arc::new(UInt32Array::from_slice(&[1, 2, 3, 4, 5])); generic_test_op!( a, DataType::UInt32, @@ -256,8 +257,9 @@ mod tests { #[test] fn avg_f32() -> Result<()> { - let a: ArrayRef = - Arc::new(Float32Array::from(vec![1_f32, 2_f32, 3_f32, 4_f32, 5_f32])); + let a: ArrayRef = Arc::new(Float32Array::from_slice(&[ + 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, + ])); generic_test_op!( a, DataType::Float32, @@ -269,8 +271,9 @@ mod tests { #[test] fn avg_f64() -> Result<()> { - let a: ArrayRef = - Arc::new(Float64Array::from(vec![1_f64, 2_f64, 3_f64, 4_f64, 5_f64])); + let a: ArrayRef = Arc::new(Float64Array::from_slice(&[ + 1_f64, 2_f64, 3_f64, 4_f64, 5_f64, + ])); generic_test_op!( a, DataType::Float64, diff --git a/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs index 5c2d9ce02f51f..a8d4d22db4aef 100644 --- a/datafusion/src/physical_plan/expressions/binary.rs +++ b/datafusion/src/physical_plan/expressions/binary.rs @@ -15,27 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-use std::{any::Any, sync::Arc}; - -use arrow::array::*; -use arrow::compute::kernels::arithmetic::{ - add, divide, divide_scalar, multiply, subtract, -}; -use arrow::compute::kernels::boolean::{and_kleene, or_kleene}; -use arrow::compute::kernels::comparison::{eq, gt, gt_eq, lt, lt_eq, neq}; -use arrow::compute::kernels::comparison::{ - eq_scalar, gt_eq_scalar, gt_scalar, lt_eq_scalar, lt_scalar, neq_scalar, -}; -use arrow::compute::kernels::comparison::{ - eq_utf8, gt_eq_utf8, gt_utf8, like_utf8, like_utf8_scalar, lt_eq_utf8, lt_utf8, - neq_utf8, nlike_utf8, nlike_utf8_scalar, -}; -use arrow::compute::kernels::comparison::{ - eq_utf8_scalar, gt_eq_utf8_scalar, gt_utf8_scalar, lt_eq_utf8_scalar, lt_utf8_scalar, - neq_utf8_scalar, -}; -use arrow::datatypes::{DataType, Schema, TimeUnit}; -use arrow::record_batch::RecordBatch; +use std::{any::Any, convert::TryInto, sync::Arc}; + +use arrow2::array::*; +use arrow2::compute; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; + +type StringArray = Utf8Array; use crate::error::{DataFusionError, Result}; use crate::logical_plan::Operator; @@ -85,157 +72,6 @@ impl std::fmt::Display for BinaryExpr { } } -/// Invoke a compute kernel on a pair of binary data arrays -macro_rules! compute_utf8_op { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $DT:ident) => {{ - let ll = $LEFT - .as_any() - .downcast_ref::<$DT>() - .expect("compute_op failed to downcast array"); - let rr = $RIGHT - .as_any() - .downcast_ref::<$DT>() - .expect("compute_op failed to downcast array"); - Ok(Arc::new(paste::expr! {[<$OP _utf8>]}(&ll, &rr)?)) - }}; -} - -/// Invoke a compute kernel on a data array and a scalar value -macro_rules! compute_utf8_op_scalar { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $DT:ident) => {{ - let ll = $LEFT - .as_any() - .downcast_ref::<$DT>() - .expect("compute_op failed to downcast array"); - if let ScalarValue::Utf8(Some(string_value)) = $RIGHT { - Ok(Arc::new(paste::expr! {[<$OP _utf8_scalar>]}( - &ll, - &string_value, - )?)) - } else { - Err(DataFusionError::Internal(format!( - "compute_utf8_op_scalar failed to cast literal value {}", - $RIGHT - ))) - } - }}; -} - -/// Invoke a compute kernel on a data array and a scalar value -macro_rules! compute_op_scalar { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $DT:ident) => {{ - use std::convert::TryInto; - let ll = $LEFT - .as_any() - .downcast_ref::<$DT>() - .expect("compute_op failed to downcast array"); - // generate the scalar function name, such as lt_scalar, from the $OP parameter - // (which could have a value of lt) and the suffix _scalar - Ok(Arc::new(paste::expr! {[<$OP _scalar>]}( - &ll, - $RIGHT.try_into()?, - )?)) - }}; -} - -/// Invoke a compute kernel on array(s) -macro_rules! compute_op { - // invoke binary operator - ($LEFT:expr, $RIGHT:expr, $OP:ident, $DT:ident) => {{ - let ll = $LEFT - .as_any() - .downcast_ref::<$DT>() - .expect("compute_op failed to downcast array"); - let rr = $RIGHT - .as_any() - .downcast_ref::<$DT>() - .expect("compute_op failed to downcast array"); - Ok(Arc::new($OP(&ll, &rr)?)) - }}; - // invoke unary operator - ($OPERAND:expr, $OP:ident, $DT:ident) => {{ - let operand = $OPERAND - .as_any() - .downcast_ref::<$DT>() - .expect("compute_op failed to downcast array"); - Ok(Arc::new($OP(&operand)?)) - }}; -} - -macro_rules! 
binary_string_array_op_scalar { - ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{ - let result: Result> = match $LEFT.data_type() { - DataType::Utf8 => compute_utf8_op_scalar!($LEFT, $RIGHT, $OP, StringArray), - other => Err(DataFusionError::Internal(format!( - "Data type {:?} not supported for scalar operation on string array", - other - ))), - }; - Some(result) - }}; -} - -macro_rules! binary_string_array_op { - ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{ - match $LEFT.data_type() { - DataType::Utf8 => compute_utf8_op!($LEFT, $RIGHT, $OP, StringArray), - other => Err(DataFusionError::Internal(format!( - "Data type {:?} not supported for binary operation on string arrays", - other - ))), - } - }}; -} - -/// Invoke a compute kernel on a pair of arrays -/// The binary_primitive_array_op macro only evaluates for primitive types -/// like integers and floats. -macro_rules! binary_primitive_array_op { - ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{ - match $LEFT.data_type() { - DataType::Int8 => compute_op!($LEFT, $RIGHT, $OP, Int8Array), - DataType::Int16 => compute_op!($LEFT, $RIGHT, $OP, Int16Array), - DataType::Int32 => compute_op!($LEFT, $RIGHT, $OP, Int32Array), - DataType::Int64 => compute_op!($LEFT, $RIGHT, $OP, Int64Array), - DataType::UInt8 => compute_op!($LEFT, $RIGHT, $OP, UInt8Array), - DataType::UInt16 => compute_op!($LEFT, $RIGHT, $OP, UInt16Array), - DataType::UInt32 => compute_op!($LEFT, $RIGHT, $OP, UInt32Array), - DataType::UInt64 => compute_op!($LEFT, $RIGHT, $OP, UInt64Array), - DataType::Float32 => compute_op!($LEFT, $RIGHT, $OP, Float32Array), - DataType::Float64 => compute_op!($LEFT, $RIGHT, $OP, Float64Array), - other => Err(DataFusionError::Internal(format!( - "Data type {:?} not supported for binary operation on primitive arrays", - other - ))), - } - }}; -} - -/// Invoke a compute kernel on an array and a scalar -/// The binary_primitive_array_op_scalar macro only evaluates for primitive -/// types like integers and floats. -macro_rules! binary_primitive_array_op_scalar { - ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{ - let result: Result> = match $LEFT.data_type() { - DataType::Int8 => compute_op_scalar!($LEFT, $RIGHT, $OP, Int8Array), - DataType::Int16 => compute_op_scalar!($LEFT, $RIGHT, $OP, Int16Array), - DataType::Int32 => compute_op_scalar!($LEFT, $RIGHT, $OP, Int32Array), - DataType::Int64 => compute_op_scalar!($LEFT, $RIGHT, $OP, Int64Array), - DataType::UInt8 => compute_op_scalar!($LEFT, $RIGHT, $OP, UInt8Array), - DataType::UInt16 => compute_op_scalar!($LEFT, $RIGHT, $OP, UInt16Array), - DataType::UInt32 => compute_op_scalar!($LEFT, $RIGHT, $OP, UInt32Array), - DataType::UInt64 => compute_op_scalar!($LEFT, $RIGHT, $OP, UInt64Array), - DataType::Float32 => compute_op_scalar!($LEFT, $RIGHT, $OP, Float32Array), - DataType::Float64 => compute_op_scalar!($LEFT, $RIGHT, $OP, Float64Array), - other => Err(DataFusionError::Internal(format!( - "Data type {:?} not supported for scalar operation on primitive array", - other - ))), - }; - Some(result) - }}; -} - /// The binary_array_op_scalar macro includes types that extend beyond the primitive, /// such as Utf8 strings. #[macro_export] @@ -252,12 +88,12 @@ macro_rules! 
binary_array_op_scalar { DataType::UInt64 => compute_op_scalar!($LEFT, $RIGHT, $OP, UInt64Array), DataType::Float32 => compute_op_scalar!($LEFT, $RIGHT, $OP, Float32Array), DataType::Float64 => compute_op_scalar!($LEFT, $RIGHT, $OP, Float64Array), - DataType::Utf8 => compute_utf8_op_scalar!($LEFT, $RIGHT, $OP, StringArray), + DataType::Utf8 => compute_utf8_op_scalar!($LEFT, $RIGHT, $OP, Utf8Array), DataType::Timestamp(TimeUnit::Nanosecond, None) => { - compute_op_scalar!($LEFT, $RIGHT, $OP, TimestampNanosecondArray) + compute_op_scalar!($LEFT, $RIGHT, $OP, Int64Array) } DataType::Date32 => { - compute_op_scalar!($LEFT, $RIGHT, $OP, Date32Array) + compute_op_scalar!($LEFT, $RIGHT, $OP, Int32Array) } other => Err(DataFusionError::Internal(format!( "Data type {:?} not supported for scalar operation on dyn array", @@ -276,8 +112,12 @@ macro_rules! binary_array_op { match $LEFT.data_type() { DataType::Int8 => compute_op!($LEFT, $RIGHT, $OP, Int8Array), DataType::Int16 => compute_op!($LEFT, $RIGHT, $OP, Int16Array), - DataType::Int32 => compute_op!($LEFT, $RIGHT, $OP, Int32Array), - DataType::Int64 => compute_op!($LEFT, $RIGHT, $OP, Int64Array), + DataType::Int32 | DataType::Date32 => { + compute_op!($LEFT, $RIGHT, $OP, Int32Array) + } + DataType::Int64 | DataType::Timestamp(_, None) | DataType::Date64 => { + compute_op!($LEFT, $RIGHT, $OP, Int64Array) + } DataType::UInt8 => compute_op!($LEFT, $RIGHT, $OP, UInt8Array), DataType::UInt16 => compute_op!($LEFT, $RIGHT, $OP, UInt16Array), DataType::UInt32 => compute_op!($LEFT, $RIGHT, $OP, UInt32Array), @@ -285,15 +125,6 @@ macro_rules! binary_array_op { DataType::Float32 => compute_op!($LEFT, $RIGHT, $OP, Float32Array), DataType::Float64 => compute_op!($LEFT, $RIGHT, $OP, Float64Array), DataType::Utf8 => compute_utf8_op!($LEFT, $RIGHT, $OP, StringArray), - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - compute_op!($LEFT, $RIGHT, $OP, TimestampNanosecondArray) - } - DataType::Date32 => { - compute_op!($LEFT, $RIGHT, $OP, Date32Array) - } - DataType::Date64 => { - compute_op!($LEFT, $RIGHT, $OP, Date64Array) - } other => Err(DataFusionError::Internal(format!( "Data type {:?} not supported for binary operation on dyn arrays", other @@ -304,19 +135,125 @@ macro_rules! binary_array_op { /// Invoke a boolean kernel on a pair of arrays macro_rules! 
boolean_op {
-    ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{
+    ($LEFT:expr, $RIGHT:expr, $OP:expr) => {{
         let ll = $LEFT
             .as_any()
-            .downcast_ref::()
+            .downcast_ref()
             .expect("boolean_op failed to downcast array");
         let rr = $RIGHT
             .as_any()
-            .downcast_ref::()
+            .downcast_ref()
             .expect("boolean_op failed to downcast array");
         Ok(Arc::new($OP(&ll, &rr)?))
     }};
 }
+fn to_arrow_comparison(op: &Operator) -> compute::comparison::Operator {
+    match op {
+        Operator::Eq => compute::comparison::Operator::Eq,
+        Operator::NotEq => compute::comparison::Operator::Neq,
+        Operator::Lt => compute::comparison::Operator::Lt,
+        Operator::LtEq => compute::comparison::Operator::LtEq,
+        Operator::Gt => compute::comparison::Operator::Gt,
+        Operator::GtEq => compute::comparison::Operator::GtEq,
+        _ => unreachable!(),
+    }
+}
+
+fn to_arrow_arithmetics(op: &Operator) -> compute::arithmetics::Operator {
+    match op {
+        Operator::Plus => compute::arithmetics::Operator::Add,
+        Operator::Minus => compute::arithmetics::Operator::Subtract,
+        Operator::Multiply => compute::arithmetics::Operator::Multiply,
+        Operator::Divide => compute::arithmetics::Operator::Divide,
+        _ => unreachable!(),
+    }
+}
+
+fn evaluate(lhs: &dyn Array, op: &Operator, rhs: &dyn Array) -> Result<Arc<dyn Array>> {
+    use Operator::*;
+    if matches!(op, Plus | Minus | Divide | Multiply) {
+        let op = to_arrow_arithmetics(op);
+        Ok(compute::arithmetics::arithmetic(lhs, op, rhs).map(|x| x.into())?)
+    } else if matches!(op, Eq | NotEq | Lt | LtEq | Gt | GtEq) {
+        let op = to_arrow_comparison(op);
+        Ok(compute::comparison::compare(lhs, rhs, op).map(Arc::new)?)
+    } else if matches!(op, Or) {
+        boolean_op!(lhs, rhs, compute::boolean_kleene::or)
+    } else if matches!(op, And) {
+        boolean_op!(lhs, rhs, compute::boolean_kleene::and)
+    } else {
+        //Operator::Like => binary_string_array_op!(left, right, like),
+        //Operator::NotLike => binary_string_array_op!(left, right, nlike),
+        // add remaining:
+        /*
+        Modulus,
+        Like,
+        NotLike,
+        */
+        todo!()
+    }
+}
+
+macro_rules! dyn_scalar {
+    ($lhs:expr, $op:expr, $rhs:expr, $ty:ty) => {{
+        Arc::new(compute::arithmetics::arithmetic_primitive_scalar::<$ty>(
+            $lhs.as_any().downcast_ref().unwrap(),
+            $op,
+            &$rhs.clone().try_into().unwrap(),
+        )?)
+    }};
+}
+
+fn evaluate_scalar(
+    lhs: &dyn Array,
+    op: &Operator,
+    rhs: &ScalarValue,
+) -> Result<Option<Arc<dyn Array>>> {
+    use Operator::*;
+    if matches!(op, Plus | Minus | Divide | Multiply) {
+        let op = to_arrow_arithmetics(op);
+        Ok(Some(match lhs.data_type() {
+            DataType::Int8 => dyn_scalar!(lhs, op, rhs, i8),
+            DataType::Int16 => dyn_scalar!(lhs, op, rhs, i16),
+            DataType::Int32 => dyn_scalar!(lhs, op, rhs, i32),
+            DataType::Int64 => dyn_scalar!(lhs, op, rhs, i64),
+            DataType::UInt8 => dyn_scalar!(lhs, op, rhs, u8),
+            DataType::UInt16 => dyn_scalar!(lhs, op, rhs, u16),
+            DataType::UInt32 => dyn_scalar!(lhs, op, rhs, u32),
+            DataType::UInt64 => dyn_scalar!(lhs, op, rhs, u64),
+            DataType::Float32 => dyn_scalar!(lhs, op, rhs, f32),
+            DataType::Float64 => dyn_scalar!(lhs, op, rhs, f64),
+            _ => {
+                return Err(DataFusionError::NotImplemented(
+                    "This operation is not yet implemented".to_string(),
+                ))
+            }
+        }))
+    } else {
+        Ok(None)
+    }
+}
+
+fn evaluate_inverse_scalar(
+    lhs: &ScalarValue,
+    op: &Operator,
+    rhs: &dyn Array,
+) -> Result<Option<Arc<dyn Array>>> {
+    use Operator::*;
+    // `lhs op rhs` with the scalar on the left is computed as `rhs op' lhs`,
+    // so each comparison is mirrored and the commutative operators are kept.
+    match op {
+        Lt => evaluate_scalar(rhs, &Gt, lhs),
+        Gt => evaluate_scalar(rhs, &Lt, lhs),
+        GtEq => evaluate_scalar(rhs, &LtEq, lhs),
+        LtEq => evaluate_scalar(rhs, &GtEq, lhs),
+        Eq => evaluate_scalar(rhs, &Eq, lhs),
+        NotEq => evaluate_scalar(rhs, &NotEq, lhs),
+        Plus => evaluate_scalar(rhs, &Plus, lhs),
+        Multiply => evaluate_scalar(rhs, &Multiply, lhs),
+        _ => Ok(None),
+    }
+}
+
 /// Coercion rules for all binary operators. Returns the output type
 /// of applying `op` to an argument of `lhs_type` and `rhs_type`.
 fn common_binary_type(
@@ -431,57 +368,16 @@ impl PhysicalExpr for BinaryExpr {
         let scalar_result = match (&left_value, &right_value) {
             (ColumnarValue::Array(array), ColumnarValue::Scalar(scalar)) => {
-                // if left is array and right is literal - use scalar operations
-                match &self.op {
-                    Operator::Lt => binary_array_op_scalar!(array, scalar.clone(), lt),
-                    Operator::LtEq => {
-                        binary_array_op_scalar!(array, scalar.clone(), lt_eq)
-                    }
-                    Operator::Gt => binary_array_op_scalar!(array, scalar.clone(), gt),
-                    Operator::GtEq => {
-                        binary_array_op_scalar!(array, scalar.clone(), gt_eq)
-                    }
-                    Operator::Eq => binary_array_op_scalar!(array, scalar.clone(), eq),
-                    Operator::NotEq => {
-                        binary_array_op_scalar!(array, scalar.clone(), neq)
-                    }
-                    Operator::Like => {
-                        binary_string_array_op_scalar!(array, scalar.clone(), like)
-                    }
-                    Operator::NotLike => {
-                        binary_string_array_op_scalar!(array, scalar.clone(), nlike)
-                    }
-                    Operator::Divide => {
-                        binary_primitive_array_op_scalar!(array, scalar.clone(), divide)
-                    }
-                    // if scalar operation is not supported - fallback to array implementation
-                    _ => None,
-                }
+                evaluate_scalar(array.as_ref(), &self.op, scalar)
             }
             (ColumnarValue::Scalar(scalar), ColumnarValue::Array(array)) => {
-                // if right is literal and left is array - reverse operator and parameters
-                match &self.op {
-                    Operator::Lt => binary_array_op_scalar!(array, scalar.clone(), gt),
-                    Operator::LtEq => {
-                        binary_array_op_scalar!(array, scalar.clone(), gt_eq)
-                    }
-                    Operator::Gt => binary_array_op_scalar!(array, scalar.clone(), lt),
-                    Operator::GtEq => {
-                        binary_array_op_scalar!(array, scalar.clone(), lt_eq)
-                    }
-                    Operator::Eq => binary_array_op_scalar!(array, scalar.clone(), eq),
-                    Operator::NotEq => {
-                        binary_array_op_scalar!(array, scalar.clone(), neq)
-                    }
-                    // if scalar operation is not supported - fallback to array implementation
-                    _ => None,
-                }
+                evaluate_inverse_scalar(scalar, &self.op, array.as_ref())
             }
-            (_,
_) => Ok(None), + }?; if let Some(result) = scalar_result { - return result.map(|a| ColumnarValue::Array(a)); + return Ok(ColumnarValue::Array(result)); } // if both arrays or both literals - extract arrays and continue execution @@ -490,45 +386,7 @@ impl PhysicalExpr for BinaryExpr { right_value.into_array(batch.num_rows()), ); - let result: Result = match &self.op { - Operator::Like => binary_string_array_op!(left, right, like), - Operator::NotLike => binary_string_array_op!(left, right, nlike), - Operator::Lt => binary_array_op!(left, right, lt), - Operator::LtEq => binary_array_op!(left, right, lt_eq), - Operator::Gt => binary_array_op!(left, right, gt), - Operator::GtEq => binary_array_op!(left, right, gt_eq), - Operator::Eq => binary_array_op!(left, right, eq), - Operator::NotEq => binary_array_op!(left, right, neq), - Operator::Plus => binary_primitive_array_op!(left, right, add), - Operator::Minus => binary_primitive_array_op!(left, right, subtract), - Operator::Multiply => binary_primitive_array_op!(left, right, multiply), - Operator::Divide => binary_primitive_array_op!(left, right, divide), - Operator::And => { - if left_data_type == DataType::Boolean { - boolean_op!(left, right, and_kleene) - } else { - return Err(DataFusionError::Internal(format!( - "Cannot evaluate binary expression {:?} with types {:?} and {:?}", - self.op, - left.data_type(), - right.data_type() - ))); - } - } - Operator::Or => { - if left_data_type == DataType::Boolean { - boolean_op!(left, right, or_kleene) - } else { - return Err(DataFusionError::Internal(format!( - "Cannot evaluate binary expression {:?} with types {:?} and {:?}", - self.op, left_data_type, right_data_type - ))); - } - } - Operator::Modulus => Err(DataFusionError::NotImplemented( - "Modulus operator is still not supported".to_string(), - )), - }; + let result = evaluate(left.as_ref(), &self.op, right.as_ref()); result.map(|a| ColumnarValue::Array(a)) } } @@ -567,8 +425,8 @@ pub fn binary( #[cfg(test)] mod tests { - use arrow::datatypes::{ArrowNumericType, Field, Int32Type, SchemaRef}; - use arrow::util::display::array_value_to_string; + use arrow2::datatypes::*; + use arrow2::{array::*, types::NativeType}; use super::*; use crate::error::Result; @@ -590,8 +448,8 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), ]); - let a = Int32Array::from(vec![1, 2, 3, 4, 5]); - let b = Int32Array::from(vec![1, 2, 4, 8, 16]); + let a = Int32Array::from_slice(&[1, 2, 3, 4, 5]); + let b = Int32Array::from_slice(&[1, 2, 4, 8, 16]); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])?; @@ -618,8 +476,8 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), ]); - let a = Int32Array::from(vec![2, 4, 6, 8, 10]); - let b = Int32Array::from(vec![2, 5, 4, 8, 8]); + let a = Int32Array::from_slice(&[2, 4, 6, 8, 10]); + let b = Int32Array::from_slice(&[2, 5, 4, 8, 8]); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])?; @@ -659,8 +517,8 @@ mod tests { Field::new("a", $A_TYPE, false), Field::new("b", $B_TYPE, false), ]); - let a = $A_ARRAY::from($A_VEC); - let b = $B_ARRAY::from($B_VEC); + let a = $A_ARRAY::from_slice(&$A_VEC); + let b = $B_ARRAY::from_slice(&$B_VEC); let batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![Arc::new(a), Arc::new(b)], @@ -756,7 +614,7 @@ mod tests { StringArray, DataType::Utf8, vec!["1994-12-13", "1995-01-26"], - Date32Array, + Int32Array, DataType::Date32, vec![9112, 
9156], Operator::Eq, @@ -768,7 +626,7 @@ mod tests { StringArray, DataType::Utf8, vec!["1994-12-13", "1995-01-26"], - Date32Array, + Int32Array, DataType::Date32, vec![9113, 9154], Operator::Lt, @@ -780,7 +638,7 @@ mod tests { StringArray, DataType::Utf8, vec!["1994-12-13T12:34:56", "1995-01-26T01:23:45"], - Date64Array, + Int64Array, DataType::Date64, vec![787322096000, 791083425000], Operator::Eq, @@ -792,7 +650,7 @@ mod tests { StringArray, DataType::Utf8, vec!["1994-12-13T12:34:56", "1995-01-26T01:23:45"], - Date64Array, + Int64Array, DataType::Date64, vec![787322096001, 791083424999], Operator::Lt, @@ -815,16 +673,12 @@ mod tests { DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); let string_type = DataType::Utf8; - // build dictionary - let keys_builder = PrimitiveBuilder::::new(10); - let values_builder = arrow::array::StringBuilder::new(10); - let mut dict_builder = StringDictionaryBuilder::new(keys_builder, values_builder); - - dict_builder.append("one")?; - dict_builder.append_null()?; - dict_builder.append("three")?; - dict_builder.append("four")?; - let dict_array = dict_builder.finish(); + let dict = DictionaryPrimitive::, &str>::with_capacity(0); + dict.push(Some(&"one".as_ref())); + dict.push(None); + dict.push(Some(&"three".as_ref())); + dict.push(Some(&"four".as_ref())); + let dict_array = dict.to(dict_type.clone()); let str_array = StringArray::from(vec![Some("not one"), Some("two"), None, Some("four")]); @@ -839,7 +693,7 @@ mod tests { vec![Arc::new(dict_array), Arc::new(str_array)], )?; - let expected = "false\n\n\ntrue"; + let expected = BooleanArray::from(&[Some(false), None, Some(true)]); // Test 1: dict = str @@ -852,7 +706,7 @@ mod tests { assert_eq!(result.data_type(), &DataType::Boolean); // verify that the result itself is correct - assert_eq!(expected, array_to_string(&result)?); + assert_eq!(expected, result.as_ref()); // Test 2: now test the other direction // str = dict @@ -866,34 +720,25 @@ mod tests { assert_eq!(result.data_type(), &DataType::Boolean); // verify that the result itself is correct - assert_eq!(expected, array_to_string(&result)?); + assert_eq!(expected, result.as_ref()); Ok(()) } - // Convert the array to a newline delimited string of pretty printed values - fn array_to_string(array: &ArrayRef) -> Result { - let s = (0..array.len()) - .map(|i| array_value_to_string(array, i)) - .collect::, arrow::error::ArrowError>>()? 
- .join("\n"); - Ok(s) - } - #[test] fn plus_op() -> Result<()> { let schema = Schema::new(vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), ]); - let a = Int32Array::from(vec![1, 2, 3, 4, 5]); - let b = Int32Array::from(vec![1, 2, 4, 8, 16]); + let a = Int32Array::from_slice(&[1, 2, 3, 4, 5]); + let b = Int32Array::from_slice(&[1, 2, 4, 8, 16]); - apply_arithmetic::( + apply_arithmetic::( Arc::new(schema), vec![Arc::new(a), Arc::new(b)], Operator::Plus, - Int32Array::from(vec![2, 4, 7, 12, 21]), + Int32Array::from_slice(&[2, 4, 7, 12, 21]), )?; Ok(()) @@ -905,22 +750,22 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), ])); - let a = Arc::new(Int32Array::from(vec![1, 2, 4, 8, 16])); - let b = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let a = Arc::new(Int32Array::from_slice(&[1, 2, 4, 8, 16])); + let b = Arc::new(Int32Array::from_slice(&[1, 2, 3, 4, 5])); - apply_arithmetic::( + apply_arithmetic::( schema.clone(), vec![a.clone(), b.clone()], Operator::Minus, - Int32Array::from(vec![0, 0, 1, 4, 11]), + Int32Array::from_slice(&[0, 0, 1, 4, 11]), )?; // should handle have negative values in result (for signed) - apply_arithmetic::( + apply_arithmetic::( schema, vec![b, a], Operator::Minus, - Int32Array::from(vec![0, 0, -1, -4, -11]), + Int32Array::from_slice(&[0, 0, -1, -4, -11]), )?; Ok(()) @@ -932,14 +777,14 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), ])); - let a = Arc::new(Int32Array::from(vec![4, 8, 16, 32, 64])); - let b = Arc::new(Int32Array::from(vec![2, 4, 8, 16, 32])); + let a = Arc::new(Int32Array::from_slice(&[4, 8, 16, 32, 64])); + let b = Arc::new(Int32Array::from_slice(&[2, 4, 8, 16, 32])); - apply_arithmetic::( + apply_arithmetic::( schema, vec![a, b], Operator::Multiply, - Int32Array::from(vec![8, 32, 128, 512, 2048]), + Int32Array::from_slice(&[8, 32, 128, 512, 2048]), )?; Ok(()) @@ -951,22 +796,22 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), ])); - let a = Arc::new(Int32Array::from(vec![8, 32, 128, 512, 2048])); - let b = Arc::new(Int32Array::from(vec![2, 4, 8, 16, 32])); + let a = Arc::new(Int32Array::from_slice(&[8, 32, 128, 512, 2048])); + let b = Arc::new(Int32Array::from_slice(&[2, 4, 8, 16, 32])); - apply_arithmetic::( + apply_arithmetic::( schema, vec![a, b], Operator::Divide, - Int32Array::from(vec![4, 8, 16, 32, 64]), + Int32Array::from_slice(&[4, 8, 16, 32, 64]), )?; Ok(()) } - fn apply_arithmetic( - schema: SchemaRef, - data: Vec, + fn apply_arithmetic( + schema: Arc, + data: Vec>, op: Operator, expected: PrimitiveArray, ) -> Result<()> { @@ -974,23 +819,23 @@ mod tests { let batch = RecordBatch::try_new(schema, data)?; let result = arithmetic_op.evaluate(&batch)?.into_array(batch.num_rows()); - assert_eq!(result.as_ref(), &expected); + assert_eq!(expected, result.as_ref()); Ok(()) } fn apply_logic_op( - schema: SchemaRef, + schema: Arc, left: BooleanArray, right: BooleanArray, op: Operator, expected: BooleanArray, ) -> Result<()> { let arithmetic_op = binary_simple(col("a"), op, col("b")); - let data: Vec = vec![Arc::new(left), Arc::new(right)]; + let data: Vec> = vec![Arc::new(left), Arc::new(right)]; let batch = RecordBatch::try_new(schema, data)?; let result = arithmetic_op.evaluate(&batch)?.into_array(batch.num_rows()); - assert_eq!(result.as_ref(), &expected); + assert_eq!(expected, result.as_ref()); Ok(()) } diff --git 
a/datafusion/src/physical_plan/expressions/case.rs b/datafusion/src/physical_plan/expressions/case.rs index 95ae5325af119..91bba0285f23f 100644 --- a/datafusion/src/physical_plan/expressions/case.rs +++ b/datafusion/src/physical_plan/expressions/case.rs @@ -17,12 +17,16 @@ use std::{any::Any, sync::Arc}; +use arrow2::array::*; +use arrow2::compute::comparison; +use arrow2::compute::if_then_else; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; + use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ColumnarValue, PhysicalExpr}; -use arrow::array::{self, *}; -use arrow::compute::{eq, eq_utf8}; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; + +use super::ArrayRef; /// The CASE expression is similar to a series of nested if/else and there are two forms that /// can be used. The first form consists of a series of boolean "when" expressions with @@ -103,201 +107,6 @@ impl CaseExpr { } } -macro_rules! if_then_else { - ($BUILDER_TYPE:ty, $ARRAY_TYPE:ty, $BOOLS:expr, $TRUE:expr, $FALSE:expr) => {{ - let true_values = $TRUE - .as_ref() - .as_any() - .downcast_ref::<$ARRAY_TYPE>() - .expect("true_values downcast failed"); - - let false_values = $FALSE - .as_ref() - .as_any() - .downcast_ref::<$ARRAY_TYPE>() - .expect("false_values downcast failed"); - - let mut builder = <$BUILDER_TYPE>::new($BOOLS.len()); - for i in 0..$BOOLS.len() { - if $BOOLS.is_null(i) { - if false_values.is_null(i) { - builder.append_null()?; - } else { - builder.append_value(false_values.value(i))?; - } - } else if $BOOLS.value(i) { - if true_values.is_null(i) { - builder.append_null()?; - } else { - builder.append_value(true_values.value(i))?; - } - } else { - if false_values.is_null(i) { - builder.append_null()?; - } else { - builder.append_value(false_values.value(i))?; - } - } - } - Ok(Arc::new(builder.finish())) - }}; -} - -fn if_then_else( - bools: &BooleanArray, - true_values: ArrayRef, - false_values: ArrayRef, - data_type: &DataType, -) -> Result { - match data_type { - DataType::UInt8 => if_then_else!( - array::UInt8Builder, - array::UInt8Array, - bools, - true_values, - false_values - ), - DataType::UInt16 => if_then_else!( - array::UInt16Builder, - array::UInt16Array, - bools, - true_values, - false_values - ), - DataType::UInt32 => if_then_else!( - array::UInt32Builder, - array::UInt32Array, - bools, - true_values, - false_values - ), - DataType::UInt64 => if_then_else!( - array::UInt64Builder, - array::UInt64Array, - bools, - true_values, - false_values - ), - DataType::Int8 => if_then_else!( - array::Int8Builder, - array::Int8Array, - bools, - true_values, - false_values - ), - DataType::Int16 => if_then_else!( - array::Int16Builder, - array::Int16Array, - bools, - true_values, - false_values - ), - DataType::Int32 => if_then_else!( - array::Int32Builder, - array::Int32Array, - bools, - true_values, - false_values - ), - DataType::Int64 => if_then_else!( - array::Int64Builder, - array::Int64Array, - bools, - true_values, - false_values - ), - DataType::Float32 => if_then_else!( - array::Float32Builder, - array::Float32Array, - bools, - true_values, - false_values - ), - DataType::Float64 => if_then_else!( - array::Float64Builder, - array::Float64Array, - bools, - true_values, - false_values - ), - DataType::Utf8 => if_then_else!( - array::StringBuilder, - array::StringArray, - bools, - true_values, - false_values - ), - other => Err(DataFusionError::Execution(format!( - "CASE does not support '{:?}'", - other - ))), - } -} - 
-macro_rules! array_equals { - ($TY:ty, $L:expr, $R:expr, $eq_fn:expr) => {{ - let when_value = $L - .as_ref() - .as_any() - .downcast_ref::<$TY>() - .expect("array_equals downcast failed"); - - let base_value = $R - .as_ref() - .as_any() - .downcast_ref::<$TY>() - .expect("array_equals downcast failed"); - - $eq_fn(when_value, base_value).map_err(DataFusionError::from) - }}; -} - -fn array_equals( - data_type: &DataType, - when_value: ArrayRef, - base_value: ArrayRef, -) -> Result { - match data_type { - DataType::UInt8 => { - array_equals!(array::UInt8Array, when_value, base_value, eq) - } - DataType::UInt16 => { - array_equals!(array::UInt16Array, when_value, base_value, eq) - } - DataType::UInt32 => { - array_equals!(array::UInt32Array, when_value, base_value, eq) - } - DataType::UInt64 => { - array_equals!(array::UInt64Array, when_value, base_value, eq) - } - DataType::Int8 => { - array_equals!(array::Int8Array, when_value, base_value, eq) - } - DataType::Int16 => { - array_equals!(array::Int16Array, when_value, base_value, eq) - } - DataType::Int32 => { - array_equals!(array::Int32Array, when_value, base_value, eq) - } - DataType::Int64 => { - array_equals!(array::Int64Array, when_value, base_value, eq) - } - DataType::Float32 => { - array_equals!(array::Float32Array, when_value, base_value, eq) - } - DataType::Float64 => { - array_equals!(array::Float64Array, when_value, base_value, eq) - } - DataType::Utf8 => { - array_equals!(array::StringArray, when_value, base_value, eq_utf8) - } - other => Err(DataFusionError::Execution(format!( - "CASE does not support '{:?}'", - other - ))), - } -} - impl CaseExpr { /// This function evaluates the form of CASE that matches an expression to fixed values. /// @@ -317,7 +126,7 @@ impl CaseExpr { let mut current_value: Option = if let Some(e) = &self.else_expr { Some(e.evaluate(batch)?.into_array(batch.num_rows())) } else { - Some(new_null_array(&return_type, batch.num_rows())) + Some(new_null_array(return_type, batch.num_rows()).into()) }; // walk backwards through the when/then expressions @@ -331,14 +140,20 @@ impl CaseExpr { let then_value = then_value.into_array(batch.num_rows()); // build boolean array representing which rows match the "when" value - let when_match = array_equals(&base_type, when_value, base_value.clone())?; - - current_value = Some(if_then_else( - &when_match, - then_value, - current_value.unwrap(), - &return_type, - )?); + let when_match = comparison::compare( + when_value.as_ref(), + base_value.as_ref(), + comparison::Operator::Eq, + )?; + + current_value = Some( + if_then_else::if_then_else( + &when_match, + then_value.as_ref(), + current_value.unwrap().as_ref(), + )? + .into(), + ); } Ok(ColumnarValue::Array(current_value.unwrap())) @@ -358,7 +173,7 @@ impl CaseExpr { let mut current_value: Option = if let Some(e) = &self.else_expr { Some(e.evaluate(batch)?.into_array(batch.num_rows())) } else { - Some(new_null_array(&return_type, batch.num_rows())) + Some(new_null_array(return_type, batch.num_rows()).into()) }; // walk backwards through the when/then expressions @@ -376,12 +191,14 @@ impl CaseExpr { let then_value = self.when_then_expr[i].1.evaluate(batch)?; let then_value = then_value.into_array(batch.num_rows()); - current_value = Some(if_then_else( - &when_value, - then_value, - current_value.unwrap(), - &return_type, - )?); + current_value = Some( + if_then_else::if_then_else( + &when_value, + then_value.as_ref(), + current_value.unwrap().as_ref(), + )? 
+ .into(), + ); } Ok(ColumnarValue::Array(current_value.unwrap())) @@ -445,8 +262,8 @@ mod tests { physical_plan::expressions::{binary, col, lit}, scalar::ScalarValue, }; - use arrow::array::StringArray; - use arrow::datatypes::*; + use arrow2::array::Utf8Array; + use arrow2::datatypes::*; #[test] fn case_with_expr() -> Result<()> { @@ -574,7 +391,7 @@ mod tests { fn case_test_batch() -> Result { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); - let a = StringArray::from(vec![Some("foo"), Some("baz"), None, Some("bar")]); + let a = Utf8Array::::from(vec![Some("foo"), Some("baz"), None, Some("bar")]); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?; Ok(batch) } diff --git a/datafusion/src/physical_plan/expressions/cast.rs b/datafusion/src/physical_plan/expressions/cast.rs index ba395f54d917c..451b676414f3e 100644 --- a/datafusion/src/physical_plan/expressions/cast.rs +++ b/datafusion/src/physical_plan/expressions/cast.rs @@ -23,15 +23,9 @@ use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use crate::physical_plan::PhysicalExpr; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::compute::kernels; -use arrow::compute::CastOptions; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; -use compute::can_cast_types; - -/// provide Datafusion default cast options -pub const DEFAULT_DATAFUSION_CAST_OPTIONS: CastOptions = CastOptions { safe: false }; +use arrow2::compute::cast; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; /// CAST expression casts an expression to a specific data type and returns a runtime error on invalid cast #[derive(Debug)] @@ -40,22 +34,12 @@ pub struct CastExpr { expr: Arc, /// The data type to cast to cast_type: DataType, - /// Cast options - cast_options: CastOptions, } impl CastExpr { /// Create a new CastExpr - pub fn new( - expr: Arc, - cast_type: DataType, - cast_options: CastOptions, - ) -> Self { - Self { - expr, - cast_type, - cast_options, - } + pub fn new(expr: Arc, cast_type: DataType) -> Self { + Self { expr, cast_type } } /// The expression to cast @@ -92,20 +76,13 @@ impl PhysicalExpr for CastExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { let value = self.expr.evaluate(batch)?; match value { - ColumnarValue::Array(array) => { - Ok(ColumnarValue::Array(kernels::cast::cast_with_options( - &array, - &self.cast_type, - &self.cast_options, - )?)) - } + ColumnarValue::Array(array) => Ok(ColumnarValue::Array( + cast::cast(array.as_ref(), &self.cast_type)?.into(), + )), ColumnarValue::Scalar(scalar) => { let scalar_array = scalar.to_array(); - let cast_array = kernels::cast::cast_with_options( - &scalar_array, - &self.cast_type, - &self.cast_options, - )?; + let cast_array = + cast::cast(scalar_array.as_ref(), &self.cast_type)?.into(); let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; Ok(ColumnarValue::Scalar(cast_scalar)) } @@ -121,13 +98,12 @@ pub fn cast_with_options( expr: Arc, input_schema: &Schema, cast_type: DataType, - cast_options: CastOptions, ) -> Result> { let expr_type = expr.data_type(input_schema)?; if expr_type == cast_type { Ok(expr.clone()) - } else if can_cast_types(&expr_type, &cast_type) { - Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options))) + } else if cast::can_cast_types(&expr_type, &cast_type) { + Ok(Arc::new(CastExpr::new(expr, cast_type))) } else { Err(DataFusionError::Internal(format!( "Unsupported CAST from {:?} to {:?}", @@ -145,12 +121,7 @@ pub fn cast( 
input_schema: &Schema, cast_type: DataType, ) -> Result> { - cast_with_options( - expr, - input_schema, - cast_type, - DEFAULT_DATAFUSION_CAST_OPTIONS, - ) + cast_with_options(expr, input_schema, cast_type) } #[cfg(test)] @@ -158,11 +129,9 @@ mod tests { use super::*; use crate::error::Result; use crate::physical_plan::expressions::col; - use arrow::array::{StringArray, Time64NanosecondArray}; - use arrow::{ - array::{Array, Int32Array, Int64Array, TimestampNanosecondArray, UInt32Array}, - datatypes::*, - }; + use arrow2::{array::*, datatypes::*}; + + type StringArray = Utf8Array; // runs an end-to-end test of physical type cast // 1. construct a record batch with a column "a" of type A @@ -171,14 +140,14 @@ mod tests { // 4. verify that the resulting expression is of type B // 5. verify that the resulting values are downcastable and correct macro_rules! generic_test_cast { - ($A_ARRAY:ident, $A_TYPE:expr, $A_VEC:expr, $TYPEARRAY:ident, $TYPE:expr, $VEC:expr, $CAST_OPTIONS:expr) => {{ + ($A_ARRAY:ident, $A_TYPE:expr, $A_VEC:expr, $TYPEARRAY:ident, $TYPE:expr, $VEC:expr) => {{ let schema = Schema::new(vec![Field::new("a", $A_TYPE, false)]); - let a = $A_ARRAY::from($A_VEC); + let a = $A_ARRAY::from_slice(&$A_VEC); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; // verify that we can construct the expression - let expression = cast_with_options(col("a"), &schema, $TYPE, $CAST_OPTIONS)?; + let expression = cast_with_options(col("a"), &schema, $TYPE)?; // verify that its display is correct assert_eq!(format!("CAST(a AS {:?})", $TYPE), format!("{}", expression)); @@ -225,8 +194,7 @@ mod tests { Some(3_u32), Some(4_u32), Some(5_u32) - ], - DEFAULT_DATAFUSION_CAST_OPTIONS + ] ); Ok(()) } @@ -239,8 +207,7 @@ mod tests { vec![1, 2, 3, 4, 5], StringArray, DataType::Utf8, - vec![Some("1"), Some("2"), Some("3"), Some("4"), Some("5")], - DEFAULT_DATAFUSION_CAST_OPTIONS + vec![Some("1"), Some("2"), Some("3"), Some("4"), Some("5")] ); Ok(()) } @@ -249,18 +216,14 @@ mod tests { #[test] fn test_cast_i64_t64() -> Result<()> { let original = vec![1, 2, 3, 4, 5]; - let expected: Vec> = original - .iter() - .map(|i| Some(Time64NanosecondArray::from(vec![*i]).value(0))) - .collect(); + let expected: Vec> = original.iter().map(|i| Some(*i)).collect(); generic_test_cast!( Int64Array, DataType::Int64, original.clone(), - TimestampNanosecondArray, + Int64Array, DataType::Timestamp(TimeUnit::Nanosecond, None), - expected, - DEFAULT_DATAFUSION_CAST_OPTIONS + expected ); Ok(()) } @@ -278,21 +241,16 @@ mod tests { fn invalid_cast_with_options_error() -> Result<()> { // Ensure a useful error happens at plan time if invalid casts are used let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]); - let a = StringArray::from(vec!["9.1"]); + let a = StringArray::from_slice(&["9.1"]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - let expression = cast_with_options( - col("a"), - &schema, - DataType::Int32, - DEFAULT_DATAFUSION_CAST_OPTIONS, - )?; + let expression = cast_with_options(col("a"), &schema, DataType::Int32)?; let result = expression.evaluate(&batch); match result { Ok(_) => panic!("expected error"), Err(e) => { assert!(e.to_string().contains( - "Cast error: Cannot cast string '9.1' to value of arrow::datatypes::types::Int32Type type" + "Cast error: Cannot cast string '9.1' to value of arrow2::datatypes::types::Int32Type type" )) } } diff --git a/datafusion/src/physical_plan/expressions/coercion.rs 
b/datafusion/src/physical_plan/expressions/coercion.rs index e9949f5199e88..73470d5428492 100644 --- a/datafusion/src/physical_plan/expressions/coercion.rs +++ b/datafusion/src/physical_plan/expressions/coercion.rs @@ -17,7 +17,7 @@ //! Coercion rules used to coerce types to match existing expressions' implementations -use arrow::datatypes::DataType; +use arrow2::datatypes::DataType; /// Determine if a DataType is signed numeric or not pub fn is_signed_numeric(dt: &DataType) -> bool { @@ -79,7 +79,7 @@ pub fn dictionary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; + use arrow2::datatypes::DataType::*; match (lhs_type, rhs_type) { (Utf8, Utf8) => Some(Utf8), (LargeUtf8, Utf8) => Some(LargeUtf8), @@ -92,7 +92,7 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; + use arrow2::datatypes::DataType::*; match (lhs_type, rhs_type) { (Utf8, Date32) => Some(Date32), (Date32, Utf8) => Some(Date32), @@ -106,7 +106,7 @@ pub fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; + use arrow2::datatypes::DataType::*; // error on any non-numeric type if !is_numeric(lhs_type) || !is_numeric(rhs_type) { diff --git a/datafusion/src/physical_plan/expressions/column.rs b/datafusion/src/physical_plan/expressions/column.rs index 7e0304e51fe73..de64f8fed98a9 100644 --- a/datafusion/src/physical_plan/expressions/column.rs +++ b/datafusion/src/physical_plan/expressions/column.rs @@ -19,7 +19,7 @@ use std::sync::Arc; -use arrow::{ +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; diff --git a/datafusion/src/physical_plan/expressions/count.rs b/datafusion/src/physical_plan/expressions/count.rs index 4a3fbe4fa7d3d..4390fb52d2798 100644 --- a/datafusion/src/physical_plan/expressions/count.rs +++ b/datafusion/src/physical_plan/expressions/count.rs @@ -20,15 +20,13 @@ use std::any::Any; use std::sync::Arc; +use super::ArrayRef; use crate::error::Result; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::datatypes::DataType; -use arrow::{ - array::{ArrayRef, UInt64Array}, - datatypes::Field, -}; +use arrow2::compute; +use arrow2::datatypes::DataType; +use arrow2::{array::UInt64Array, datatypes::Field}; use super::format_state_name; @@ -104,7 +102,7 @@ impl CountAccumulator { impl Accumulator for CountAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let array = &values[0]; - self.count += (array.len() - array.data().null_count()) as u64; + self.count += (array.len() - array.null_count()) as u64; Ok(()) } @@ -128,7 +126,7 @@ impl Accumulator for CountAccumulator { fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { let counts = states[0].as_any().downcast_ref::().unwrap(); - let delta = &compute::sum(counts); + let delta = &compute::aggregate::sum(counts); if let Some(d) = delta { self.count += *d; } @@ -150,12 +148,12 @@ mod tests { use crate::physical_plan::expressions::col; use crate::physical_plan::expressions::tests::aggregate; use crate::{error::Result, generic_test_op}; - use arrow::record_batch::RecordBatch; - use arrow::{array::*, datatypes::*}; + use arrow2::record_batch::RecordBatch; + use arrow2::{array::*, datatypes::*}; #[test] fn count_elements() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let a: ArrayRef = 
Arc::new(Int32Array::from_slice(&[1, 2, 3, 4, 5])); generic_test_op!( a, DataType::Int32, @@ -200,8 +198,7 @@ mod tests { #[test] fn count_empty() -> Result<()> { - let a: Vec = vec![]; - let a: ArrayRef = Arc::new(BooleanArray::from(a)); + let a: ArrayRef = Arc::new(BooleanArray::new_empty()); generic_test_op!( a, DataType::Boolean, @@ -213,8 +210,9 @@ mod tests { #[test] fn count_utf8() -> Result<()> { - let a: ArrayRef = - Arc::new(StringArray::from(vec!["a", "bb", "ccc", "dddd", "ad"])); + let a: ArrayRef = Arc::new(Utf8Array::::from_slice(&[ + "a", "bb", "ccc", "dddd", "ad", + ])); generic_test_op!( a, DataType::Utf8, @@ -226,8 +224,9 @@ mod tests { #[test] fn count_large_utf8() -> Result<()> { - let a: ArrayRef = - Arc::new(LargeStringArray::from(vec!["a", "bb", "ccc", "dddd", "ad"])); + let a: ArrayRef = Arc::new(Utf8Array::::from_slice(&[ + "a", "bb", "ccc", "dddd", "ad", + ])); generic_test_op!( a, DataType::LargeUtf8, diff --git a/datafusion/src/physical_plan/expressions/in_list.rs b/datafusion/src/physical_plan/expressions/in_list.rs index 41f111006ea2a..cab06344793e9 100644 --- a/datafusion/src/physical_plan/expressions/in_list.rs +++ b/datafusion/src/physical_plan/expressions/in_list.rs @@ -20,17 +20,15 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::GenericStringArray; -use arrow::array::{ - ArrayRef, BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, - Int64Array, Int8Array, StringOffsetSizeTrait, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, -}; -use arrow::{ +use arrow2::array::Utf8Array; +use arrow2::array::*; +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; +use super::ArrayRef; + use crate::error::Result; use crate::physical_plan::{ColumnarValue, PhysicalExpr}; use crate::scalar::ScalarValue; @@ -130,16 +128,13 @@ impl InListExpr { /// Compare for specific utf8 types #[allow(clippy::unnecessary_wraps)] - fn compare_utf8( + fn compare_utf8( &self, array: ArrayRef, list_values: Vec, negated: bool, ) -> Result { - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); + let array = array.as_any().downcast_ref::>().unwrap(); let mut contains_null = false; let values = list_values @@ -288,7 +283,9 @@ pub fn in_list( #[cfg(test)] mod tests { - use arrow::{array::StringArray, datatypes::Field}; + use arrow2::{array::Utf8Array, datatypes::Field}; + + type StringArray = Utf8Array; use super::*; use crate::error::Result; diff --git a/datafusion/src/physical_plan/expressions/is_not_null.rs b/datafusion/src/physical_plan/expressions/is_not_null.rs index 7ac2110b50221..16694b384accf 100644 --- a/datafusion/src/physical_plan/expressions/is_not_null.rs +++ b/datafusion/src/physical_plan/expressions/is_not_null.rs @@ -19,8 +19,8 @@ use std::{any::Any, sync::Arc}; -use arrow::compute; -use arrow::{ +use arrow2::compute; +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; @@ -71,7 +71,7 @@ impl PhysicalExpr for IsNotNullExpr { let arg = self.arg.evaluate(batch)?; match arg { ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new( - compute::is_not_null(array.as_ref())?, + compute::boolean::is_not_null(array.as_ref()), ))), ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar( ScalarValue::Boolean(Some(!scalar.is_null())), @@ -89,13 +89,15 @@ pub fn is_not_null(arg: Arc) -> Result> mod tests { use super::*; use crate::physical_plan::expressions::col; - use arrow::{ - array::{BooleanArray, StringArray}, + use arrow2::{ + array::{BooleanArray, Utf8Array}, datatypes::*, 
record_batch::RecordBatch, }; use std::sync::Arc; + type StringArray = Utf8Array; + #[test] fn is_not_null_op() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); @@ -110,7 +112,7 @@ mod tests { .downcast_ref::() .expect("failed to downcast to BooleanArray"); - let expected = &BooleanArray::from(vec![true, false]); + let expected = &BooleanArray::from_slice(&[true, false]); assert_eq!(expected, result); diff --git a/datafusion/src/physical_plan/expressions/is_null.rs b/datafusion/src/physical_plan/expressions/is_null.rs index dfa53f3f7d264..2fb4acbbd3171 100644 --- a/datafusion/src/physical_plan/expressions/is_null.rs +++ b/datafusion/src/physical_plan/expressions/is_null.rs @@ -19,8 +19,8 @@ use std::{any::Any, sync::Arc}; -use arrow::compute; -use arrow::{ +use arrow2::compute; +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; @@ -71,7 +71,7 @@ impl PhysicalExpr for IsNullExpr { let arg = self.arg.evaluate(batch)?; match arg { ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new( - compute::is_null(array.as_ref())?, + compute::boolean::is_null(array.as_ref()), ))), ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar( ScalarValue::Boolean(Some(scalar.is_null())), @@ -89,13 +89,15 @@ pub fn is_null(arg: Arc) -> Result> { mod tests { use super::*; use crate::physical_plan::expressions::col; - use arrow::{ - array::{BooleanArray, StringArray}, + use arrow2::{ + array::{BooleanArray, Utf8Array}, datatypes::*, record_batch::RecordBatch, }; use std::sync::Arc; + type StringArray = Utf8Array; + #[test] fn is_null_op() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); @@ -110,7 +112,7 @@ mod tests { .downcast_ref::() .expect("failed to downcast to BooleanArray"); - let expected = &BooleanArray::from(vec![false, true]); + let expected = &BooleanArray::from_slice(&[false, true]); assert_eq!(expected, result); diff --git a/datafusion/src/physical_plan/expressions/literal.rs b/datafusion/src/physical_plan/expressions/literal.rs index 3110d39c87e0b..0bf71d63d89b6 100644 --- a/datafusion/src/physical_plan/expressions/literal.rs +++ b/datafusion/src/physical_plan/expressions/literal.rs @@ -20,7 +20,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::{ +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; @@ -80,8 +80,8 @@ pub fn lit(value: ScalarValue) -> Arc { mod tests { use super::*; use crate::error::Result; - use arrow::array::Int32Array; - use arrow::datatypes::*; + use arrow2::array::*; + use arrow2::datatypes::*; #[test] fn literal_i32() -> Result<()> { diff --git a/datafusion/src/physical_plan/expressions/min_max.rs b/datafusion/src/physical_plan/expressions/min_max.rs index ea917d30d940d..e1e299b2e7b56 100644 --- a/datafusion/src/physical_plan/expressions/min_max.rs +++ b/datafusion/src/physical_plan/expressions/min_max.rs @@ -21,20 +21,17 @@ use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; +use arrow2::array::*; +use arrow2::compute::aggregate::*; +use arrow2::datatypes::*; + use crate::error::{DataFusionError, Result}; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::datatypes::{DataType, TimeUnit}; -use arrow::{ - array::{ - ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, LargeStringArray, StringArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, - 
UInt16Array, UInt32Array, UInt64Array, UInt8Array, - }, - datatypes::Field, -}; + +type StringArray = Utf8Array; +type LargeStringArray = Utf8Array; +type ArrayRef = Arc; use super::format_state_name; @@ -48,7 +45,7 @@ pub struct Max { } impl Max { - /// Create a new MAX aggregate function + /// Cre§ate a new MAX aggregate function pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { Self { name, @@ -98,7 +95,7 @@ impl AggregateExpr for Max { macro_rules! typed_min_max_batch_string { ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{ let array = $VALUES.as_any().downcast_ref::<$ARRAYTYPE>().unwrap(); - let value = compute::$OP(array); + let value = $OP(array); let value = value.and_then(|e| Some(e.to_string())); ScalarValue::$SCALAR(value) }}; @@ -108,7 +105,7 @@ macro_rules! typed_min_max_batch_string { macro_rules! typed_min_max_batch { ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{ let array = $VALUES.as_any().downcast_ref::<$ARRAYTYPE>().unwrap(); - let value = compute::$OP(array); + let value = $OP(array); ScalarValue::$SCALAR(value) }}; } @@ -119,13 +116,9 @@ macro_rules! min_max_batch { ($VALUES:expr, $OP:ident) => {{ match $VALUES.data_type() { // all types that have a natural order - DataType::Float64 => { - typed_min_max_batch!($VALUES, Float64Array, Float64, $OP) + DataType::Int64 | DataType::Timestamp(TimeUnit::Second, _) => { + typed_min_max_batch!($VALUES, Int64Array, Int64, $OP) } - DataType::Float32 => { - typed_min_max_batch!($VALUES, Float32Array, Float32, $OP) - } - DataType::Int64 => typed_min_max_batch!($VALUES, Int64Array, Int64, $OP), DataType::Int32 => typed_min_max_batch!($VALUES, Int32Array, Int32, $OP), DataType::Int16 => typed_min_max_batch!($VALUES, Int16Array, Int16, $OP), DataType::Int8 => typed_min_max_batch!($VALUES, Int8Array, Int8, $OP), @@ -134,26 +127,17 @@ macro_rules! 
min_max_batch { DataType::UInt16 => typed_min_max_batch!($VALUES, UInt16Array, UInt16, $OP), DataType::UInt8 => typed_min_max_batch!($VALUES, UInt8Array, UInt8, $OP), DataType::Timestamp(TimeUnit::Second, _) => { - typed_min_max_batch!($VALUES, TimestampSecondArray, TimestampSecond, $OP) + typed_min_max_batch!($VALUES, Int64Array, TimestampSecond, $OP) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + typed_min_max_batch!($VALUES, Int64Array, TimestampMillisecond, $OP) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + typed_min_max_batch!($VALUES, Int64Array, TimestampMicrosecond, $OP) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + typed_min_max_batch!($VALUES, Int64Array, TimestampNanosecond, $OP) } - DataType::Timestamp(TimeUnit::Millisecond, _) => typed_min_max_batch!( - $VALUES, - TimestampMillisecondArray, - TimestampMillisecond, - $OP - ), - DataType::Timestamp(TimeUnit::Microsecond, _) => typed_min_max_batch!( - $VALUES, - TimestampMicrosecondArray, - TimestampMicrosecond, - $OP - ), - DataType::Timestamp(TimeUnit::Nanosecond, _) => typed_min_max_batch!( - $VALUES, - TimestampNanosecondArray, - TimestampNanosecond, - $OP - ), other => { // This should have been handled before return Err(DataFusionError::Internal(format!( @@ -174,7 +158,13 @@ fn min_batch(values: &ArrayRef) -> Result { DataType::LargeUtf8 => { typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, min_string) } - _ => min_max_batch!(values, min), + DataType::Float64 => { + typed_min_max_batch!(values, Float64Array, Float64, min_primitive) + } + DataType::Float32 => { + typed_min_max_batch!(values, Float32Array, Float32, min_primitive) + } + _ => min_max_batch!(values, min_primitive), }) } @@ -187,7 +177,13 @@ fn max_batch(values: &ArrayRef) -> Result { DataType::LargeUtf8 => { typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, max_string) } - _ => min_max_batch!(values, max), + DataType::Float64 => { + typed_min_max_batch!(values, Float64Array, Float64, max_primitive) + } + DataType::Float32 => { + typed_min_max_batch!(values, Float32Array, Float32, max_primitive) + } + _ => min_max_batch!(values, max_primitive), }) } @@ -448,12 +444,11 @@ mod tests { use crate::physical_plan::expressions::col; use crate::physical_plan::expressions::tests::aggregate; use crate::{error::Result, generic_test_op}; - use arrow::datatypes::*; - use arrow::record_batch::RecordBatch; + use arrow2::record_batch::RecordBatch; #[test] fn max_i32() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let a: ArrayRef = Arc::new(Int32Array::from_slice(&[1, 2, 3, 4, 5])); generic_test_op!( a, DataType::Int32, @@ -465,7 +460,7 @@ mod tests { #[test] fn min_i32() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let a: ArrayRef = Arc::new(Int32Array::from_slice(&[1, 2, 3, 4, 5])); generic_test_op!( a, DataType::Int32, @@ -477,7 +472,7 @@ mod tests { #[test] fn max_utf8() -> Result<()> { - let a: ArrayRef = Arc::new(StringArray::from(vec!["d", "a", "c", "b"])); + let a: ArrayRef = Arc::new(StringArray::from_slice(&["d", "a", "c", "b"])); generic_test_op!( a, DataType::Utf8, @@ -489,7 +484,7 @@ mod tests { #[test] fn max_large_utf8() -> Result<()> { - let a: ArrayRef = Arc::new(LargeStringArray::from(vec!["d", "a", "c", "b"])); + let a: ArrayRef = Arc::new(LargeStringArray::from_slice(&["d", "a", "c", "b"])); generic_test_op!( a, DataType::LargeUtf8, @@ -501,7 +496,7 @@ mod tests { #[test] fn min_utf8() -> Result<()> { - let a: 
ArrayRef = Arc::new(StringArray::from(vec!["d", "a", "c", "b"])); + let a: ArrayRef = Arc::new(StringArray::from_slice(&["d", "a", "c", "b"])); generic_test_op!( a, DataType::Utf8, @@ -513,7 +508,7 @@ mod tests { #[test] fn min_large_utf8() -> Result<()> { - let a: ArrayRef = Arc::new(LargeStringArray::from(vec!["d", "a", "c", "b"])); + let a: ArrayRef = Arc::new(LargeStringArray::from_slice(&["d", "a", "c", "b"])); generic_test_op!( a, DataType::LargeUtf8, @@ -525,7 +520,7 @@ mod tests { #[test] fn max_i32_with_nulls() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![ + let a: ArrayRef = Arc::new(Int32Array::from(&[ Some(1), None, Some(3), @@ -543,7 +538,7 @@ mod tests { #[test] fn min_i32_with_nulls() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![ + let a: ArrayRef = Arc::new(Int32Array::from(&[ Some(1), None, Some(3), @@ -561,7 +556,7 @@ mod tests { #[test] fn max_i32_all_nulls() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![None, None])); + let a: ArrayRef = Arc::new(Int32Array::from(&[None, None])); generic_test_op!( a, DataType::Int32, @@ -573,7 +568,7 @@ mod tests { #[test] fn min_i32_all_nulls() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![None, None])); + let a: ArrayRef = Arc::new(Int32Array::from(&[None, None])); generic_test_op!( a, DataType::Int32, @@ -585,8 +580,9 @@ mod tests { #[test] fn max_u32() -> Result<()> { - let a: ArrayRef = - Arc::new(UInt32Array::from(vec![1_u32, 2_u32, 3_u32, 4_u32, 5_u32])); + let a: ArrayRef = Arc::new(UInt32Array::from_slice(&[ + 1_u32, 2_u32, 3_u32, 4_u32, 5_u32, + ])); generic_test_op!( a, DataType::UInt32, @@ -598,8 +594,9 @@ mod tests { #[test] fn min_u32() -> Result<()> { - let a: ArrayRef = - Arc::new(UInt32Array::from(vec![1_u32, 2_u32, 3_u32, 4_u32, 5_u32])); + let a: ArrayRef = Arc::new(UInt32Array::from_slice(&[ + 1_u32, 2_u32, 3_u32, 4_u32, 5_u32, + ])); generic_test_op!( a, DataType::UInt32, @@ -611,8 +608,9 @@ mod tests { #[test] fn max_f32() -> Result<()> { - let a: ArrayRef = - Arc::new(Float32Array::from(vec![1_f32, 2_f32, 3_f32, 4_f32, 5_f32])); + let a: ArrayRef = Arc::new(Float32Array::from_slice(&[ + 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, + ])); generic_test_op!( a, DataType::Float32, @@ -624,8 +622,9 @@ mod tests { #[test] fn min_f32() -> Result<()> { - let a: ArrayRef = - Arc::new(Float32Array::from(vec![1_f32, 2_f32, 3_f32, 4_f32, 5_f32])); + let a: ArrayRef = Arc::new(Float32Array::from_slice(&[ + 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, + ])); generic_test_op!( a, DataType::Float32, @@ -637,8 +636,9 @@ mod tests { #[test] fn max_f64() -> Result<()> { - let a: ArrayRef = - Arc::new(Float64Array::from(vec![1_f64, 2_f64, 3_f64, 4_f64, 5_f64])); + let a: ArrayRef = Arc::new(Float64Array::from_slice(&[ + 1_f64, 2_f64, 3_f64, 4_f64, 5_f64, + ])); generic_test_op!( a, DataType::Float64, @@ -650,8 +650,9 @@ mod tests { #[test] fn min_f64() -> Result<()> { - let a: ArrayRef = - Arc::new(Float64Array::from(vec![1_f64, 2_f64, 3_f64, 4_f64, 5_f64])); + let a: ArrayRef = Arc::new(Float64Array::from_slice(&[ + 1_f64, 2_f64, 3_f64, 4_f64, 5_f64, + ])); generic_test_op!( a, DataType::Float64, diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index 4d57c39bb31cc..665cf248514b0 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -22,8 +22,18 @@ use std::sync::Arc; use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use 
crate::physical_plan::PhysicalExpr; -use arrow::compute::kernels::sort::{SortColumn, SortOptions}; -use arrow::record_batch::RecordBatch; +use arrow2::array::Array; +use arrow2::compute::sort::SortOptions; +use arrow2::record_batch::RecordBatch; + +type ArrayRef = Arc; + +/// One column to be used in lexicographical sort +#[derive(Clone, Debug)] +pub struct SortColumn { + pub values: ArrayRef, + pub options: Option, +} mod average; #[macro_use] diff --git a/datafusion/src/physical_plan/expressions/negative.rs b/datafusion/src/physical_plan/expressions/negative.rs index 65010c6acd1ec..d17acf899b09d 100644 --- a/datafusion/src/physical_plan/expressions/negative.rs +++ b/datafusion/src/physical_plan/expressions/negative.rs @@ -20,14 +20,15 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::ArrayRef; -use arrow::compute::kernels::arithmetic::negate; -use arrow::{ - array::{Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array}, +use arrow2::{ + array::*, + compute::arithmetics::negate, datatypes::{DataType, Schema}, record_batch::RecordBatch, }; +type ArrayRef = Arc; + use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ColumnarValue, PhysicalExpr}; @@ -36,12 +37,12 @@ use super::coercion; /// Invoke a compute kernel on array(s) macro_rules! compute_op { // invoke unary operator - ($OPERAND:expr, $OP:ident, $DT:ident) => {{ + ($OPERAND:expr, $DT:ident) => {{ let operand = $OPERAND .as_any() .downcast_ref::<$DT>() .expect("compute_op failed to downcast array"); - Ok(Arc::new($OP(&operand)?)) + Ok(Arc::new(negate(operand))) }}; } @@ -89,12 +90,12 @@ impl PhysicalExpr for NegativeExpr { match arg { ColumnarValue::Array(array) => { let result: Result = match array.data_type() { - DataType::Int8 => compute_op!(array, negate, Int8Array), - DataType::Int16 => compute_op!(array, negate, Int16Array), - DataType::Int32 => compute_op!(array, negate, Int32Array), - DataType::Int64 => compute_op!(array, negate, Int64Array), - DataType::Float32 => compute_op!(array, negate, Float32Array), - DataType::Float64 => compute_op!(array, negate, Float64Array), + DataType::Int8 => compute_op!(array, Int8Array), + DataType::Int16 => compute_op!(array, Int16Array), + DataType::Int32 => compute_op!(array, Int32Array), + DataType::Int64 => compute_op!(array, Int64Array), + DataType::Float32 => compute_op!(array, Float32Array), + DataType::Float64 => compute_op!(array, Float64Array), _ => Err(DataFusionError::Internal(format!( "(- '{:?}') can't be evaluated because the expression's type is {:?}, not signed numeric", self, diff --git a/datafusion/src/physical_plan/expressions/not.rs b/datafusion/src/physical_plan/expressions/not.rs index 23a1a46651dee..ac467a8cc344d 100644 --- a/datafusion/src/physical_plan/expressions/not.rs +++ b/datafusion/src/physical_plan/expressions/not.rs @@ -25,9 +25,9 @@ use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use crate::physical_plan::PhysicalExpr; use crate::scalar::ScalarValue; -use arrow::array::BooleanArray; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; +use arrow2::array::BooleanArray; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; /// Not expression #[derive(Debug)] @@ -82,7 +82,7 @@ impl PhysicalExpr for NotExpr { ) })?; Ok(ColumnarValue::Array(Arc::new( - arrow::compute::kernels::boolean::not(array)?, + arrow2::compute::boolean::not(array), ))) } ColumnarValue::Scalar(scalar) => { @@ -121,7 +121,7 @@ mod tests { use super::*; use 
crate::error::Result; use crate::physical_plan::expressions::col; - use arrow::datatypes::*; + use arrow2::datatypes::*; #[test] fn neg_op() -> Result<()> { diff --git a/datafusion/src/physical_plan/expressions/nullif.rs b/datafusion/src/physical_plan/expressions/nullif.rs index 7cc58ed2318f4..b632cc45c67f4 100644 --- a/datafusion/src/physical_plan/expressions/nullif.rs +++ b/datafusion/src/physical_plan/expressions/nullif.rs @@ -15,57 +15,10 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; - use super::ColumnarValue; use crate::error::{DataFusionError, Result}; -use crate::scalar::ScalarValue; -use arrow::array::Array; -use arrow::array::{ - ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, - Int16Array, Int32Array, Int64Array, Int8Array, StringArray, TimestampNanosecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, -}; -use arrow::compute::kernels::boolean::nullif; -use arrow::compute::kernels::comparison::{eq, eq_scalar, eq_utf8, eq_utf8_scalar}; -use arrow::datatypes::{DataType, TimeUnit}; - -/// Invoke a compute kernel on a primitive array and a Boolean Array -macro_rules! compute_bool_array_op { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $DT:ident) => {{ - let ll = $LEFT - .as_any() - .downcast_ref::<$DT>() - .expect("compute_op failed to downcast array"); - let rr = $RIGHT - .as_any() - .downcast_ref::() - .expect("compute_op failed to downcast array"); - Ok(Arc::new($OP(&ll, &rr)?) as ArrayRef) - }}; -} - -/// Binary op between primitive and boolean arrays -macro_rules! primitive_bool_array_op { - ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{ - match $LEFT.data_type() { - DataType::Int8 => compute_bool_array_op!($LEFT, $RIGHT, $OP, Int8Array), - DataType::Int16 => compute_bool_array_op!($LEFT, $RIGHT, $OP, Int16Array), - DataType::Int32 => compute_bool_array_op!($LEFT, $RIGHT, $OP, Int32Array), - DataType::Int64 => compute_bool_array_op!($LEFT, $RIGHT, $OP, Int64Array), - DataType::UInt8 => compute_bool_array_op!($LEFT, $RIGHT, $OP, UInt8Array), - DataType::UInt16 => compute_bool_array_op!($LEFT, $RIGHT, $OP, UInt16Array), - DataType::UInt32 => compute_bool_array_op!($LEFT, $RIGHT, $OP, UInt32Array), - DataType::UInt64 => compute_bool_array_op!($LEFT, $RIGHT, $OP, UInt64Array), - DataType::Float32 => compute_bool_array_op!($LEFT, $RIGHT, $OP, Float32Array), - DataType::Float64 => compute_bool_array_op!($LEFT, $RIGHT, $OP, Float64Array), - other => Err(DataFusionError::Internal(format!( - "Unsupported data type {:?} for NULLIF/primitive/boolean operator", - other - ))), - } - }}; -} +use arrow2::compute::nullif; +use arrow2::datatypes::DataType; /// Implements NULLIF(expr1, expr2) /// Args: 0 - left expr is any array @@ -83,20 +36,14 @@ pub fn nullif_func(args: &[ColumnarValue]) -> Result { match (lhs, rhs) { (ColumnarValue::Array(lhs), ColumnarValue::Scalar(rhs)) => { - let cond_array = binary_array_op_scalar!(lhs, rhs.clone(), eq).unwrap()?; - - let array = primitive_bool_array_op!(lhs, *cond_array, nullif)?; - - Ok(ColumnarValue::Array(array)) - } - (ColumnarValue::Array(lhs), ColumnarValue::Array(rhs)) => { - // Get args0 == args1 evaluated and produce a boolean array - let cond_array = binary_array_op!(lhs, rhs, eq)?; - - // Now, invoke nullif on the result - let array = primitive_bool_array_op!(lhs, *cond_array, nullif)?; - Ok(ColumnarValue::Array(array)) + Ok(ColumnarValue::Array( + nullif::nullif(lhs.as_ref(), rhs.to_array_of_size(lhs.len()).as_ref())? 
+ .into(), + )) } + (ColumnarValue::Array(lhs), ColumnarValue::Array(rhs)) => Ok( + ColumnarValue::Array(nullif::nullif(lhs.as_ref(), rhs.as_ref())?.into()), + ), _ => Err(DataFusionError::NotImplemented( "nullif does not support a literal as first argument".to_string(), )), @@ -122,8 +69,11 @@ pub static SUPPORTED_NULLIF_TYPES: &[DataType] = &[ #[cfg(test)] mod tests { + use std::sync::Arc; + use super::*; - use crate::error::Result; + use crate::{error::Result, scalar::ScalarValue}; + use arrow2::array::Int32Array; #[test] fn nullif_int32() -> Result<()> { @@ -145,7 +95,7 @@ mod tests { let result = nullif_func(&[a, lit_array])?; let result = result.into_array(0); - let expected = Arc::new(Int32Array::from(vec![ + let expected = Int32Array::from(vec![ Some(1), None, None, @@ -155,15 +105,15 @@ mod tests { None, Some(4), Some(5), - ])) as ArrayRef; - assert_eq!(expected.as_ref(), result.as_ref()); + ]); + assert_eq!(expected, result.as_ref()); Ok(()) } #[test] // Ensure that arrays with no nulls can also invoke NULLIF() correctly fn nullif_int32_nonulls() -> Result<()> { - let a = Int32Array::from(vec![1, 3, 10, 7, 8, 1, 2, 4, 5]); + let a = Int32Array::from_slice(&[1, 3, 10, 7, 8, 1, 2, 4, 5]); let a = ColumnarValue::Array(Arc::new(a)); let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(1i32))); @@ -171,7 +121,7 @@ mod tests { let result = nullif_func(&[a, lit_array])?; let result = result.into_array(0); - let expected = Arc::new(Int32Array::from(vec![ + let expected = Int32Array::from(vec![ None, Some(3), Some(10), @@ -181,8 +131,8 @@ mod tests { Some(2), Some(4), Some(5), - ])) as ArrayRef; - assert_eq!(expected.as_ref(), result.as_ref()); + ]); + assert_eq!(expected, result.as_ref()); Ok(()) } } diff --git a/datafusion/src/physical_plan/expressions/sum.rs b/datafusion/src/physical_plan/expressions/sum.rs index 7bbbf99fa6598..f7e611de439c4 100644 --- a/datafusion/src/physical_plan/expressions/sum.rs +++ b/datafusion/src/physical_plan/expressions/sum.rs @@ -24,15 +24,11 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::datatypes::DataType; -use arrow::{ - array::{ - ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array, - }, - datatypes::Field, -}; +use arrow2::compute; +use arrow2::datatypes::DataType; +use arrow2::{array::*, datatypes::Field}; + +type ArrayRef = Arc; use super::format_state_name; @@ -128,7 +124,7 @@ impl SumAccumulator { macro_rules! 
typed_sum_delta_batch { ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident) => {{ let array = $VALUES.as_any().downcast_ref::<$ARRAYTYPE>().unwrap(); - let delta = compute::sum(array); + let delta = compute::aggregate::sum(array); ScalarValue::$SCALAR(delta) }}; } @@ -276,12 +272,12 @@ mod tests { use super::*; use crate::physical_plan::expressions::col; use crate::{error::Result, generic_test_op}; - use arrow::datatypes::*; - use arrow::record_batch::RecordBatch; + use arrow2::datatypes::*; + use arrow2::record_batch::RecordBatch; #[test] fn sum_i32() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let a: ArrayRef = Arc::new(Int32Array::from_slice(&[1, 2, 3, 4, 5])); generic_test_op!( a, DataType::Int32, @@ -293,7 +289,7 @@ mod tests { #[test] fn sum_i32_with_nulls() -> Result<()> { - let a: ArrayRef = Arc::new(Int32Array::from(vec![ + let a: ArrayRef = Arc::new(Int32Array::from(&[ Some(1), None, Some(3), @@ -323,8 +319,9 @@ mod tests { #[test] fn sum_u32() -> Result<()> { - let a: ArrayRef = - Arc::new(UInt32Array::from(vec![1_u32, 2_u32, 3_u32, 4_u32, 5_u32])); + let a: ArrayRef = Arc::new(UInt32Array::from_slice(&[ + 1_u32, 2_u32, 3_u32, 4_u32, 5_u32, + ])); generic_test_op!( a, DataType::UInt32, @@ -336,8 +333,9 @@ mod tests { #[test] fn sum_f32() -> Result<()> { - let a: ArrayRef = - Arc::new(Float32Array::from(vec![1_f32, 2_f32, 3_f32, 4_f32, 5_f32])); + let a: ArrayRef = Arc::new(Float32Array::from_slice(&[ + 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, + ])); generic_test_op!( a, DataType::Float32, @@ -349,8 +347,9 @@ mod tests { #[test] fn sum_f64() -> Result<()> { - let a: ArrayRef = - Arc::new(Float64Array::from(vec![1_f64, 2_f64, 3_f64, 4_f64, 5_f64])); + let a: ArrayRef = Arc::new(Float64Array::from_slice(&[ + 1_f64, 2_f64, 3_f64, 4_f64, 5_f64, + ])); generic_test_op!( a, DataType::Float64, diff --git a/datafusion/src/physical_plan/expressions/try_cast.rs b/datafusion/src/physical_plan/expressions/try_cast.rs index 5e402fdea28ad..60aadfdec7dbe 100644 --- a/datafusion/src/physical_plan/expressions/try_cast.rs +++ b/datafusion/src/physical_plan/expressions/try_cast.rs @@ -23,11 +23,10 @@ use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use crate::physical_plan::PhysicalExpr; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::compute::kernels; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; -use compute::can_cast_types; +use arrow2::compute; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; +use compute::cast; /// TRY_CAST expression casts an expression to a specific data type and retuns NULL on invalid cast #[derive(Debug)] @@ -78,13 +77,13 @@ impl PhysicalExpr for TryCastExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { let value = self.expr.evaluate(batch)?; match value { - ColumnarValue::Array(array) => Ok(ColumnarValue::Array(kernels::cast::cast( - &array, - &self.cast_type, - )?)), + ColumnarValue::Array(array) => Ok(ColumnarValue::Array( + cast::cast(array.as_ref(), &self.cast_type)?.into(), + )), ColumnarValue::Scalar(scalar) => { let scalar_array = scalar.to_array(); - let cast_array = kernels::cast::cast(&scalar_array, &self.cast_type)?; + let cast_array = + cast::cast(scalar_array.as_ref(), &self.cast_type)?.into(); let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; Ok(ColumnarValue::Scalar(cast_scalar)) } @@ -104,7 +103,7 @@ pub fn try_cast( let expr_type = expr.data_type(input_schema)?; if expr_type == cast_type { 
Ok(expr.clone()) - } else if can_cast_types(&expr_type, &cast_type) { + } else if cast::can_cast_types(&expr_type, &cast_type) { Ok(Arc::new(TryCastExpr::new(expr, cast_type))) } else { Err(DataFusionError::Internal(format!( @@ -119,11 +118,9 @@ mod tests { use super::*; use crate::error::Result; use crate::physical_plan::expressions::col; - use arrow::array::{StringArray, Time64NanosecondArray}; - use arrow::{ - array::{Array, Int32Array, Int64Array, TimestampNanosecondArray, UInt32Array}, - datatypes::*, - }; + use arrow2::{array::*, datatypes::*}; + + type StringArray = Utf8Array; // runs an end-to-end test of physical type cast // 1. construct a record batch with a column "a" of type A @@ -134,7 +131,7 @@ mod tests { macro_rules! generic_test_cast { ($A_ARRAY:ident, $A_TYPE:expr, $A_VEC:expr, $TYPEARRAY:ident, $TYPE:expr, $VEC:expr) => {{ let schema = Schema::new(vec![Field::new("a", $A_TYPE, false)]); - let a = $A_ARRAY::from($A_VEC); + let a = $A_ARRAY::from_slice(&$A_VEC); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; @@ -221,15 +218,12 @@ mod tests { #[test] fn test_cast_i64_t64() -> Result<()> { let original = vec![1, 2, 3, 4, 5]; - let expected: Vec> = original - .iter() - .map(|i| Some(Time64NanosecondArray::from(vec![*i]).value(0))) - .collect(); + let expected: Vec> = original.iter().map(|i| Some(*i)).collect(); generic_test_cast!( Int64Array, DataType::Int64, original.clone(), - TimestampNanosecondArray, + Int64Array, DataType::Timestamp(TimeUnit::Nanosecond, None), expected ); diff --git a/datafusion/src/physical_plan/filter.rs b/datafusion/src/physical_plan/filter.rs index bc2b17aa4f47d..1bdd9a90042ae 100644 --- a/datafusion/src/physical_plan/filter.rs +++ b/datafusion/src/physical_plan/filter.rs @@ -28,11 +28,14 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, }; -use arrow::array::BooleanArray; -use arrow::compute::filter_record_batch; -use arrow::datatypes::{DataType, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; + +use arrow2::array::BooleanArray; +use arrow2::compute::filter::filter_record_batch; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; + +type SchemaRef = Arc; use async_trait::async_trait; diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 367e594f6e977..f757361fbf95f 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -43,17 +43,19 @@ use crate::{ error::{DataFusionError, Result}, scalar::ScalarValue, }; -use arrow::{ - array::{ArrayRef, NullArray}, - compute::kernels::length::{bit_length, length}, +use arrow2::{ + array::{Array, NullArray}, + compute::length::length, datatypes::TimeUnit, - datatypes::{DataType, Field, Int32Type, Int64Type, Schema}, + datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; use fmt::{Debug, Formatter}; use std::convert::From; use std::{any::Any, fmt, str::FromStr, sync::Arc}; +type ArrayRef = Arc; + /// A function's signature, which defines the function's supported argument types. 
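/// For example, `Signature::Variadic(vec![DataType::Utf8])` (used below for `concat`) accepts any
/// number of `Utf8` arguments, while `Signature::VariadicEqual` (used below for `array`) accepts any
/// number of arguments as long as they all share a single common type.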
#[derive(Debug, Clone, PartialEq)] pub enum Signature { @@ -557,7 +559,7 @@ pub fn create_physical_expr( ))), }, BuiltinScalarFunction::BitLength => |args| match &args[0] { - ColumnarValue::Array(v) => Ok(ColumnarValue::Array(bit_length(v.as_ref())?)), + ColumnarValue::Array(v) => todo!(), ColumnarValue::Scalar(v) => match v { ScalarValue::Utf8(v) => Ok(ColumnarValue::Scalar(ScalarValue::Int32( v.as_ref().map(|x| (x.len() * 8) as i32), @@ -584,7 +586,7 @@ pub fn create_physical_expr( DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!( character_length, - Int32Type, + i32, "character_length" ); make_scalar_function(func)(args) @@ -592,7 +594,7 @@ pub fn create_physical_expr( DataType::LargeUtf8 => { let func = invoke_if_unicode_expressions_feature_flag!( character_length, - Int64Type, + i64, "character_length" ); make_scalar_function(func)(args) @@ -685,7 +687,9 @@ pub fn create_physical_expr( } BuiltinScalarFunction::NullIf => nullif_func, BuiltinScalarFunction::OctetLength => |args| match &args[0] { - ColumnarValue::Array(v) => Ok(ColumnarValue::Array(length(v.as_ref())?)), + ColumnarValue::Array(v) => { + Ok(ColumnarValue::Array(length(v.as_ref())?.into())) + } ColumnarValue::Scalar(v) => match v { ScalarValue::Utf8(v) => Ok(ColumnarValue::Scalar(ScalarValue::Int32( v.as_ref().map(|x| x.len() as i32), @@ -860,15 +864,13 @@ pub fn create_physical_expr( }, BuiltinScalarFunction::Strpos => |args| match args[0].data_type() { DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - strpos, Int32Type, "strpos" - ); + let func = + invoke_if_unicode_expressions_feature_flag!(strpos, i32, "strpos"); make_scalar_function(func)(args) } DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - strpos, Int64Type, "strpos" - ); + let func = + invoke_if_unicode_expressions_feature_flag!(strpos, i64, "strpos"); make_scalar_function(func)(args) } other => Err(DataFusionError::Internal(format!( @@ -894,10 +896,10 @@ pub fn create_physical_expr( }, BuiltinScalarFunction::ToHex => |args| match args[0].data_type() { DataType::Int32 => { - make_scalar_function(string_expressions::to_hex::)(args) + make_scalar_function(string_expressions::to_hex::)(args) } DataType::Int64 => { - make_scalar_function(string_expressions::to_hex::)(args) + make_scalar_function(string_expressions::to_hex::)(args) } other => Err(DataFusionError::Internal(format!( "Unsupported data type {:?} for function to_hex", @@ -963,9 +965,7 @@ fn signature(fun: &BuiltinScalarFunction) -> Signature { // for now, the list is small, as we do not have many built-in functions. 
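// The Signature returned here also drives argument coercion: the planner compares the actual
// argument types against it and inserts implicit casts before the function is invoked.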
match fun { - BuiltinScalarFunction::Array => { - Signature::Variadic(array_expressions::SUPPORTED_ARRAY_TYPES.to_vec()) - } + BuiltinScalarFunction::Array => Signature::VariadicEqual, BuiltinScalarFunction::Concat | BuiltinScalarFunction::ConcatWithSeparator => { Signature::Variadic(vec![DataType::Utf8]) } @@ -1188,7 +1188,7 @@ type NullColumnarValue = ColumnarValue; impl From<&RecordBatch> for NullColumnarValue { fn from(batch: &RecordBatch) -> Self { let num_rows = batch.num_rows(); - ColumnarValue::Array(Arc::new(NullArray::new(num_rows))) + ColumnarValue::Array(Arc::new(NullArray::from_data(num_rows))) } } @@ -1272,14 +1272,9 @@ mod tests { physical_plan::expressions::{col, lit}, scalar::ScalarValue, }; - use arrow::{ - array::{ - Array, ArrayRef, BinaryArray, BooleanArray, FixedSizeListArray, Float64Array, - Int32Array, ListArray, StringArray, UInt32Array, UInt64Array, - }, - datatypes::Field, - record_batch::RecordBatch, - }; + use arrow2::{array::*, datatypes::Field, record_batch::RecordBatch}; + + type StringArray = Utf8Array; /// $FUNC function to test /// $ARGS arguments (vec) to pass to function @@ -1295,7 +1290,7 @@ mod tests { // any type works here: we evaluate against a literal of `value` let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let columns: Vec = vec![Arc::new(Int32Array::from(vec![1]))]; + let columns: Vec = vec![Arc::new(Int32Array::from_slice(&[1]))]; let expr = create_physical_expr(&BuiltinScalarFunction::$FUNC, $ARGS, &schema, &ctx_state)?; @@ -2766,6 +2761,7 @@ mod tests { Utf8, StringArray ); + type B = BinaryArray; #[cfg(feature = "crypto_expressions")] test_function!( SHA224, @@ -2777,7 +2773,7 @@ mod tests { ])), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2790,7 +2786,7 @@ mod tests { ])), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2799,7 +2795,7 @@ mod tests { Ok(None), &[u8], Binary, - BinaryArray + B ); #[cfg(not(feature = "crypto_expressions"))] test_function!( @@ -2810,7 +2806,7 @@ mod tests { )), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2823,7 +2819,7 @@ mod tests { ])), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2836,7 +2832,7 @@ mod tests { ])), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2845,7 +2841,7 @@ mod tests { Ok(None), &[u8], Binary, - BinaryArray + B ); #[cfg(not(feature = "crypto_expressions"))] test_function!( @@ -2856,7 +2852,7 @@ mod tests { )), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2871,7 +2867,7 @@ mod tests { ])), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2886,7 +2882,7 @@ mod tests { ])), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2895,7 +2891,7 @@ mod tests { Ok(None), &[u8], Binary, - BinaryArray + B ); #[cfg(not(feature = "crypto_expressions"))] test_function!( @@ -2906,7 +2902,7 @@ mod tests { )), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2922,7 +2918,7 @@ mod tests { ])), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2938,7 +2934,7 @@ mod tests { ])), &[u8], Binary, - BinaryArray + B ); #[cfg(feature = "crypto_expressions")] test_function!( @@ -2947,7 +2943,7 @@ mod tests { Ok(None), 
&[u8], Binary, - BinaryArray + B ); #[cfg(not(feature = "crypto_expressions"))] test_function!( @@ -3469,7 +3465,7 @@ mod tests { &ctx_state, )?; - let columns: Vec = vec![Arc::new(Int32Array::from(vec![1]))]; + let columns: Vec = vec![Arc::new(Int32Array::from_slice(&[1]))]; let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?; let result = expr.evaluate(&batch); @@ -3529,24 +3525,24 @@ mod tests { #[test] fn test_array() -> Result<()> { generic_test_array( - Arc::new(StringArray::from(vec!["aa"])), - Arc::new(StringArray::from(vec!["bb"])), + Arc::new(StringArray::from_slice(&["aa"])), + Arc::new(StringArray::from_slice(&["bb"])), DataType::Utf8, "StringArray\n[\n \"aa\",\n \"bb\",\n]", )?; // different types, to validate that casting happens generic_test_array( - Arc::new(UInt32Array::from(vec![1u32])), - Arc::new(UInt64Array::from(vec![1u64])), + Arc::new(UInt32Array::from_slice(&[1u32])), + Arc::new(UInt64Array::from_slice(&[1u64])), DataType::UInt64, "PrimitiveArray\n[\n 1,\n 1,\n]", )?; // different types (another order), to validate that casting happens generic_test_array( - Arc::new(UInt64Array::from(vec![1u64])), - Arc::new(UInt32Array::from(vec![1u32])), + Arc::new(UInt64Array::from_slice(&[1u64])), + Arc::new(UInt32Array::from_slice(&[1u32])), DataType::UInt64, "PrimitiveArray\n[\n 1,\n 1,\n]", ) @@ -3559,7 +3555,7 @@ mod tests { let ctx_state = ExecutionContextState::new(); // concat(value, value) - let col_value: ArrayRef = Arc::new(StringArray::from(vec!["aaa-555"])); + let col_value: ArrayRef = Arc::new(StringArray::from_slice(&["aaa-555"])); let pattern = lit(ScalarValue::Utf8(Some(r".*-(\d*)".to_string()))); let columns: Vec = vec![col_value]; let expr = create_physical_expr( @@ -3580,7 +3576,7 @@ mod tests { let result = expr.evaluate(&batch)?.into_array(batch.num_rows()); // downcast works - let result = result.as_any().downcast_ref::().unwrap(); + let result = result.as_any().downcast_ref::>().unwrap(); let first_row = result.value(0); let first_row = first_row.as_any().downcast_ref::().unwrap(); @@ -3600,7 +3596,7 @@ mod tests { // concat(value, value) let col_value = lit(ScalarValue::Utf8(Some("aaa-555".to_string()))); let pattern = lit(ScalarValue::Utf8(Some(r".*-(\d*)".to_string()))); - let columns: Vec = vec![Arc::new(Int32Array::from(vec![1]))]; + let columns: Vec = vec![Arc::new(Int32Array::from_slice(&[1]))]; let expr = create_physical_expr( &BuiltinScalarFunction::RegexpMatch, &[col_value, pattern], @@ -3619,7 +3615,7 @@ mod tests { let result = expr.evaluate(&batch)?.into_array(batch.num_rows()); // downcast works - let result = result.as_any().downcast_ref::().unwrap(); + let result = result.as_any().downcast_ref::>().unwrap(); let first_row = result.value(0); let first_row = first_row.as_any().downcast_ref::().unwrap(); diff --git a/datafusion/src/physical_plan/group_scalar.rs b/datafusion/src/physical_plan/group_scalar.rs index 943386d215c4f..7c81be072e6e4 100644 --- a/datafusion/src/physical_plan/group_scalar.rs +++ b/datafusion/src/physical_plan/group_scalar.rs @@ -39,9 +39,9 @@ pub(crate) enum GroupByScalar { Utf8(Box), LargeUtf8(Box), Boolean(bool), - TimeMillisecond(i64), - TimeMicrosecond(i64), - TimeNanosecond(i64), + TimestampMillisecond(i64), + TimestampMicrosecond(i64), + TimestampNanosecond(i64), Date32(i32), } @@ -66,13 +66,13 @@ impl TryFrom<&ScalarValue> for GroupByScalar { ScalarValue::UInt32(Some(v)) => GroupByScalar::UInt32(*v), ScalarValue::UInt64(Some(v)) => GroupByScalar::UInt64(*v), 
ScalarValue::TimestampMillisecond(Some(v)) => { - GroupByScalar::TimeMillisecond(*v) + GroupByScalar::TimestampMillisecond(*v) } ScalarValue::TimestampMicrosecond(Some(v)) => { - GroupByScalar::TimeMicrosecond(*v) + GroupByScalar::TimestampMicrosecond(*v) } ScalarValue::TimestampNanosecond(Some(v)) => { - GroupByScalar::TimeNanosecond(*v) + GroupByScalar::TimestampNanosecond(*v) } ScalarValue::Utf8(Some(v)) => GroupByScalar::Utf8(Box::new(v.clone())), ScalarValue::LargeUtf8(Some(v)) => { @@ -121,13 +121,13 @@ impl From<&GroupByScalar> for ScalarValue { GroupByScalar::UInt64(v) => ScalarValue::UInt64(Some(*v)), GroupByScalar::Utf8(v) => ScalarValue::Utf8(Some(v.to_string())), GroupByScalar::LargeUtf8(v) => ScalarValue::LargeUtf8(Some(v.to_string())), - GroupByScalar::TimeMillisecond(v) => { + GroupByScalar::TimestampMillisecond(v) => { ScalarValue::TimestampMillisecond(Some(*v)) } - GroupByScalar::TimeMicrosecond(v) => { + GroupByScalar::TimestampMicrosecond(v) => { ScalarValue::TimestampMicrosecond(Some(*v)) } - GroupByScalar::TimeNanosecond(v) => { + GroupByScalar::TimestampNanosecond(v) => { ScalarValue::TimestampNanosecond(Some(*v)) } GroupByScalar::Date32(v) => ScalarValue::Date32(Some(*v)), diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index 0a822dc898afb..f74cccbb1a5cd 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -27,43 +27,29 @@ use futures::{ Future, }; -use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ Accumulator, AggregateExpr, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, SQLMetric, }; - -use arrow::{ - array::{Array, UInt32Builder}, - error::{ArrowError, Result as ArrowResult}, -}; -use arrow::{ - array::{ - ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, - }, - compute, -}; -use arrow::{ - array::{BooleanArray, Date32Array, DictionaryArray}, - compute::cast, - datatypes::{ - ArrowDictionaryKeyType, ArrowNativeType, Int16Type, Int32Type, Int64Type, - Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, - }, +use crate::{ + error::{DataFusionError, Result}, + scalar::ScalarValue, }; -use arrow::{ - datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}, + +use arrow2::error::{ArrowError, Result as ArrowResult}; +use arrow2::{array::*, compute}; +use arrow2::{buffer::MutableBuffer, datatypes::*}; +use arrow2::{ + datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; use hashbrown::HashMap; use ordered_float::OrderedFloat; use pin_project_lite::pin_project; -use arrow::array::{ - LargeStringArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, -}; +type SchemaRef = Arc; +type ArrayRef = Arc; + use async_trait::async_trait; use super::{ @@ -390,7 +376,7 @@ fn group_aggregate_batch( if v.is_empty() { batch_keys.push(key.clone()) }; - v.push(row as u32) + v.push(row as i32) }) // 1.2 .or_insert_with(|| { @@ -400,22 +386,23 @@ fn group_aggregate_batch( let _ = create_group_by_values(&group_values, row, &mut group_by_values); ( key.clone(), - (group_by_values.clone(), accumulator_set, vec![row as u32]), + (group_by_values.clone(), accumulator_set, vec![row as i32]), ) }); } // Collect all indices + offsets based on keys in this vec - let mut batch_indices: UInt32Builder = UInt32Builder::new(0); + let mut batch_indices = 
MutableBuffer::::with_capacity(0); let mut offsets = vec![0]; let mut offset_so_far = 0; for key in batch_keys.iter() { let (_, _, indices) = accumulators.get_mut(key).unwrap(); - batch_indices.append_slice(&indices)?; + batch_indices.extend_from_slice(&indices); offset_so_far += indices.len(); offsets.push(offset_so_far); } - let batch_indices = batch_indices.finish(); + let batch_indices = + Int32Array::from_data(DataType::Int32, batch_indices.into(), None); // `Take` all values based on indices into Arrays let values: Vec>> = aggr_input_values @@ -424,12 +411,9 @@ fn group_aggregate_batch( array .iter() .map(|array| { - compute::take( - array.as_ref(), - &batch_indices, - None, // None: no index check - ) - .unwrap() + compute::take::take(array.as_ref(), &batch_indices) + .unwrap() + .into() }) .collect() // 2.3 @@ -457,7 +441,7 @@ fn group_aggregate_batch( .iter() .map(|array| { // 2.3 - array.slice(offsets[0], offsets[1] - offsets[0]) + array.slice(offsets[0], offsets[1] - offsets[0]).into() }) .collect::>(), ) @@ -490,7 +474,7 @@ fn group_aggregate_batch( /// but it also has to to handle the case where the dictionary itself /// is not the same across all record batches (and thus indexes in one /// record batch may not correspond to the same index in another) -fn dictionary_create_key_for_col( +fn dictionary_create_key_for_col( col: &ArrayRef, row: usize, vec: &mut Vec, @@ -498,7 +482,7 @@ fn dictionary_create_key_for_col( let dict_col = col.as_any().downcast_ref::>().unwrap(); // look up the index in the values dictionary - let keys_col = dict_col.keys_array(); + let keys_col = dict_col.keys(); let values_index = keys_col.value(row).to_usize().ok_or_else(|| { DataFusionError::Internal(format!( "Can not convert index to usize in dictionary of type creating group by value {:?}", @@ -557,29 +541,15 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<( let array = col.as_any().downcast_ref::().unwrap(); vec.extend_from_slice(&array.value(row).to_le_bytes()); } - DataType::Timestamp(TimeUnit::Millisecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); - vec.extend_from_slice(&array.value(row).to_le_bytes()); - } - DataType::Timestamp(TimeUnit::Microsecond, None) => { + DataType::Timestamp(_, None) => { let array = col .as_any() - .downcast_ref::() - .unwrap(); - vec.extend_from_slice(&array.value(row).to_le_bytes()); - } - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - let array = col - .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); vec.extend_from_slice(&array.value(row).to_le_bytes()); } DataType::Utf8 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::>().unwrap(); let value = array.value(row); // store the size vec.extend_from_slice(&value.len().to_le_bytes()); @@ -587,7 +557,7 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<( vec.extend_from_slice(value.as_bytes()); } DataType::LargeUtf8 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::>().unwrap(); let value = array.value(row); // store the size vec.extend_from_slice(&value.len().to_le_bytes()); @@ -595,33 +565,33 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<( vec.extend_from_slice(value.as_bytes()); } DataType::Date32 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::().unwrap(); vec.extend_from_slice(&array.value(row).to_le_bytes()); } 
DataType::Dictionary(index_type, _) => match **index_type { DataType::Int8 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::Int16 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::Int32 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::Int64 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::UInt8 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::UInt16 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::UInt32 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::UInt64 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } _ => return Err(DataFusionError::Internal(format!( "Unsupported GROUP BY type (dictionary index type not supported creating key) {}", @@ -723,7 +693,7 @@ impl GroupedHashAggregateStream { type AccumulatorItem = Box; type Accumulators = - HashMap, (Box<[GroupByScalar]>, Vec, Vec), RandomState>; + HashMap, (Box<[GroupByScalar]>, Vec, Vec), RandomState>; impl Stream for GroupedHashAggregateStream { type Item = ArrowResult; @@ -748,7 +718,7 @@ impl Stream for GroupedHashAggregateStream { // check for error in receiving channel and unwrap actual result let result = match result { - Err(e) => Err(ArrowError::ExternalError(Box::new(e))), // error receiving + Err(e) => Err(ArrowError::External("".to_string(), Box::new(e))), // error receiving Ok(result) => result, }; @@ -939,7 +909,7 @@ impl Stream for HashAggregateStream { // check for error in receiving channel and unwrap actual result let result = match result { - Err(e) => Err(ArrowError::ExternalError(Box::new(e))), // error receiving + Err(e) => Err(ArrowError::External("".to_string(), Box::new(e))), // error receiving Ok(result) => result, }; @@ -965,7 +935,7 @@ fn concatenate(arrays: Vec>) -> ArrowResult> { .iter() .map(|a| a[column].as_ref()) .collect::>(); - compute::concat(&array_list) + Ok(compute::concat::concatenate(&array_list)?.into()) }) .collect::>>() } @@ -987,42 +957,9 @@ fn create_batch_from_map( .map(|(_, (group_by_values, accumulator_set, _))| { // 2. 
let mut groups = (0..num_group_expr) - .map(|i| match &group_by_values[i] { - GroupByScalar::Float32(n) => { - Arc::new(Float32Array::from(vec![(*n).into()] as Vec)) - as ArrayRef - } - GroupByScalar::Float64(n) => { - Arc::new(Float64Array::from(vec![(*n).into()] as Vec)) - as ArrayRef - } - GroupByScalar::Int8(n) => { - Arc::new(Int8Array::from(vec![*n])) as ArrayRef - } - GroupByScalar::Int16(n) => Arc::new(Int16Array::from(vec![*n])), - GroupByScalar::Int32(n) => Arc::new(Int32Array::from(vec![*n])), - GroupByScalar::Int64(n) => Arc::new(Int64Array::from(vec![*n])), - GroupByScalar::UInt8(n) => Arc::new(UInt8Array::from(vec![*n])), - GroupByScalar::UInt16(n) => Arc::new(UInt16Array::from(vec![*n])), - GroupByScalar::UInt32(n) => Arc::new(UInt32Array::from(vec![*n])), - GroupByScalar::UInt64(n) => Arc::new(UInt64Array::from(vec![*n])), - GroupByScalar::Utf8(str) => { - Arc::new(StringArray::from(vec![&***str])) - } - GroupByScalar::LargeUtf8(str) => { - Arc::new(LargeStringArray::from(vec![&***str])) - } - GroupByScalar::Boolean(b) => Arc::new(BooleanArray::from(vec![*b])), - GroupByScalar::TimeMillisecond(n) => { - Arc::new(TimestampMillisecondArray::from(vec![*n])) - } - GroupByScalar::TimeMicrosecond(n) => { - Arc::new(TimestampMicrosecondArray::from(vec![*n])) - } - GroupByScalar::TimeNanosecond(n) => { - Arc::new(TimestampNanosecondArray::from_vec(vec![*n], None)) - } - GroupByScalar::Date32(n) => Arc::new(Date32Array::from(vec![*n])), + .map(|i| { + let scalar: ScalarValue = (&group_by_values[i]).into(); + scalar.to_array() }) .collect::>(); @@ -1047,7 +984,10 @@ fn create_batch_from_map( let columns = columns .iter() .zip(output_schema.fields().iter()) - .map(|(col, desired_field)| cast(col, desired_field.data_type())) + .map(|(col, desired_field)| { + compute::cast::cast(col.as_ref(), desired_field.data_type()) + .map(|x| x.into()) + }) .collect::>>()?; RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns)? 
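
Note on the hunk above: the long per-variant match in `create_batch_from_map` is collapsed by routing every group key through `ScalarValue` — the `From<&GroupByScalar> for ScalarValue` impl added in the group_scalar.rs hunk converts the key, and `ScalarValue::to_array` materialises a single-row array of the matching type. A minimal sketch of that pattern, assuming only those two conversions and the `ArrayRef` alias used in this file (`key_to_column` itself is a hypothetical helper, not part of this patch):

```rust
// Sketch only: turn one group key into a one-row column.
fn key_to_column(key: &GroupByScalar) -> ArrayRef {
    // Reuse the From<&GroupByScalar> for ScalarValue impl from group_scalar.rs.
    let scalar: ScalarValue = key.into();
    // ScalarValue::to_array builds a length-1 array of the corresponding
    // DataType, so no per-type match is needed at the call site any more.
    scalar.to_array()
}
```

The resulting key columns are then cast to the output schema with `compute::cast::cast`, as the end of the hunk shows.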
@@ -1097,14 +1037,14 @@ fn finalize_aggregation( } /// Extract the value in `col[row]` from a dictionary a GroupByScalar -fn dictionary_create_group_by_value( +fn dictionary_create_group_by_value( col: &ArrayRef, row: usize, ) -> Result { let dict_col = col.as_any().downcast_ref::>().unwrap(); // look up the index in the values dictionary - let keys_col = dict_col.keys_array(); + let keys_col = dict_col.keys(); let values_index = keys_col.value(row).to_usize().ok_or_else(|| { DataFusionError::Internal(format!( "Can not convert index to usize in dictionary of type creating group by value {:?}", @@ -1159,11 +1099,11 @@ fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { Ok(GroupByScalar::Int64(array.value(row))) } DataType::Utf8 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::>().unwrap(); Ok(GroupByScalar::Utf8(Box::new(array.value(row).into()))) } DataType::LargeUtf8 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::>().unwrap(); Ok(GroupByScalar::Utf8(Box::new(array.value(row).into()))) } DataType::Boolean => { @@ -1171,39 +1111,30 @@ fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { Ok(GroupByScalar::Boolean(array.value(row))) } DataType::Timestamp(TimeUnit::Millisecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); - Ok(GroupByScalar::TimeMillisecond(array.value(row))) + let array = col.as_any().downcast_ref::().unwrap(); + Ok(GroupByScalar::TimestampMillisecond(array.value(row))) } DataType::Timestamp(TimeUnit::Microsecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); - Ok(GroupByScalar::TimeMicrosecond(array.value(row))) + let array = col.as_any().downcast_ref::().unwrap(); + Ok(GroupByScalar::TimestampMicrosecond(array.value(row))) } DataType::Timestamp(TimeUnit::Nanosecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); - Ok(GroupByScalar::TimeNanosecond(array.value(row))) + let array = col.as_any().downcast_ref::().unwrap(); + Ok(GroupByScalar::TimestampNanosecond(array.value(row))) } DataType::Date32 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::().unwrap(); Ok(GroupByScalar::Date32(array.value(row))) } DataType::Dictionary(index_type, _) => match **index_type { - DataType::Int8 => dictionary_create_group_by_value::(col, row), - DataType::Int16 => dictionary_create_group_by_value::(col, row), - DataType::Int32 => dictionary_create_group_by_value::(col, row), - DataType::Int64 => dictionary_create_group_by_value::(col, row), - DataType::UInt8 => dictionary_create_group_by_value::(col, row), - DataType::UInt16 => dictionary_create_group_by_value::(col, row), - DataType::UInt32 => dictionary_create_group_by_value::(col, row), - DataType::UInt64 => dictionary_create_group_by_value::(col, row), + DataType::Int8 => dictionary_create_group_by_value::(col, row), + DataType::Int16 => dictionary_create_group_by_value::(col, row), + DataType::Int32 => dictionary_create_group_by_value::(col, row), + DataType::Int64 => dictionary_create_group_by_value::(col, row), + DataType::UInt8 => dictionary_create_group_by_value::(col, row), + DataType::UInt16 => dictionary_create_group_by_value::(col, row), + DataType::UInt32 => dictionary_create_group_by_value::(col, row), + DataType::UInt64 => dictionary_create_group_by_value::(col, row), _ => Err(DataFusionError::NotImplemented(format!( "Unsupported GROUP BY type (dictionary index type 
not supported) {}", col.data_type(), @@ -1232,7 +1163,7 @@ pub(crate) fn create_group_by_values( #[cfg(test)] mod tests { - use arrow::array::Float64Array; + use arrow2::array::Float64Array; use super::*; use crate::physical_plan::expressions::{col, Avg}; @@ -1255,16 +1186,16 @@ mod tests { RecordBatch::try_new( schema.clone(), vec![ - Arc::new(UInt32Array::from(vec![2, 3, 4, 4])), - Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + Arc::new(UInt32Array::from_slice(&[2, 3, 4, 4])), + Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), ], ) .unwrap(), RecordBatch::try_new( schema, vec![ - Arc::new(UInt32Array::from(vec![2, 3, 3, 4])), - Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + Arc::new(UInt32Array::from_slice(&[2, 3, 3, 4])), + Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), ], ) .unwrap(), diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 01551cd4daf4c..6bff236dd9b34 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -21,16 +21,6 @@ use ahash::CallHasher; use ahash::RandomState; -use arrow::{ - array::{ - ArrayData, ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, - Float64Array, LargeStringArray, PrimitiveArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, UInt32BufferBuilder, - UInt32Builder, UInt64BufferBuilder, UInt64Builder, - }, - compute, - datatypes::{TimeUnit, UInt32Type, UInt64Type}, -}; use smallvec::{smallvec, SmallVec}; use std::{any::Any, usize}; use std::{hash::Hasher, sync::Arc}; @@ -41,18 +31,14 @@ use futures::{Stream, StreamExt, TryStreamExt}; use hashbrown::HashMap; use tokio::sync::Mutex; -use arrow::array::Array; -use arrow::datatypes::DataType; -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::*; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; +use arrow2::{array::*, buffer::MutableBuffer}; -use arrow::array::{ - Int16Array, Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, -}; +use arrow2::compute::take; -use super::expressions::col; +use super::{expressions::col, ArrayRef}; use super::{ hash_utils::{build_join_schema, check_join_is_valid, JoinOn, JoinType}, merge::MergeExec, @@ -66,6 +52,10 @@ use super::{ use crate::physical_plan::coalesce_batches::concat_batches; use log::debug; +type SchemaRef = Arc; +type StringArray = Utf8Array; +type LargeStringArray = Utf8Array; + // Maps a `u64` hash value based on the left ["on" values] to a list of indices with this key's value. // // Note that the `u64` keys are not stored in the hashmap (hence the `()` as key), but are only used @@ -501,10 +491,10 @@ fn build_batch_from_indices( schema: &Schema, left: &RecordBatch, right: &RecordBatch, - left_indices: UInt64Array, - right_indices: UInt32Array, + left_indices: Int64Array, + right_indices: Int32Array, column_indices: &[ColumnIndex], -) -> ArrowResult<(RecordBatch, UInt64Array)> { +) -> ArrowResult<(RecordBatch, Int64Array)> { // build the columns of the new [RecordBatch]: // 1. pick whether the column is from the left or right // 2. 
based on the pick, `take` items from the different RecordBatches @@ -513,10 +503,10 @@ fn build_batch_from_indices( for column_index in column_indices { let array = if column_index.is_left { let array = left.column(column_index.index); - compute::take(array.as_ref(), &left_indices, None)? + take::take(array.as_ref(), &left_indices)?.into() } else { let array = right.column(column_index.index); - compute::take(array.as_ref(), &right_indices, None)? + take::take(array.as_ref(), &right_indices)?.into() }; columns.push(array); } @@ -533,7 +523,7 @@ fn build_batch( schema: &Schema, column_indices: &[ColumnIndex], random_state: &RandomState, -) -> ArrowResult<(RecordBatch, UInt64Array)> { +) -> ArrowResult<(RecordBatch, Int64Array)> { let (left_indices, right_indices) = build_join_indexes( &left_data, &batch, @@ -588,7 +578,7 @@ fn build_join_indexes( left_on: &[String], right_on: &[String], random_state: &RandomState, -) -> Result<(UInt64Array, UInt32Array)> { +) -> Result<(Int64Array, Int32Array)> { let keys_values = right_on .iter() .map(|name| Ok(col(name).evaluate(right)?.into_array(right.num_rows()))) @@ -608,8 +598,8 @@ fn build_join_indexes( match join_type { JoinType::Inner => { // Using a buffer builder to avoid slower normal builder - let mut left_indices = UInt64BufferBuilder::new(0); - let mut right_indices = UInt32BufferBuilder::new(0); + let mut left_indices = MutableBuffer::::new(); + let mut right_indices = MutableBuffer::::new(); // Visit all of the right rows for (row, hash_value) in hash_values.iter().enumerate() { @@ -624,29 +614,29 @@ fn build_join_indexes( for &i in indices { // Check hash collisions if equal_rows(i as usize, row, &left_join_values, &keys_values)? { - left_indices.append(i); - right_indices.append(row as u32); + left_indices.push(i as i64); + right_indices.push(row as i32); } } } } - let left = ArrayData::builder(DataType::UInt64) - .len(left_indices.len()) - .add_buffer(left_indices.finish()) - .build(); - let right = ArrayData::builder(DataType::UInt32) - .len(right_indices.len()) - .add_buffer(right_indices.finish()) - .build(); Ok(( - PrimitiveArray::::from(left), - PrimitiveArray::::from(right), + PrimitiveArray::::from_data( + DataType::Int64, + left_indices.into(), + None, + ), + PrimitiveArray::::from_data( + DataType::Int32, + right_indices.into(), + None, + ), )) } JoinType::Left => { - let mut left_indices = UInt64Builder::new(0); - let mut right_indices = UInt32Builder::new(0); + let mut left_indices = MutableBuffer::::new(); + let mut right_indices = MutableBuffer::::new(); // First visit all of the rows for (row, hash_value) in hash_values.iter().enumerate() { @@ -656,17 +646,28 @@ fn build_join_indexes( for &i in indices { // Collision check if equal_rows(i as usize, row, &left_join_values, &keys_values)? 
{ - left_indices.append_value(i)?; - right_indices.append_value(row as u32)?; + left_indices.push(i as i64); + right_indices.push(row as i32); } } }; } - Ok((left_indices.finish(), right_indices.finish())) + Ok(( + PrimitiveArray::::from_data( + DataType::Int64, + left_indices.into(), + None, + ), + PrimitiveArray::::from_data( + DataType::Int32, + right_indices.into(), + None, + ), + )) } JoinType::Right | JoinType::Full => { - let mut left_indices = UInt64Builder::new(0); - let mut right_indices = UInt32Builder::new(0); + let mut left_indices = Primitive::::new(); + let mut right_indices = Primitive::::new(); for (row, hash_value) in hash_values.iter().enumerate() { match left.raw_entry().from_hash(*hash_value, |_| true) { @@ -678,22 +679,25 @@ fn build_join_indexes( &left_join_values, &keys_values, )? { - left_indices.append_value(i)?; - right_indices.append_value(row as u32)?; + left_indices.push(Some(i as i64).as_ref()); + right_indices.push(Some(row as i32).as_ref()); } else { - left_indices.append_null()?; - right_indices.append_value(row as u32)?; + left_indices.push(None); + right_indices.push(Some(row as i32).as_ref()); } } } None => { // when no match, add the row with None for the left side - left_indices.append_null()?; - right_indices.append_value(row as u32)?; + left_indices.push(None); + right_indices.push(Some(row as i32).as_ref()); } } } - Ok((left_indices.finish(), right_indices.finish())) + Ok(( + left_indices.to(DataType::Int64), + right_indices.to(DataType::Int32), + )) } } } @@ -977,7 +981,7 @@ pub fn create_hashes<'a>( multi_col ); } - DataType::Int32 => { + DataType::Int32 | DataType::Date32 => { hash_array_primitive!( Int32Array, col, @@ -987,7 +991,7 @@ pub fn create_hashes<'a>( multi_col ); } - DataType::Int64 => { + DataType::Int64 | DataType::Timestamp(_, None) | DataType::Date64 => { hash_array_primitive!( Int64Array, col, @@ -997,79 +1001,29 @@ pub fn create_hashes<'a>( multi_col ); } - DataType::Float32 => { - hash_array_float!( - Float32Array, + DataType::Boolean => { + hash_array!( + BooleanArray, col, - u32, + u8, hashes_buffer, random_state, multi_col ); } - DataType::Float64 => { + DataType::Float32 => { hash_array_float!( - Float64Array, - col, - u64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Timestamp(TimeUnit::Millisecond, None) => { - hash_array_primitive!( - TimestampMillisecondArray, - col, - i64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Timestamp(TimeUnit::Microsecond, None) => { - hash_array_primitive!( - TimestampMicrosecondArray, - col, - i64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - hash_array_primitive!( - TimestampNanosecondArray, - col, - i64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Date32 => { - hash_array_primitive!( - Date32Array, - col, - i32, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Date64 => { - hash_array_primitive!( - Date64Array, + Float32Array, col, - i64, + u8, hashes_buffer, random_state, multi_col ); } - DataType::Boolean => { - hash_array!( - BooleanArray, + DataType::Float64 => { + hash_array_float!( + Float64Array, col, u8, hashes_buffer, @@ -1116,24 +1070,24 @@ fn produce_unmatched( left_data: &JoinLeftData, ) -> ArrowResult { // Find indices which didn't match any right row (are false) - let unmatched_indices: Vec = visited_left_side + let unmatched_indices: MutableBuffer = visited_left_side .iter() .enumerate() .filter(|&(_, &value)| !value) - 
.map(|(index, _)| index as u64) + .map(|(index, _)| index as i64) .collect(); // generate batches by taking values from the left side and generating columns filled with null on the right side - let indices = UInt64Array::from_iter_values(unmatched_indices); + let indices = Int64Array::from_data(DataType::Int64, unmatched_indices.into(), None); let num_rows = indices.len(); let mut columns: Vec> = Vec::with_capacity(schema.fields().len()); for (idx, column_index) in column_indices.iter().enumerate() { let array = if column_index.is_left { let array = left_data.1.column(column_index.index); - compute::take(array.as_ref(), &indices, None).unwrap() + take::take(array.as_ref(), &indices)?.into() } else { - let datatype = schema.field(idx).data_type(); - arrow::array::new_null_array(datatype, num_rows) + let datatype = schema.field(idx).data_type().clone(); + new_null_array(datatype, num_rows).into() }; columns.push(array); @@ -1173,7 +1127,7 @@ impl Stream for HashJoinStream { match self.join_type { JoinType::Left | JoinType::Full => { left_side.iter().flatten().for_each(|x| { - self.visited_left_side[x as usize] = true; + self.visited_left_side[*x as usize] = true; }); } JoinType::Inner | JoinType::Right => {} @@ -1243,7 +1197,7 @@ mod tests { c: (&str, &Vec), ) -> Arc { let batch = build_table_i32(a, b, c); - let schema = batch.schema(); + let schema = batch.schema().clone(); Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None).unwrap()) } @@ -1381,7 +1335,7 @@ mod tests { ); let batch2 = build_table_i32(("a1", &vec![2]), ("b2", &vec![2]), ("c1", &vec![9])); - let schema = batch1.schema(); + let schema = batch1.schema().clone(); let left = Arc::new( MemoryExec::try_new(&[vec![batch1], vec![batch2]], schema, None).unwrap(), ); @@ -1433,7 +1387,7 @@ mod tests { ); let batch2 = build_table_i32(("a2", &vec![30]), ("b1", &vec![5]), ("c2", &vec![90])); - let schema = batch1.schema(); + let schema = batch1.schema().clone(); let right = Arc::new( MemoryExec::try_new(&[vec![batch1], vec![batch2]], schema, None).unwrap(), ); @@ -1483,7 +1437,7 @@ mod tests { c: (&str, &Vec), ) -> Arc { let batch = build_table_i32(a, b, c); - let schema = batch.schema(); + let schema = batch.schema().clone(); Arc::new( MemoryExec::try_new(&[vec![batch.clone(), batch]], schema, None).unwrap(), ) @@ -1575,7 +1529,7 @@ mod tests { ); let right = build_table_i32(("a2", &vec![]), ("b1", &vec![]), ("c2", &vec![])); let on = &[("b1", "b1")]; - let schema = right.schema(); + let schema = right.schema().clone(); let right = Arc::new(MemoryExec::try_new(&[vec![right]], schema, None).unwrap()); let join = join(left, right, on, &JoinType::Left).unwrap(); @@ -1607,7 +1561,7 @@ mod tests { ); let right = build_table_i32(("a2", &vec![]), ("b2", &vec![]), ("c2", &vec![])); let on = &[("b1", "b2")]; - let schema = right.schema(); + let schema = right.schema().clone(); let right = Arc::new(MemoryExec::try_new(&[vec![right]], schema, None).unwrap()); let join = join(left, right, on, &JoinType::Full).unwrap(); @@ -1784,18 +1738,11 @@ mod tests { &random_state, )?; - let mut left_ids = UInt64Builder::new(0); - left_ids.append_value(0)?; - left_ids.append_value(1)?; - - let mut right_ids = UInt32Builder::new(0); - - right_ids.append_value(0)?; - right_ids.append_value(1)?; - - assert_eq!(left_ids.finish(), l); + let left_ids = Int64Array::from_slice(&[0, 1]); + let right_ids = Int32Array::from_slice(&[0, 1]); - assert_eq!(right_ids.finish(), r); + assert_eq!(left_ids, l); + assert_eq!(right_ids, r); Ok(()) } diff --git 
a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index 7e030af3a124c..5bcf74547d444 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -18,7 +18,7 @@ //! Functionality used both on logical and physical plans use crate::error::{DataFusionError, Result}; -use arrow::datatypes::{Field, Schema}; +use arrow2::datatypes::{Field, Schema}; use std::collections::HashSet; /// All valid types of joins. diff --git a/datafusion/src/physical_plan/limit.rs b/datafusion/src/physical_plan/limit.rs index c56dbe141b2d1..33ba2a28f4783 100644 --- a/datafusion/src/physical_plan/limit.rs +++ b/datafusion/src/physical_plan/limit.rs @@ -29,11 +29,15 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ DisplayFormatType, Distribution, ExecutionPlan, Partitioning, }; -use arrow::array::ArrayRef; -use arrow::compute::limit; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; + +use arrow2::array::Array; +use arrow2::compute::limit::limit; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; + +type SchemaRef = Arc; +type ArrayRef = Arc; use super::{RecordBatchStream, SendableRecordBatchStream}; @@ -218,10 +222,10 @@ impl ExecutionPlan for LocalLimitExec { /// Truncate a RecordBatch to maximum of n rows pub fn truncate_batch(batch: &RecordBatch, n: usize) -> RecordBatch { let limited_columns: Vec = (0..batch.num_columns()) - .map(|i| limit(batch.column(i), n)) + .map(|i| limit(batch.column(i).as_ref(), n).into()) .collect(); - RecordBatch::try_new(batch.schema(), limited_columns).unwrap() + RecordBatch::try_new(batch.schema().clone(), limited_columns).unwrap() } /// A Limit stream limits the stream to up to `limit` rows. diff --git a/datafusion/src/physical_plan/math_expressions.rs b/datafusion/src/physical_plan/math_expressions.rs index cfc239cde6613..42d666bcb621a 100644 --- a/datafusion/src/physical_plan/math_expressions.rs +++ b/datafusion/src/physical_plan/math_expressions.rs @@ -16,42 +16,35 @@ // under the License. //! Math expressions -use super::{ColumnarValue, ScalarValue}; -use crate::error::{DataFusionError, Result}; -use arrow::array::{Float32Array, Float64Array}; -use arrow::datatypes::DataType; use rand::{thread_rng, Rng}; use std::iter; use std::sync::Arc; -macro_rules! downcast_compute_op { - ($ARRAY:expr, $NAME:expr, $FUNC:ident, $TYPE:ident) => {{ - let n = $ARRAY.as_any().downcast_ref::<$TYPE>(); - match n { - Some(array) => { - let res: $TYPE = - arrow::compute::kernels::arity::unary(array, |x| x.$FUNC()); - Ok(Arc::new(res)) - } - _ => Err(DataFusionError::Internal(format!( - "Invalid data type for {}", - $NAME - ))), - } - }}; -} +use arrow2::array::Float64Array; +use arrow2::compute::arity::unary; +use arrow2::datatypes::DataType; + +use super::{ColumnarValue, ScalarValue}; +use crate::error::{DataFusionError, Result}; macro_rules! 
unary_primitive_array_op { ($VALUE:expr, $NAME:expr, $FUNC:ident) => {{ match ($VALUE) { ColumnarValue::Array(array) => match array.data_type() { DataType::Float32 => { - let result = downcast_compute_op!(array, $NAME, $FUNC, Float32Array); - Ok(ColumnarValue::Array(result?)) + let array = array.as_any().downcast_ref().unwrap(); + let array = unary::( + array, + |x| x.$FUNC() as f64, + DataType::Float32, + ); + Ok(ColumnarValue::Array(Arc::new(array))) } DataType::Float64 => { - let result = downcast_compute_op!(array, $NAME, $FUNC, Float64Array); - Ok(ColumnarValue::Array(result?)) + let array = array.as_any().downcast_ref().unwrap(); + let array = + unary::(array, |x| x.$FUNC(), DataType::Float64); + Ok(ColumnarValue::Array(Arc::new(array))) } other => Err(DataFusionError::Internal(format!( "Unsupported data type {:?} for function {}", @@ -114,7 +107,7 @@ pub fn random(args: &[ColumnarValue]) -> Result { }; let mut rng = thread_rng(); let values = iter::repeat_with(|| rng.gen_range(0.0..1.0)).take(len); - let array = Float64Array::from_iter_values(values); + let array = Float64Array::from_trusted_len_values_iter(values); Ok(ColumnarValue::Array(Arc::new(array))) } @@ -122,7 +115,7 @@ pub fn random(args: &[ColumnarValue]) -> Result { mod tests { use super::*; - use arrow::array::{Float64Array, NullArray}; + use arrow2::array::{Float64Array, NullArray}; #[test] fn test_random_expression() { diff --git a/datafusion/src/physical_plan/memory.rs b/datafusion/src/physical_plan/memory.rs index 85d8aeef073c1..e29d3d227fbb2 100644 --- a/datafusion/src/physical_plan/memory.rs +++ b/datafusion/src/physical_plan/memory.rs @@ -27,9 +27,10 @@ use super::{ SendableRecordBatchStream, }; use crate::error::{DataFusionError, Result}; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use async_trait::async_trait; use futures::Stream; diff --git a/datafusion/src/physical_plan/merge.rs b/datafusion/src/physical_plan/merge.rs index c65227c161148..6cc4a8dcd7637 100644 --- a/datafusion/src/physical_plan/merge.rs +++ b/datafusion/src/physical_plan/merge.rs @@ -28,9 +28,9 @@ use futures::Stream; use async_trait::async_trait; -use arrow::record_batch::RecordBatch; -use arrow::{ - datatypes::SchemaRef, +use arrow2::record_batch::RecordBatch; +use arrow2::{ + datatypes::Schema, error::{ArrowError, Result as ArrowResult}, }; @@ -41,6 +41,8 @@ use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; use super::SendableRecordBatchStream; use pin_project_lite::pin_project; +type SchemaRef = Arc; + /// Merge execution plan executes partitions in parallel and combines them into a single /// partition. No guarantees are made about the order of the resulting partition. 
#[derive(Debug)] @@ -128,7 +130,8 @@ impl ExecutionPlan for MergeExec { Err(e) => { // If send fails, plan being torn // down, no place to send the error - let arrow_error = ArrowError::ExternalError(Box::new(e)); + let arrow_error = + ArrowError::External("".to_string(), Box::new(e)); sender.send(Err(arrow_error)).await.ok(); return; } diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index e915b2c257ddc..436f611bc4e13 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -25,10 +25,13 @@ use std::{any::Any, pin::Pin}; use crate::execution::context::ExecutionContextState; use crate::logical_plan::LogicalPlan; use crate::{error::Result, scalar::ScalarValue}; -use arrow::datatypes::{DataType, Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; -use arrow::{array::ArrayRef, datatypes::Field}; +use arrow2::array::Array; +use arrow2::datatypes::{DataType, Field, Schema}; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; + +type ArrayRef = Arc; +type SchemaRef = Arc; use async_trait::async_trait; pub use display::DisplayFormatType; @@ -37,7 +40,7 @@ use futures::stream::Stream; use self::{display::DisplayableExecutionPlan, merge::MergeExec}; use hashbrown::HashMap; -/// Trait for types that stream [arrow::record_batch::RecordBatch] +/// Trait for types that stream [arrow2::record_batch::RecordBatch] pub trait RecordBatchStream: Stream> { /// Returns the schema of this `RecordBatchStream`. /// @@ -527,6 +530,7 @@ pub mod string_expressions; pub mod type_coercion; pub mod udaf; pub mod udf; + #[cfg(feature = "unicode_expressions")] pub mod unicode_expressions; pub mod union; diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index dd5e77bc21eb9..051d800dea470 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -23,35 +23,23 @@ use std::sync::Arc; use std::task::{Context, Poll}; use std::{any::Any, collections::HashSet}; -use super::{ - planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr, RecordBatchStream, - SendableRecordBatchStream, -}; +use super::{RecordBatchStream, SendableRecordBatchStream}; use crate::physical_plan::{common, DisplayFormatType, ExecutionPlan, Partitioning}; use crate::{ error::{DataFusionError, Result}, - execution::context::ExecutionContextState, logical_plan::{Expr, Operator}, optimizer::utils, }; -use arrow::record_batch::RecordBatch; -use arrow::{ - array::new_null_array, + +use arrow2::{ + array::*, + datatypes::*, error::{ArrowError, Result as ArrowResult}, -}; -use arrow::{ - array::{make_array, ArrayData, ArrayRef, BooleanArray, BooleanBufferBuilder}, - buffer::MutableBuffer, - datatypes::{DataType, Field, Schema, SchemaRef}, -}; -use parquet::file::{ - metadata::RowGroupMetaData, - reader::{FileReader, SerializedFileReader}, - statistics::Statistics as ParquetStatistics, + io::parquet::read::{self, CompressedPage, FileMetaData}, + record_batch::RecordBatch, }; use fmt::Debug; -use parquet::arrow::{ArrowReader, ParquetFileArrowReader}; use tokio::{ sync::mpsc::{channel, Receiver, Sender}, task, @@ -62,23 +50,22 @@ use crate::datasource::datasource::{ColumnStatistics, Statistics}; use async_trait::async_trait; use futures::stream::{Stream, StreamExt}; +type SchemaRef = Arc; +type ArrayRef = Arc; + /// Execution plan for scanning one or more Parquet partitions #[derive(Debug, Clone)] pub struct ParquetExec { 
/// Parquet partitions to read partitions: Vec, /// Schema after projection is applied - schema: SchemaRef, + schema: Arc, /// Projection for which columns to load projection: Vec, - /// Batch size - batch_size: usize, /// Statistics for the data set (sum of statistics for all partitions) statistics: Statistics, - /// Optional predicate builder - predicate_builder: Option, /// Optional limit of the number of rows - limit: Option, + limit: usize, } /// Represents one partition of a Parquet data set and this currently means one Parquet file. @@ -93,7 +80,7 @@ pub struct ParquetExec { #[derive(Debug, Clone)] pub struct ParquetPartition { /// The Parquet filename for this partition - pub filenames: Vec, + pub filename: String, /// Statistics for this partition pub statistics: Statistics, } @@ -105,7 +92,6 @@ impl ParquetExec { path: &str, projection: Option>, predicate: Option, - batch_size: usize, max_concurrency: usize, limit: Option, ) -> Result { @@ -126,7 +112,6 @@ impl ParquetExec { &filenames, projection, predicate, - batch_size, max_concurrency, limit, ) @@ -139,80 +124,45 @@ impl ParquetExec { filenames: &[&str], projection: Option>, predicate: Option, - batch_size: usize, max_concurrency: usize, limit: Option, ) -> Result { + let limit = limit.unwrap_or(usize::MAX); // build a list of Parquet partitions with statistics and gather all unique schemas // used in this data set - let mut schemas: Vec = vec![]; + let mut schemas: Vec> = vec![]; let mut partitions = Vec::with_capacity(max_concurrency); - let filenames: Vec = filenames.iter().map(|s| s.to_string()).collect(); - let chunks = split_files(&filenames, max_concurrency); let mut num_rows = 0; let mut total_byte_size = 0; - let mut null_counts = Vec::new(); let mut limit_exhausted = false; - for chunk in chunks { - let mut filenames: Vec = - chunk.iter().map(|x| x.to_string()).collect(); - let mut total_files = 0; - for filename in &filenames { - total_files += 1; - let file = File::open(filename)?; - let file_reader = Arc::new(SerializedFileReader::new(file)?); - let mut arrow_reader = ParquetFileArrowReader::new(file_reader); - let meta_data = arrow_reader.get_metadata(); - // collect all the unique schemas in this data set - let schema = arrow_reader.get_schema()?; - let num_fields = schema.fields().len(); - if schemas.is_empty() || schema != schemas[0] { - schemas.push(schema); - null_counts = vec![0; num_fields] - } - for row_group_meta in meta_data.row_groups() { - num_rows += row_group_meta.num_rows(); - total_byte_size += row_group_meta.total_byte_size(); - - // Currently assumes every Parquet file has same schema - // https://issues.apache.org/jira/browse/ARROW-11017 - let columns_null_counts = row_group_meta - .columns() - .iter() - .flat_map(|c| c.statistics().map(|stats| stats.null_count())); - - for (i, cnt) in columns_null_counts.enumerate() { - null_counts[i] += cnt - } - if limit.map(|x| num_rows >= x as i64).unwrap_or(false) { - limit_exhausted = true; - break; - } - } - } + for filename in filenames { + let mut file = File::open(filename)?; + let file_metadata = read::read_metadata(&mut file)?; + let schema = read::get_schema(&file_metadata)?; + let schema = Arc::new(schema); - let column_stats = null_counts + let row_count: i64 = (&file_metadata.row_groups) .iter() - .map(|null_count| ColumnStatistics { - null_count: Some(*null_count as usize), - max_value: None, - min_value: None, - distinct_count: None, - }) - .collect(); + .map(|group| group.num_rows()) + .sum(); + let row_count = row_count as usize; + 
num_rows += row_count; + + if schemas.is_empty() || schema != schemas[0] { + schemas.push(schema); + } let statistics = Statistics { - num_rows: Some(num_rows as usize), - total_byte_size: Some(total_byte_size as usize), - column_statistics: Some(column_stats), + num_rows: Some(row_count), + total_byte_size: None, + column_statistics: None, }; // remove files that are not needed in case of limit - filenames.truncate(total_files); partitions.push(ParquetPartition { - filenames, + filename: filename.to_string(), statistics, }); - if limit_exhausted { + if num_rows > limit { break; } } @@ -228,28 +178,16 @@ impl ParquetExec { ))); } let schema = schemas[0].clone(); - let predicate_builder = predicate.and_then(|predicate_expr| { - RowGroupPredicateBuilder::try_new(&predicate_expr, schema.clone()).ok() - }); - Ok(Self::new( - partitions, - schema, - projection, - predicate_builder, - batch_size, - limit, - )) + Ok(Self::new(partitions, schema, projection, limit)) } /// Create a new Parquet reader execution plan with provided partitions and schema pub fn new( partitions: Vec, - schema: Schema, + schema: Arc, projection: Option>, - predicate_builder: Option, - batch_size: usize, - limit: Option, + limit: usize, ) -> Self { let projection = match projection { Some(p) => p, @@ -310,8 +248,6 @@ impl ParquetExec { partitions, schema: Arc::new(projected_schema), projection, - predicate_builder, - batch_size, statistics, limit, } @@ -327,11 +263,6 @@ impl ParquetExec { &self.projection } - /// Batch size - pub fn batch_size(&self) -> usize { - self.batch_size - } - /// Statistics for the data set (sum of statistics for all partitions) pub fn statistics(&self) -> &Statistics { &self.statistics @@ -340,16 +271,16 @@ impl ParquetExec { impl ParquetPartition { /// Create a new parquet partition - pub fn new(filenames: Vec, statistics: Statistics) -> Self { + pub fn new(filename: String, statistics: Statistics) -> Self { Self { - filenames, + filename, statistics, } } /// The Parquet filename for this partition - pub fn filenames(&self) -> &[String] { - &self.filenames + pub fn filename(&self) -> &String { + &self.filename } /// Statistics for this partition @@ -358,20 +289,22 @@ impl ParquetPartition { } } +/* #[derive(Debug, Clone)] /// Predicate builder used for generating of predicate functions, used to filter row group metadata pub struct RowGroupPredicateBuilder { - parquet_schema: Schema, + parquet_schema: Arc, predicate_expr: Arc, stat_column_req: Vec<(String, StatisticsType, Field)>, } + impl RowGroupPredicateBuilder { /// Try to create a new instance of PredicateExpressionBuilder. /// This will translate the filter expression into a statistics predicate expression /// (for example (column / 2) = 4 becomes (column_min / 2) <= 4 && 4 <= (column_max / 2)), /// then convert it to a DataFusion PhysicalExpression and cache it for later use by build_row_group_predicate. - pub fn try_new(expr: &Expr, parquet_schema: Schema) -> Result { + pub fn try_new(expr: &Expr, parquet_schema: Arc) -> Result { // build predicate expression once let mut stat_column_req = Vec::<(String, StatisticsType, Field)>::new(); let logical_predicate_expr = @@ -454,36 +387,7 @@ impl RowGroupPredicateBuilder { } } } - -/// Build a RecordBatch from a list of RowGroupMetadata structs, -/// creating arrays, one for each statistics column, -/// as requested in the stat_column_req parameter. 
-fn build_statistics_record_batch( - row_groups: &[RowGroupMetaData], - parquet_schema: &Schema, - stat_column_req: &[(String, StatisticsType, Field)], -) -> Result { - let mut fields = Vec::::new(); - let mut arrays = Vec::::new(); - for (column_name, statistics_type, stat_field) in stat_column_req { - if let Some((column_index, _)) = parquet_schema.column_with_name(column_name) { - let statistics = row_groups - .iter() - .map(|g| g.column(column_index).statistics()) - .collect::>(); - let array = build_statistics_array( - &statistics, - *statistics_type, - stat_field.data_type(), - ); - fields.push(stat_field.clone()); - arrays.push(array); - } - } - let schema = Arc::new(Schema::new(fields)); - RecordBatch::try_new(schema, arrays) - .map_err(|err| DataFusionError::Plan(err.to_string())) -} +*/ struct StatisticsExpressionBuilder<'a> { column_name: String, @@ -712,89 +616,25 @@ enum StatisticsType { Max, } -fn build_statistics_array( - statistics: &[Option<&ParquetStatistics>], - statistics_type: StatisticsType, - data_type: &DataType, -) -> ArrayRef { - let statistics_count = statistics.len(); - let first_group_stats = statistics.iter().find(|s| s.is_some()); - let first_group_stats = if let Some(Some(statistics)) = first_group_stats { - // found first row group with statistics defined - statistics - } else { - // no row group has statistics defined - return new_null_array(data_type, statistics_count); - }; - - let (data_size, arrow_type) = match first_group_stats { - ParquetStatistics::Int32(_) => (std::mem::size_of::(), DataType::Int32), - ParquetStatistics::Int64(_) => (std::mem::size_of::(), DataType::Int64), - ParquetStatistics::Float(_) => (std::mem::size_of::(), DataType::Float32), - ParquetStatistics::Double(_) => (std::mem::size_of::(), DataType::Float64), - ParquetStatistics::ByteArray(_) if data_type == &DataType::Utf8 => { - (0, DataType::Utf8) - } - _ => { - // type of statistics not supported - return new_null_array(data_type, statistics_count); - } - }; - - let statistics = statistics.iter().map(|s| { - s.filter(|s| s.has_min_max_set()) - .map(|s| match statistics_type { - StatisticsType::Min => s.min_bytes(), - StatisticsType::Max => s.max_bytes(), +type Payload = Vec>; + +// Task of the producer of compressed pages. This performs minimal CPU work +fn producer_task(path: &str, response_tx: Sender) -> Result<()> { + let mut file = File::open(path)?; + let metadata = read::read_metadata(&mut file)?; + for row_group in 0..metadata.row_groups.len() { + let columns = (0..metadata.schema().num_columns()) + .map(|column| { + Ok( + read::get_page_iterator(&metadata, row_group, column, &mut file)? 
+ .map(|x| x.map_err(|x| ArrowError::from_external_error(x))) + .collect::>>()?, + ) }) - }); - - if arrow_type == DataType::Utf8 { - let data_size = statistics - .clone() - .map(|x| x.map(|b| b.len()).unwrap_or(0)) - .sum(); - let mut builder = - arrow::array::StringBuilder::with_capacity(statistics_count, data_size); - let string_statistics = - statistics.map(|x| x.and_then(|bytes| std::str::from_utf8(bytes).ok())); - for maybe_string in string_statistics { - match maybe_string { - Some(string_value) => builder.append_value(string_value).unwrap(), - None => builder.append_null().unwrap(), - }; - } - return Arc::new(builder.finish()); - } - - let mut data_buffer = MutableBuffer::new(statistics_count * data_size); - let mut bitmap_builder = BooleanBufferBuilder::new(statistics_count); - let mut null_count = 0; - for s in statistics { - if let Some(stat_data) = s { - bitmap_builder.append(true); - data_buffer.extend_from_slice(stat_data); - } else { - bitmap_builder.append(false); - data_buffer.resize(data_buffer.len() + data_size, 0); - null_count += 1; - } + .collect::>>()?; + response_tx.blocking_send(columns); } - - let mut builder = ArrayData::builder(arrow_type) - .len(statistics_count) - .add_buffer(data_buffer.into()); - if null_count > 0 { - builder = builder.null_bit_buffer(bitmap_builder.finish()); - } - let array_data = builder.build(); - let statistics_array = make_array(array_data); - if statistics_array.data_type() == data_type { - return statistics_array; - } - // cast statistics array to required data type - arrow::compute::cast(&statistics_array, data_type) - .unwrap_or_else(|_| new_null_array(data_type, statistics_count)) + Ok(()) } #[async_trait] @@ -835,32 +675,20 @@ impl ExecutionPlan for ParquetExec { async fn execute(&self, partition: usize) -> Result { // because the parquet implementation is not thread-safe, it is necessary to execute // on a thread and communicate with channels - let (response_tx, response_rx): ( - Sender>, - Receiver>, - ) = channel(2); + let (response_tx, response_rx): (Sender, Receiver) = channel(2); - let filenames = self.partitions[partition].filenames.clone(); + let path = self.partitions[partition].filename.clone(); let projection = self.projection.clone(); - let predicate_builder = self.predicate_builder.clone(); - let batch_size = self.batch_size; let limit = self.limit; - task::spawn_blocking(move || { - if let Err(e) = read_files( - &filenames, - &projection, - &predicate_builder, - batch_size, - response_tx, - limit, - ) { - println!("Parquet reader thread terminated due to error: {:?}", e); - } - }); + let mut file = File::open(path.clone())?; + let metadata = read::read_metadata(&mut file)?; + + task::spawn_blocking(move || producer_task(&path, response_tx).unwrap()); Ok(Box::pin(ParquetStream { schema: self.schema.clone(), + metadata, inner: ReceiverStream::new(response_rx), })) } @@ -875,15 +703,12 @@ impl ExecutionPlan for ParquetExec { let files: Vec<_> = self .partitions .iter() - .map(|pp| pp.filenames.iter()) - .flatten() - .map(|s| s.as_str()) + .map(|pp| pp.filename.as_str()) .collect(); write!( f, - "ParquetExec: batch_size={}, limit={:?}, partitions=[{}]", - self.batch_size, + "ParquetExec: limit={:?}, partitions=[{}]", self.limit, files.join(", ") ) @@ -892,84 +717,33 @@ impl ExecutionPlan for ParquetExec { } } -fn send_result( - response_tx: &Sender>, - result: ArrowResult, -) -> Result<()> { - // Note this function is running on its own blockng tokio thread so blocking here is ok. 
- response_tx - .blocking_send(result) - .map_err(|e| DataFusionError::Execution(e.to_string()))?; - Ok(()) -} - -fn read_files( - filenames: &[String], - projection: &[usize], - predicate_builder: &Option, - batch_size: usize, - response_tx: Sender>, - limit: Option, -) -> Result<()> { - let mut total_rows = 0; - 'outer: for filename in filenames { - let file = File::open(&filename)?; - let mut file_reader = SerializedFileReader::new(file)?; - if let Some(predicate_builder) = predicate_builder { - let row_group_predicate = predicate_builder - .build_row_group_predicate(file_reader.metadata().row_groups()); - file_reader.filter_row_groups(&row_group_predicate); - } - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader)); - let mut batch_reader = arrow_reader - .get_record_reader_by_columns(projection.to_owned(), batch_size)?; - loop { - match batch_reader.next() { - Some(Ok(batch)) => { - //println!("ParquetExec got new batch from {}", filename); - total_rows += batch.num_rows(); - send_result(&response_tx, Ok(batch))?; - if limit.map(|l| total_rows >= l).unwrap_or(false) { - break 'outer; - } - } - None => { - break; - } - Some(Err(e)) => { - let err_msg = format!( - "Error reading batch from {}: {}", - filename, - e.to_string() - ); - // send error to operator - send_result( - &response_tx, - Err(ArrowError::ParquetError(err_msg.clone())), - )?; - // terminate thread with error - return Err(DataFusionError::Execution(err_msg)); - } - } - } - } - - // finished reading files (dropping response_tx will close - // channel) - Ok(()) -} - -fn split_files(filenames: &[String], n: usize) -> Vec<&[String]> { - let mut chunk_size = filenames.len() / n; - if filenames.len() % n > 0 { - chunk_size += 1; - } - filenames.chunks(chunk_size).collect() +struct ParquetStream { + schema: SchemaRef, + metadata: FileMetaData, + inner: ReceiverStream, } -struct ParquetStream { +fn deserialize( + columns: Vec>, + metadata: &FileMetaData, schema: SchemaRef, - inner: ReceiverStream>, +) -> ArrowResult { + let data_types = schema.fields().iter().map(|field| field.data_type()); + let descriptors = metadata.row_groups[0] + .columns() + .iter() + .map(|x| x.descriptor()); + let columns = columns + .into_iter() + .zip(descriptors) + .zip(data_types) + .map(|((pages, descriptor), type_)| { + let array = + read::page_iter_to_array(pages.into_iter().map(|x| Ok(x)), descriptor)?; + arrow2::compute::cast::cast(array.as_ref(), type_).map(|x| x.into()) + }) + .collect::>>()?; + RecordBatch::try_new(schema, columns) } impl Stream for ParquetStream { @@ -979,7 +753,9 @@ impl Stream for ParquetStream { mut self: std::pin::Pin<&mut Self>, cx: &mut Context<'_>, ) -> Poll> { - self.inner.poll_next_unpin(cx) + self.inner + .poll_next_unpin(cx) + .map(|x| x.map(|x| deserialize(x, &self.metadata, self.schema.clone()))) } } @@ -989,10 +765,11 @@ impl RecordBatchStream for ParquetStream { } } +/* #[cfg(test)] mod tests { use super::*; - use arrow::array::{Int32Array, StringArray}; + use arrow2::array::{Int32Array, StringArray}; use futures::StreamExt; use parquet::basic::Type as PhysicalType; use parquet::schema::types::SchemaDescPtr; @@ -1035,7 +812,7 @@ mod tests { #[tokio::test] async fn test() -> Result<()> { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = crate::test::parquet_test_data(); let filename = format!("{}/alltypes_plain.parquet", testdata); let parquet_exec = ParquetExec::try_from_path( &filename, @@ -1545,3 +1322,4 @@ mod tests { 
Arc::new(SchemaDescriptor::new(Arc::new(schema))) } } +*/ diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 9e7dc7172b820..f0440da12f494 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -48,13 +48,15 @@ use crate::{ error::{DataFusionError, Result}, physical_plan::displayable, }; -use arrow::{compute::can_cast_types, datatypes::DataType}; -use arrow::compute::SortOptions; -use arrow::datatypes::{Schema, SchemaRef}; +use arrow2::compute::cast::can_cast_types; +use arrow2::compute::sort::SortOptions; +use arrow2::datatypes::*; use expressions::col; use log::debug; +type SchemaRef = Arc; + /// This trait exposes the ability to plan an [`ExecutionPlan`] out of a [`LogicalPlan`]. pub trait ExtensionPlanner { /// Create a physical plan for a [`UserDefinedLogicalNode`]. @@ -786,7 +788,7 @@ mod tests { logical_plan::{col, lit, sum, LogicalPlanBuilder}, physical_plan::SendableRecordBatchStream, }; - use arrow::datatypes::{DataType, Field, SchemaRef}; + use arrow2::datatypes::{DataType, Field}; use async_trait::async_trait; use fmt::Debug; use std::{any::Any, fmt}; @@ -804,7 +806,7 @@ mod tests { #[test] fn test_all_operators() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -844,7 +846,7 @@ mod tests { #[test] fn test_with_csv_plan() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -863,7 +865,7 @@ mod tests { #[test] fn errors() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -965,7 +967,7 @@ mod tests { #[test] fn in_list_types() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -1013,7 +1015,7 @@ mod tests { #[test] fn hash_agg_input_schema() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -1036,7 +1038,7 @@ mod tests { #[test] fn hash_agg_group_by_partitioned() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); diff --git a/datafusion/src/physical_plan/projection.rs b/datafusion/src/physical_plan/projection.rs index c0d78ff7168bf..e97bb7ca0e419 100644 --- a/datafusion/src/physical_plan/projection.rs +++ b/datafusion/src/physical_plan/projection.rs @@ -29,9 +29,12 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, }; -use arrow::datatypes::{Field, Schema, SchemaRef}; -use 
arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; + +use arrow2::datatypes::{Field, Schema}; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; + +type SchemaRef = Arc; use super::{RecordBatchStream, SendableRecordBatchStream}; use async_trait::async_trait; diff --git a/datafusion/src/physical_plan/regex_expressions.rs b/datafusion/src/physical_plan/regex_expressions.rs index b526e7259ef61..2bd9f7e15dd3e 100644 --- a/datafusion/src/physical_plan/regex_expressions.rs +++ b/datafusion/src/physical_plan/regex_expressions.rs @@ -25,32 +25,34 @@ use std::any::type_name; use std::sync::Arc; use crate::error::{DataFusionError, Result}; -use arrow::array::{ArrayRef, GenericStringArray, StringOffsetSizeTrait}; -use arrow::compute; +use arrow2::array::{Array, Offset, Utf8Array}; +use arrow2::compute; use hashbrown::HashMap; use regex::Regex; +type ArrayRef = Arc; + macro_rules! downcast_string_arg { ($ARG:expr, $NAME:expr, $T:ident) => {{ $ARG.as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal(format!( "could not cast {} to {}", $NAME, - type_name::>() + type_name::>() )) })? }}; } /// extract a specific group from a string column, using a regular expression -pub fn regexp_match(args: &[ArrayRef]) -> Result { +pub fn regexp_match(args: &[ArrayRef]) -> Result { match args.len() { - 2 => compute::regexp_match(downcast_string_arg!(args[0], "string", T), downcast_string_arg!(args[1], "pattern", T), None) - .map_err(DataFusionError::ArrowError), - 3 => compute::regexp_match(downcast_string_arg!(args[0], "string", T), downcast_string_arg!(args[1], "pattern", T), Some(downcast_string_arg!(args[1], "flags", T))) - .map_err(DataFusionError::ArrowError), + 2 => compute::regex_match::regex_match(downcast_string_arg!(args[0], "string", T), downcast_string_arg!(args[1], "pattern", T)) + .map_err(DataFusionError::ArrowError).map(|x| Arc::new(x) as Arc), + 3 => compute::regex_match::regex_match(downcast_string_arg!(args[0], "string", T), downcast_string_arg!(args[1], "pattern", T)) + .map_err(DataFusionError::ArrowError).map(|x| Arc::new(x) as Arc), other => Err(DataFusionError::Internal(format!( "regexp_match was called with {} arguments. It requires at least 2 and at most 3.", other @@ -72,7 +74,7 @@ fn regex_replace_posix_groups(replacement: &str) -> String { /// Replaces substring(s) matching a POSIX regular expression. 
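regexp_replace compiles each distinct pattern only once, memoizing the `Regex` in a `HashMap`, and rewrites PostgreSQL-style `\N` group references into the `regex` crate's `${N}` syntax before substituting (that is the job of `regex_replace_posix_groups`). A minimal per-row sketch of both ideas; the conversion rule shown here is a simplification and `posix_groups_to_dollar` is an illustrative name, not the helper above.

```rust
use regex::Regex;
use std::collections::HashMap;

/// Rewrite PostgreSQL-style backreferences (`\1`) into `regex`-crate syntax (`${1}`).
/// This simple rule is illustrative; the real helper may treat escapes differently.
fn posix_groups_to_dollar(replacement: &str) -> String {
    Regex::new(r"\\(\d+)")
        .unwrap()
        .replace_all(replacement, "$${$1}")
        .into_owned()
}

fn main() {
    // Compiling a Regex is expensive, so cache each distinct pattern once per call.
    let mut compiled: HashMap<String, Regex> = HashMap::new();
    let rows = [("Thomas", r".[mN]a.", "M"), ("abcdef", r"(ab)c", r"\1X")];
    for (value, pattern, replacement) in rows {
        let re = compiled
            .entry(pattern.to_string())
            .or_insert_with(|| Regex::new(pattern).unwrap());
        let replacement = posix_groups_to_dollar(replacement);
        // Prints "ThM" and "abXdef".
        println!("{}", re.replace_all(value, replacement.as_str()));
    }
}
```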
/// /// example: `regexp_replace('Thomas', '.[mN]a.', 'M') = 'ThM'` -pub fn regexp_replace(args: &[ArrayRef]) -> Result { +pub fn regexp_replace(args: &[ArrayRef]) -> Result { // creating Regex is expensive so create hashmap for memoization let mut patterns: HashMap = HashMap::new(); @@ -108,7 +110,7 @@ pub fn regexp_replace(args: &[ArrayRef]) -> Result Ok(None) }) - .collect::>>()?; + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } @@ -160,7 +162,7 @@ pub fn regexp_replace(args: &[ArrayRef]) -> Result Ok(None) }) - .collect::>>()?; + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 2599690bfc003..cf81807c2e6ac 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -25,9 +25,12 @@ use std::{any::Any, collections::HashMap, vec}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; -use arrow::record_batch::RecordBatch; -use arrow::{array::Array, error::Result as ArrowResult}; -use arrow::{compute::take, datatypes::SchemaRef}; + +use arrow2::{ + array::*, buffer::MutableBuffer, compute::take, datatypes::*, + error::Result as ArrowResult, record_batch::RecordBatch, +}; + use tokio_stream::wrappers::UnboundedReceiverStream; use super::{hash_join::create_hashes, RecordBatchStream, SendableRecordBatchStream}; @@ -41,6 +44,7 @@ use tokio::sync::{ }; use tokio::task::JoinHandle; +type SchemaRef = Arc; type MaybeBatch = Option>; /// The repartition operator maps N input partitions to M output partitions based on a @@ -170,32 +174,38 @@ impl ExecutionPlan for RepartitionExec { // Hash arrays and compute buckets based on number of partitions let hashes = create_hashes(&arrays, &random_state, hashes_buf)?; - let mut indices = vec![vec![]; num_output_partitions]; + let mut indices = (0..num_output_partitions) + .map(|_| MutableBuffer::::new()) + .collect::>(); for (index, hash) in hashes.iter().enumerate() { - indices - [(*hash % num_output_partitions as u64) as usize] - .push(index as u64) + let i = + (*hash % num_output_partitions as u64) as usize; + indices[i].push(index as i64) } for (num_output_partition, partition_indices) in indices.into_iter().enumerate() { - let indices = partition_indices.into(); + let indices = Int64Array::from_data( + DataType::Int64, + partition_indices.into(), + None, + ); // Produce batches based on indices let columns = input_batch .columns() .iter() .map(|c| { - take(c.as_ref(), &indices, None).map_err( - |e| { + take::take(c.as_ref(), &indices) + .map_err(|e| { DataFusionError::Execution( e.to_string(), ) - }, - ) + }) + .map(|x| x.into()) }) .collect::>>>()?; let output_batch = RecordBatch::try_new( - input_batch.schema(), + input_batch.schema().clone(), columns, ); let tx = txs.get_mut(&num_output_partition).unwrap(); @@ -310,9 +320,9 @@ impl RecordBatchStream for RepartitionStream { mod tests { use super::*; use crate::physical_plan::memory::MemoryExec; - use arrow::array::UInt32Array; - use arrow::datatypes::{DataType, Field, Schema}; - use arrow::record_batch::RecordBatch; + use arrow2::array::UInt32Array; + use arrow2::datatypes::{DataType, Field, Schema}; + use arrow2::record_batch::RecordBatch; #[tokio::test] async fn one_to_many_round_robin() -> Result<()> { @@ -415,7 +425,7 @@ mod tests { fn create_batch(schema: &Arc) -> RecordBatch { RecordBatch::try_new( schema.clone(), - vec![Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 
5, 6, 7, 8]))], + vec![Arc::new(UInt32Array::from_slice(&[1, 2, 3, 4, 5, 6, 7, 8]))], ) .unwrap() } diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 8229060190215..602d09db0a79f 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -30,12 +30,12 @@ use hashbrown::HashMap; use pin_project_lite::pin_project; -pub use arrow::compute::SortOptions; -use arrow::compute::{concat, lexsort_to_indices, take, SortColumn, TakeOptions}; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; -use arrow::{array::ArrayRef, error::ArrowError}; +pub use arrow2::compute::sort::SortOptions; +use arrow2::compute::{concat, sort::lexsort_to_indices, sort::SortColumn, take}; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; +use arrow2::{array::Array, error::ArrowError}; use super::{RecordBatchStream, SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; @@ -44,6 +44,9 @@ use crate::physical_plan::{ common, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, SQLMetric, }; +type SchemaRef = Arc; +type ArrayRef = Arc; + /// Sort execution plan #[derive(Debug)] pub struct SortExec { @@ -182,26 +185,33 @@ fn sort_batches( .iter() .enumerate() .map(|(i, _)| { - concat( + concat::concatenate( &batches .iter() .map(|batch| batch.column(i).as_ref()) .collect::>(), ) + .map(|x| x.into()) }) .collect::>>()?, )?; + let columns = expr + .iter() + .map(|e| e.evaluate_to_sort_column(&combined_batch)) + .collect::>>() + .map_err(DataFusionError::into_arrow_external_error)?; + let columns = columns + .iter() + .map(|x| SortColumn { + values: x.values.as_ref(), + options: x.options, + }) + .collect::>(); + // sort combined record batch // TODO: pushup the limit expression to sort - let indices = lexsort_to_indices( - &expr - .iter() - .map(|e| e.evaluate_to_sort_column(&combined_batch)) - .collect::>>() - .map_err(DataFusionError::into_arrow_external_error)?, - None, - )?; + let indices = lexsort_to_indices(&columns)?; // reorder all rows based on sorted indices let sorted_batch = RecordBatch::try_new( @@ -209,17 +219,7 @@ fn sort_batches( combined_batch .columns() .iter() - .map(|column| { - take( - column.as_ref(), - &indices, - // disable bound check overhead since indices are already generated from - // the same record batch - Some(TakeOptions { - check_bounds: false, - }), - ) - }) + .map(|column| take::take(column.as_ref(), &indices).map(|x| x.into())) .collect::>>()?, ); sorted_batch.map(Some) @@ -289,7 +289,9 @@ impl Stream for SortStream { // check for error in receiving channel and unwrap actual result let result = match result { - Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))), // error receiving + Err(e) => { + Some(Err(ArrowError::External("".to_string(), Box::new(e)))) + } // error receiving Ok(result) => result.transpose(), }; @@ -321,8 +323,8 @@ mod tests { csv::{CsvExec, CsvReadOptions}, }; use crate::test; - use arrow::array::*; - use arrow::datatypes::*; + use arrow2::array::*; + use arrow2::datatypes::*; #[tokio::test] async fn test_sort() -> Result<()> { @@ -363,15 +365,18 @@ mod tests { let columns = result[0].columns(); - let c1 = as_string_array(&columns[0]); + let c1 = columns[0] + .as_any() + .downcast_ref::>() + .unwrap(); assert_eq!(c1.value(0), "a"); assert_eq!(c1.value(c1.len() - 1), "e"); - let c2 = as_primitive_array::(&columns[1]); + let c2 = 
columns[1].as_any().downcast_ref::().unwrap(); assert_eq!(c2.value(0), 1); assert_eq!(c2.value(c2.len() - 1), 5,); - let c7 = as_primitive_array::(&columns[6]); + let c7 = columns[6].as_any().downcast_ref::().unwrap(); assert_eq!(c7.value(0), 15); assert_eq!(c7.value(c7.len() - 1), 254,); @@ -445,8 +450,8 @@ mod tests { assert_eq!(DataType::Float32, *columns[0].data_type()); assert_eq!(DataType::Float64, *columns[1].data_type()); - let a = as_primitive_array::(&columns[0]); - let b = as_primitive_array::(&columns[1]); + let a = columns[0].as_any().downcast_ref::().unwrap(); + let b = columns[1].as_any().downcast_ref::().unwrap(); // convert result to strings to allow comparing to expected result containing NaN let result: Vec<(Option, Option)> = (0..result[0].num_rows()) diff --git a/datafusion/src/physical_plan/string_expressions.rs b/datafusion/src/physical_plan/string_expressions.rs index 882fe30502fdf..7c82b8d269266 100644 --- a/datafusion/src/physical_plan/string_expressions.rs +++ b/datafusion/src/physical_plan/string_expressions.rs @@ -28,25 +28,27 @@ use crate::{ error::{DataFusionError, Result}, scalar::ScalarValue, }; -use arrow::{ +use arrow2::{ array::{ - Array, ArrayRef, BooleanArray, GenericStringArray, Int32Array, Int64Array, - PrimitiveArray, StringArray, StringOffsetSizeTrait, + Array, BooleanArray, Int32Array, Int64Array, Offset, PrimitiveArray, Utf8Array, }, - datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType}, + datatypes::DataType, }; use super::ColumnarValue; +type StringArray = Utf8Array; +type ArrayRef = Arc; + macro_rules! downcast_string_arg { ($ARG:expr, $NAME:expr, $T:ident) => {{ $ARG.as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal(format!( "could not cast {} to {}", $NAME, - type_name::>() + type_name::>() )) })? }}; @@ -90,20 +92,20 @@ macro_rules! downcast_vec { } /// applies a unary expression to `args[0]` that is expected to be downcastable to -/// a `GenericStringArray` and returns a `GenericStringArray` (which may have a different offset) +/// a `Utf8Array` and returns a `Utf8Array` (which may have a different offset) /// # Errors /// This function errors when: /// * the number of arguments is not 1 -/// * the first argument is not castable to a `GenericStringArray` +/// * the first argument is not castable to a `Utf8Array` pub(crate) fn unary_string_function<'a, T, O, F, R>( args: &[&'a dyn Array], op: F, name: &str, -) -> Result> +) -> Result> where R: AsRef, - O: StringOffsetSizeTrait, - T: StringOffsetSizeTrait, + O: Offset, + T: Offset, F: Fn(&'a str) -> R, { if args.len() != 1 { @@ -174,7 +176,7 @@ where /// Returns the numeric code of the first character of the argument. /// ascii('x') = 120 -pub fn ascii(args: &[ArrayRef]) -> Result { +pub fn ascii(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let result = string_array @@ -192,7 +194,7 @@ pub fn ascii(args: &[ArrayRef]) -> Result { /// Removes the longest string containing only characters in characters (a space by default) from the start and end of string. 
/// btrim('xyxtrimyyx', 'xyz') = 'trim' -pub fn btrim(args: &[ArrayRef]) -> Result { +pub fn btrim(args: &[ArrayRef]) -> Result { match args.len() { 1 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -204,7 +206,7 @@ pub fn btrim(args: &[ArrayRef]) -> Result { string.trim_start_matches(' ').trim_end_matches(' ') }) }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -227,7 +229,7 @@ pub fn btrim(args: &[ArrayRef]) -> Result { ) } }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -246,15 +248,15 @@ pub fn chr(args: &[ArrayRef]) -> Result { // first map is the iterator, second is for the `Option<_>` let result = integer_array .iter() - .map(|integer: Option| { + .map(|integer| { integer .map(|integer| { - if integer == 0 { + if *integer == 0 { Err(DataFusionError::Execution( "null character not permitted.".to_string(), )) } else { - match core::char::from_u32(integer as u32) { + match core::char::from_u32(*integer as u32) { Some(integer) => Ok(integer.to_string()), None => Err(DataFusionError::Execution( "requested character too large for encoding.".to_string(), @@ -307,7 +309,7 @@ pub fn concat(args: &[ColumnarValue]) -> Result { } Some(owned_string) }) - .collect::(); + .collect::>(); Ok(ColumnarValue::Array(Arc::new(result))) } else { @@ -370,7 +372,7 @@ pub fn concat_ws(args: &[ArrayRef]) -> Result { /// Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. /// initcap('hi THOMAS') = 'Hi Thomas' -pub fn initcap(args: &[ArrayRef]) -> Result { +pub fn initcap(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); // first map is the iterator, second is for the `Option<_>` @@ -393,7 +395,7 @@ pub fn initcap(args: &[ArrayRef]) -> Result char_vector.iter().collect::() }) }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -406,7 +408,7 @@ pub fn lower(args: &[ColumnarValue]) -> Result { /// Removes the longest string containing only characters in characters (a space by default) from the start of string. /// ltrim('zzzytest', 'xyz') = 'test' -pub fn ltrim(args: &[ArrayRef]) -> Result { +pub fn ltrim(args: &[ArrayRef]) -> Result { match args.len() { 1 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -414,7 +416,7 @@ pub fn ltrim(args: &[ArrayRef]) -> Result { let result = string_array .iter() .map(|string| string.map(|string: &str| string.trim_start_matches(' '))) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -432,7 +434,7 @@ pub fn ltrim(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -445,7 +447,7 @@ pub fn ltrim(args: &[ArrayRef]) -> Result { /// Repeats string the specified number of times. 
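The initcap kernel above walks the characters of each value and uppercases the first alphanumeric character of every word while lowercasing the rest. A plain-Rust sketch of that per-string logic (ASCII case mapping only, for brevity; the kernel operates on whole arrays and uses Unicode case conversion):

```rust
/// Uppercase the first alphanumeric character of every word, lowercase the rest.
fn initcap(s: &str) -> String {
    let mut previous_is_alphanumeric = false;
    s.chars()
        .map(|c| {
            let out = if previous_is_alphanumeric {
                c.to_ascii_lowercase()
            } else {
                c.to_ascii_uppercase()
            };
            // A word boundary is any non-alphanumeric character.
            previous_is_alphanumeric = c.is_alphanumeric();
            out
        })
        .collect()
}

fn main() {
    assert_eq!(initcap("hi THOMAS"), "Hi Thomas");
    println!("{}", initcap("hi THOMAS"));
}
```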
/// repeat('Pg', 4) = 'PgPgPgPg' -pub fn repeat(args: &[ArrayRef]) -> Result { +pub fn repeat(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let number_array = downcast_arg!(args[1], "number", Int64Array); @@ -453,17 +455,17 @@ pub fn repeat(args: &[ArrayRef]) -> Result { .iter() .zip(number_array.iter()) .map(|(string, number)| match (string, number) { - (Some(string), Some(number)) => Some(string.repeat(number as usize)), + (Some(string), Some(number)) => Some(string.repeat(*number as usize)), _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Replaces all occurrences in string of substring from with substring to. /// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef' -pub fn replace(args: &[ArrayRef]) -> Result { +pub fn replace(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let from_array = downcast_string_arg!(args[1], "from", T); let to_array = downcast_string_arg!(args[2], "to", T); @@ -476,14 +478,14 @@ pub fn replace(args: &[ArrayRef]) -> Result (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Removes the longest string containing only characters in characters (a space by default) from the end of string. /// rtrim('testxxzx', 'xyz') = 'test' -pub fn rtrim(args: &[ArrayRef]) -> Result { +pub fn rtrim(args: &[ArrayRef]) -> Result { match args.len() { 1 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -491,7 +493,7 @@ pub fn rtrim(args: &[ArrayRef]) -> Result { let result = string_array .iter() .map(|string| string.map(|string: &str| string.trim_end_matches(' '))) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -509,7 +511,7 @@ pub fn rtrim(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -522,7 +524,7 @@ pub fn rtrim(args: &[ArrayRef]) -> Result { /// Splits string at occurrences of delimiter and returns the n'th field (counting from one). /// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def' -pub fn split_part(args: &[ArrayRef]) -> Result { +pub fn split_part(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let delimiter_array = downcast_string_arg!(args[1], "delimiter", T); let n_array = downcast_arg!(args[2], "n", Int64Array); @@ -533,13 +535,13 @@ pub fn split_part(args: &[ArrayRef]) -> Result { - if n <= 0 { + if *n <= 0 { Err(DataFusionError::Execution( "field position must be greater than zero".to_string(), )) } else { let split_string: Vec<&str> = string.split(delimiter).collect(); - match split_string.get(n as usize - 1) { + match split_string.get(*n as usize - 1) { Some(s) => Ok(Some(*s)), None => Ok(Some("")), } @@ -547,14 +549,14 @@ pub fn split_part(args: &[ArrayRef]) -> Result Ok(None), }) - .collect::>>()?; + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } /// Returns true if string starts with prefix. 
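split_part treats the field number as 1-based, returns an empty string when the requested field is past the end, and rejects non-positive positions with an execution error. A standalone sketch of that per-value behaviour, with a plain `String` error standing in for the array-level error plumbing:

```rust
/// Return the n'th field (1-based) of `string` split on `delimiter`.
/// Past-the-end fields yield an empty string; non-positive n is an error.
fn split_part(string: &str, delimiter: &str, n: i64) -> Result<String, String> {
    if n <= 0 {
        return Err("field position must be greater than zero".to_string());
    }
    let fields: Vec<&str> = string.split(delimiter).collect();
    Ok(fields.get(n as usize - 1).copied().unwrap_or("").to_string())
}

fn main() {
    assert_eq!(split_part("abc~@~def~@~ghi", "~@~", 2).unwrap(), "def");
    assert_eq!(split_part("abc~@~def~@~ghi", "~@~", 9).unwrap(), "");
    assert!(split_part("abc", "~@~", 0).is_err());
}
```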
/// starts_with('alphabet', 'alph') = 't' -pub fn starts_with(args: &[ArrayRef]) -> Result { +pub fn starts_with(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let prefix_array = downcast_string_arg!(args[1], "prefix", T); @@ -572,10 +574,7 @@ pub fn starts_with(args: &[ArrayRef]) -> Result(args: &[ArrayRef]) -> Result -where - T::Native: StringOffsetSizeTrait, -{ +pub fn to_hex(args: &[ArrayRef]) -> Result { let integer_array = downcast_primitive_array_arg!(args[0], "integer", T); let result = integer_array @@ -583,7 +582,7 @@ where .map(|integer| { integer.map(|integer| format!("{:x}", integer.to_usize().unwrap())) }) - .collect::>(); + .collect::(); Ok(Arc::new(result) as ArrayRef) } diff --git a/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs index 06d3739b53b27..6b64a75e8207f 100644 --- a/datafusion/src/physical_plan/type_coercion.rs +++ b/datafusion/src/physical_plan/type_coercion.rs @@ -31,7 +31,7 @@ use std::{sync::Arc, vec}; -use arrow::datatypes::{DataType, Schema, TimeUnit}; +use arrow2::datatypes::{DataType, Schema, TimeUnit}; use super::{functions::Signature, PhysicalExpr}; use crate::error::{DataFusionError, Result}; @@ -212,7 +212,7 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { mod tests { use super::*; use crate::physical_plan::expressions::col; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow2::datatypes::{DataType, Field, Schema}; #[test] fn test_maybe_data_types() { diff --git a/datafusion/src/physical_plan/udaf.rs b/datafusion/src/physical_plan/udaf.rs index f7515d326d0a5..c50e1991c8153 100644 --- a/datafusion/src/physical_plan/udaf.rs +++ b/datafusion/src/physical_plan/udaf.rs @@ -21,7 +21,7 @@ use fmt::{Debug, Formatter}; use std::any::Any; use std::fmt; -use arrow::{ +use arrow2::{ datatypes::Field, datatypes::{DataType, Schema}, }; diff --git a/datafusion/src/physical_plan/udf.rs b/datafusion/src/physical_plan/udf.rs index a79c0a8a36059..78f9f018cb9ca 100644 --- a/datafusion/src/physical_plan/udf.rs +++ b/datafusion/src/physical_plan/udf.rs @@ -20,7 +20,7 @@ use fmt::{Debug, Formatter}; use std::fmt; -use arrow::datatypes::Schema; +use arrow2::datatypes::Schema; use crate::error::Result; use crate::{logical_plan::Expr, physical_plan::PhysicalExpr}; diff --git a/datafusion/src/physical_plan/unicode_expressions.rs b/datafusion/src/physical_plan/unicode_expressions.rs index 3852fd7c931fa..00ac6ed93abb2 100644 --- a/datafusion/src/physical_plan/unicode_expressions.rs +++ b/datafusion/src/physical_plan/unicode_expressions.rs @@ -25,25 +25,23 @@ use std::any::type_name; use std::cmp::Ordering; use std::sync::Arc; -use crate::error::{DataFusionError, Result}; -use arrow::{ - array::{ - ArrayRef, GenericStringArray, Int64Array, PrimitiveArray, StringOffsetSizeTrait, - }, - datatypes::{ArrowNativeType, ArrowPrimitiveType}, -}; +use arrow2::array::*; use hashbrown::HashMap; use unicode_segmentation::UnicodeSegmentation; +use crate::error::{DataFusionError, Result}; + +type ArrayRef = Arc; + macro_rules! downcast_string_arg { ($ARG:expr, $NAME:expr, $T:ident) => {{ $ARG.as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal(format!( "could not cast {} to {}", $NAME, - type_name::>() + type_name::>() )) })? }}; @@ -63,41 +61,38 @@ macro_rules! downcast_arg { /// Returns number of characters in the string. 
/// character_length('josé') = 4 -pub fn character_length(args: &[ArrayRef]) -> Result -where - T::Native: StringOffsetSizeTrait, -{ - let string_array: &GenericStringArray = args[0] - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - DataFusionError::Internal("could not cast string to StringArray".to_string()) - })?; - - let result = string_array - .iter() - .map(|string| { - string.map(|string: &str| { - T::Native::from_usize(string.graphemes(true).count()).expect( - "should not fail as graphemes.count will always return integer", +pub fn character_length(args: &[ArrayRef]) -> Result { + let string_array = + args[0] + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + DataFusionError::Internal( + "could not cast string to StringArray".to_string(), ) - }) + })?; + + let iter = string_array.iter().map(|string| { + string.map(|string: &str| { + O::from_usize(string.graphemes(true).count()) + .expect("should not fail as graphemes.count will always return integer") }) - .collect::>(); + }); + let result = Primitive::::from_trusted_len_iter(iter).to(O::DATA_TYPE); Ok(Arc::new(result) as ArrayRef) } /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. /// left('abcde', 2) = 'ab' -pub fn left(args: &[ArrayRef]) -> Result { +pub fn left(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let n_array = downcast_arg!(args[1], "n", Int64Array); let result = string_array .iter() .zip(n_array.iter()) .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => match n.cmp(&0) { + (Some(string), Some(&n)) => match n.cmp(&0) { Ordering::Less => { let graphemes = string.graphemes(true); let len = graphemes.clone().count() as i64; @@ -116,14 +111,14 @@ pub fn left(args: &[ArrayRef]) -> Result { }, _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). /// lpad('hi', 5, 'xy') = 'xyxhi' -pub fn lpad(args: &[ArrayRef]) -> Result { +pub fn lpad(args: &[ArrayRef]) -> Result { match args.len() { 2 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -134,7 +129,7 @@ pub fn lpad(args: &[ArrayRef]) -> Result { .zip(length_array.iter()) .map(|(string, length)| match (string, length) { (Some(string), Some(length)) => { - let length = length as usize; + let length = *length as usize; if length == 0 { Some("".to_string()) } else { @@ -153,7 +148,7 @@ pub fn lpad(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -167,7 +162,7 @@ pub fn lpad(args: &[ArrayRef]) -> Result { .zip(length_array.iter()) .zip(fill_array.iter()) .map(|((string, length), fill)| match (string, length, fill) { - (Some(string), Some(length), Some(fill)) => { + (Some(string), Some(&length), Some(fill)) => { let length = length as usize; if length == 0 { @@ -199,7 +194,7 @@ pub fn lpad(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -212,7 +207,7 @@ pub fn lpad(args: &[ArrayRef]) -> Result { /// Reverses the order of the characters in the string. 
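character_length, left and lpad all count grapheme clusters rather than bytes or code points, which is why they go through `unicode_segmentation`. The sketch below shows the same counting and slicing rules applied to single strings; it needs only the `unicode-segmentation` crate these kernels already depend on.

```rust
use unicode_segmentation::UnicodeSegmentation;

/// Count user-visible characters (grapheme clusters), not bytes.
fn character_length(s: &str) -> usize {
    s.graphemes(true).count()
}

/// First n graphemes; a negative n means "all but the last |n|".
fn left(s: &str, n: i64) -> String {
    if n >= 0 {
        s.graphemes(true).take(n as usize).collect()
    } else {
        let len = character_length(s) as i64;
        let keep = len.saturating_add(n).max(0) as usize;
        s.graphemes(true).take(keep).collect()
    }
}

fn main() {
    assert_eq!(character_length("josé"), 4);
    assert_eq!(left("abcde", 2), "ab");
    assert_eq!(left("abcde", -2), "abc");
}
```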
/// reverse('abcde') = 'edcba' -pub fn reverse(args: &[ArrayRef]) -> Result { +pub fn reverse(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let result = string_array @@ -220,14 +215,14 @@ pub fn reverse(args: &[ArrayRef]) -> Result .map(|string| { string.map(|string: &str| string.graphemes(true).rev().collect::()) }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. /// right('abcde', 2) = 'de' -pub fn right(args: &[ArrayRef]) -> Result { +pub fn right(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let n_array = downcast_arg!(args[1], "n", Int64Array); @@ -258,7 +253,7 @@ pub fn right(args: &[ArrayRef]) -> Result { string .graphemes(true) .rev() - .take(n as usize) + .take(*n as usize) .collect::>() .iter() .rev() @@ -268,14 +263,14 @@ pub fn right(args: &[ArrayRef]) -> Result { }, _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. /// rpad('hi', 5, 'xy') = 'hixyx' -pub fn rpad(args: &[ArrayRef]) -> Result { +pub fn rpad(args: &[ArrayRef]) -> Result { match args.len() { 2 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -285,7 +280,7 @@ pub fn rpad(args: &[ArrayRef]) -> Result { .iter() .zip(length_array.iter()) .map(|(string, length)| match (string, length) { - (Some(string), Some(length)) => { + (Some(string), Some(&length)) => { let length = length as usize; if length == 0 { Some("".to_string()) @@ -302,7 +297,7 @@ pub fn rpad(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -316,7 +311,7 @@ pub fn rpad(args: &[ArrayRef]) -> Result { .zip(length_array.iter()) .zip(fill_array.iter()) .map(|((string, length), fill)| match (string, length, fill) { - (Some(string), Some(length), Some(fill)) => { + (Some(string), Some(&length), Some(fill)) => { let length = length as usize; let graphemes = string.graphemes(true).collect::>(); let fill_chars = fill.chars().collect::>(); @@ -339,7 +334,7 @@ pub fn rpad(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -352,20 +347,17 @@ pub fn rpad(args: &[ArrayRef]) -> Result { /// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.) 
/// strpos('high', 'ig') = 2 -pub fn strpos(args: &[ArrayRef]) -> Result -where - T::Native: StringOffsetSizeTrait, -{ - let string_array: &GenericStringArray = args[0] +pub fn strpos(args: &[ArrayRef]) -> Result { + let string_array: &Utf8Array = args[0] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal("could not cast string to StringArray".to_string()) })?; - let substring_array: &GenericStringArray = args[1] + let substring_array: &Utf8Array = args[1] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal( "could not cast substring to StringArray".to_string(), @@ -381,7 +373,7 @@ where // this method first finds the matching byte using rfind // then maps that to the character index by matching on the grapheme_index of the byte_index Some( - T::Native::from_usize(string.to_string().rfind(substring).map_or( + T::from_usize(string.to_string().rfind(substring).map_or( 0, |byte_offset| { string @@ -411,7 +403,7 @@ where /// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) /// substr('alphabet', 3) = 'phabet' /// substr('alphabet', 3, 2) = 'ph' -pub fn substr(args: &[ArrayRef]) -> Result { +pub fn substr(args: &[ArrayRef]) -> Result { match args.len() { 2 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -421,7 +413,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { .iter() .zip(start_array.iter()) .map(|(string, start)| match (string, start) { - (Some(string), Some(start)) => { + (Some(string), Some(&start)) => { if start <= 0 { Some(string.to_string()) } else { @@ -436,7 +428,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -450,7 +442,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { .zip(start_array.iter()) .zip(count_array.iter()) .map(|((string, start), count)| match (string, start, count) { - (Some(string), Some(start), Some(count)) => { + (Some(string), Some(&start), Some(&count)) => { if count < 0 { Err(DataFusionError::Execution( "negative substring length not allowed".to_string(), @@ -475,7 +467,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { } _ => Ok(None), }) - .collect::>>()?; + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } @@ -488,7 +480,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { /// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. 
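The two-argument substr above interprets the start position as 1-based and counted in graphemes, and treats a non-positive start as "from the beginning". A per-string sketch of that rule, using the same `unicode-segmentation` call as the kernel:

```rust
use unicode_segmentation::UnicodeSegmentation;

/// 1-based, grapheme-counted start position; non-positive start returns the whole string.
fn substr(s: &str, start: i64) -> String {
    if start <= 0 {
        s.to_string()
    } else {
        s.graphemes(true).skip(start as usize - 1).collect()
    }
}

fn main() {
    assert_eq!(substr("alphabet", 3), "phabet");
    assert_eq!(substr("alphabet", 0), "alphabet");
}
```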
/// translate('12345', '143', 'ax') = 'a2x5' -pub fn translate(args: &[ArrayRef]) -> Result { +pub fn translate(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let from_array = downcast_string_arg!(args[1], "from", T); let to_array = downcast_string_arg!(args[2], "to", T); @@ -525,7 +517,7 @@ pub fn translate(args: &[ArrayRef]) -> Result None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } diff --git a/datafusion/src/physical_plan/union.rs b/datafusion/src/physical_plan/union.rs index cbab728a8428b..836045354821e 100644 --- a/datafusion/src/physical_plan/union.rs +++ b/datafusion/src/physical_plan/union.rs @@ -23,7 +23,8 @@ use std::{any::Any, sync::Arc}; -use arrow::datatypes::SchemaRef; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; use super::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; use crate::error::Result; @@ -104,7 +105,7 @@ mod tests { csv::{CsvExec, CsvReadOptions}, }; use crate::test; - use arrow::record_batch::RecordBatch; + use arrow2::record_batch::RecordBatch; #[tokio::test] async fn test_union_partitions() -> Result<()> { diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index e59d21e7fcef0..92621ee0b60c0 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -19,24 +19,12 @@ use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; -use arrow::datatypes::{ArrowDictionaryKeyType, DataType, Field, IntervalUnit, TimeUnit}; -use arrow::{ - array::*, - datatypes::{ - ArrowNativeType, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, - TimestampNanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, - }, -}; -use arrow::{ - array::{ - ArrayRef, Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, - }, - datatypes::{ - TimestampMicrosecondType, TimestampMillisecondType, TimestampSecondType, - }, -}; +use arrow2::datatypes::{DataType, IntervalUnit, TimeUnit}; +use arrow2::{array::*, buffer::MutableBuffer, types::days_ms}; + +type ArrayRef = Arc; +type StringArray = Utf8Array; +type LargeStringArray = Utf8Array; use crate::error::{DataFusionError, Result}; @@ -75,7 +63,9 @@ pub enum ScalarValue { /// large binary LargeBinary(Option>), /// list of nested ScalarValue - List(Option>, DataType), + // 1st argument are the inner values + // 2st argument is datatype (i.e. it includes `Field`) + List(Option>, DataType), /// Date stored as a signed 32bit int Date32(Option), /// Date stored as a signed 64bit int @@ -91,7 +81,7 @@ pub enum ScalarValue { /// Interval with YearMonth unit IntervalYearMonth(Option), /// Interval with DayTime unit - IntervalDayTime(Option), + IntervalDayTime(Option), } macro_rules! typed_cast { @@ -104,91 +94,14 @@ macro_rules! typed_cast { }}; } -macro_rules! build_list { - ($VALUE_BUILDER_TY:ident, $SCALAR_TY:ident, $VALUES:expr, $SIZE:expr) => {{ - match $VALUES { - // the return on the macro is necessary, to short-circuit and return ArrayRef - None => { - return new_null_array( - &DataType::List(Box::new(Field::new( - "item", - DataType::$SCALAR_TY, - true, - ))), - $SIZE, - ) - } - Some(values) => { - build_values_list!($VALUE_BUILDER_TY, $SCALAR_TY, values, $SIZE) - } - } - }}; -} - -macro_rules! 
build_timestamp_list { - ($TIME_UNIT:expr, $TIME_ZONE:expr, $VALUES:expr, $SIZE:expr) => {{ - match $VALUES { - // the return on the macro is necessary, to short-circuit and return ArrayRef - None => { - return new_null_array( - &DataType::List(Box::new(Field::new( - "item", - DataType::Timestamp($TIME_UNIT, $TIME_ZONE), - true, - ))), - $SIZE, - ) - } - Some(values) => match $TIME_UNIT { - TimeUnit::Second => build_values_list!( - TimestampSecondBuilder, - TimestampSecond, - values, - $SIZE - ), - TimeUnit::Microsecond => build_values_list!( - TimestampMillisecondBuilder, - TimestampMillisecond, - values, - $SIZE - ), - TimeUnit::Millisecond => build_values_list!( - TimestampMicrosecondBuilder, - TimestampMicrosecond, - values, - $SIZE - ), - TimeUnit::Nanosecond => build_values_list!( - TimestampNanosecondBuilder, - TimestampNanosecond, - values, - $SIZE - ), - }, - } - }}; -} - -macro_rules! build_values_list { - ($VALUE_BUILDER_TY:ident, $SCALAR_TY:ident, $VALUES:expr, $SIZE:expr) => {{ - let mut builder = ListBuilder::new($VALUE_BUILDER_TY::new($VALUES.len())); - - for _ in 0..$SIZE { - for scalar_value in $VALUES { - match scalar_value { - ScalarValue::$SCALAR_TY(Some(v)) => { - builder.values().append_value(v.clone()).unwrap() - } - ScalarValue::$SCALAR_TY(None) => { - builder.values().append_null().unwrap(); - } - _ => panic!("Incompatible ScalarValue for list"), - }; - } - builder.append(true).unwrap(); - } - - builder.finish() +macro_rules! dyn_to_array { + ($self:expr, $value:expr, $size:expr, $ty:ty) => {{ + Arc::new(PrimitiveArray::<$ty>::from_data( + $self.get_datatype(), + MutableBuffer::<$ty>::from_trusted_len_iter(repeat(*$value).take($size)) + .into(), + None, + )) }}; } @@ -223,9 +136,7 @@ impl ScalarValue { ScalarValue::LargeUtf8(_) => DataType::LargeUtf8, ScalarValue::Binary(_) => DataType::Binary, ScalarValue::LargeBinary(_) => DataType::LargeBinary, - ScalarValue::List(_, data_type) => { - DataType::List(Box::new(Field::new("item", data_type.clone(), true))) - } + ScalarValue::List(_, data_type) => data_type.clone(), ScalarValue::Date32(_) => DataType::Date32, ScalarValue::Date64(_) => DataType::Date64, ScalarValue::IntervalYearMonth(_) => { @@ -290,151 +201,107 @@ impl ScalarValue { Arc::new(BooleanArray::from(vec![*e; size])) as ArrayRef } ScalarValue::Float64(e) => match e { - Some(value) => Arc::new(Float64Array::from_value(*value, size)), - None => new_null_array(&DataType::Float64, size), + Some(value) => dyn_to_array!(self, value, size, f64), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::Float32(e) => match e { - Some(value) => Arc::new(Float32Array::from_value(*value, size)), - None => new_null_array(&DataType::Float32, size), + Some(value) => dyn_to_array!(self, value, size, f32), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::Int8(e) => match e { - Some(value) => Arc::new(Int8Array::from_value(*value, size)), - None => new_null_array(&DataType::Int8, size), + Some(value) => dyn_to_array!(self, value, size, i8), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::Int16(e) => match e { - Some(value) => Arc::new(Int16Array::from_value(*value, size)), - None => new_null_array(&DataType::Int16, size), + Some(value) => dyn_to_array!(self, value, size, i16), + None => new_null_array(self.get_datatype(), size).into(), }, - ScalarValue::Int32(e) => match e { - Some(value) => Arc::new(Int32Array::from_value(*value, size)), - None => new_null_array(&DataType::Int32, size), + 
ScalarValue::Int32(e) + | ScalarValue::Date32(e) + | ScalarValue::IntervalYearMonth(e) => match e { + Some(value) => dyn_to_array!(self, value, size, i32), + None => new_null_array(self.get_datatype(), size).into(), }, - ScalarValue::Int64(e) => match e { - Some(value) => Arc::new(Int64Array::from_value(*value, size)), - None => new_null_array(&DataType::Int64, size), + ScalarValue::Int64(e) + | ScalarValue::Date64(e) + | ScalarValue::TimestampSecond(e) + | ScalarValue::TimestampMillisecond(e) + | ScalarValue::TimestampMicrosecond(e) + | ScalarValue::TimestampNanosecond(e) => match e { + Some(value) => dyn_to_array!(self, value, size, i64), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::UInt8(e) => match e { - Some(value) => Arc::new(UInt8Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt8, size), + Some(value) => dyn_to_array!(self, value, size, u8), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::UInt16(e) => match e { - Some(value) => Arc::new(UInt16Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt16, size), + Some(value) => dyn_to_array!(self, value, size, u16), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::UInt32(e) => match e { - Some(value) => Arc::new(UInt32Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt32, size), + Some(value) => dyn_to_array!(self, value, size, u32), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::UInt64(e) => match e { - Some(value) => Arc::new(UInt64Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt64, size), - }, - ScalarValue::TimestampSecond(e) => match e { - Some(value) => Arc::new(TimestampSecondArray::from_iter_values( - repeat(*value).take(size), - )), - None => { - new_null_array(&DataType::Timestamp(TimeUnit::Second, None), size) - } - }, - ScalarValue::TimestampMillisecond(e) => match e { - Some(value) => Arc::new(TimestampMillisecondArray::from_iter_values( - repeat(*value).take(size), - )), - None => new_null_array( - &DataType::Timestamp(TimeUnit::Millisecond, None), - size, - ), - }, - ScalarValue::TimestampMicrosecond(e) => match e { - Some(value) => { - Arc::new(TimestampMicrosecondArray::from_value(*value, size)) - } - None => new_null_array( - &DataType::Timestamp(TimeUnit::Microsecond, None), - size, - ), - }, - ScalarValue::TimestampNanosecond(e) => match e { - Some(value) => { - Arc::new(TimestampNanosecondArray::from_value(*value, size)) - } - None => { - new_null_array(&DataType::Timestamp(TimeUnit::Nanosecond, None), size) - } + Some(value) => dyn_to_array!(self, value, size, u64), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::Utf8(e) => match e { Some(value) => { - Arc::new(StringArray::from_iter_values(repeat(value).take(size))) + Arc::new(repeat(Some(&value)).take(size).collect::>()) } - None => new_null_array(&DataType::Utf8, size), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::LargeUtf8(e) => match e { Some(value) => { - Arc::new(LargeStringArray::from_iter_values(repeat(value).take(size))) + Arc::new(repeat(Some(&value)).take(size).collect::>()) } - None => new_null_array(&DataType::LargeUtf8, size), + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::Binary(e) => match e { Some(value) => Arc::new( repeat(Some(value.as_slice())) .take(size) - .collect::(), + .collect::>(), ), - None => { - 
Arc::new(repeat(None::<&str>).take(size).collect::()) - } + None => new_null_array(self.get_datatype(), size).into(), }, ScalarValue::LargeBinary(e) => match e { Some(value) => Arc::new( repeat(Some(value.as_slice())) .take(size) - .collect::(), - ), - None => Arc::new( - repeat(None::<&str>) - .take(size) - .collect::(), + .collect::>(), ), + None => new_null_array(self.get_datatype(), size).into(), }, - ScalarValue::List(values, data_type) => Arc::new(match data_type { - DataType::Boolean => build_list!(BooleanBuilder, Boolean, values, size), - DataType::Int8 => build_list!(Int8Builder, Int8, values, size), - DataType::Int16 => build_list!(Int16Builder, Int16, values, size), - DataType::Int32 => build_list!(Int32Builder, Int32, values, size), - DataType::Int64 => build_list!(Int64Builder, Int64, values, size), - DataType::UInt8 => build_list!(UInt8Builder, UInt8, values, size), - DataType::UInt16 => build_list!(UInt16Builder, UInt16, values, size), - DataType::UInt32 => build_list!(UInt32Builder, UInt32, values, size), - DataType::UInt64 => build_list!(UInt64Builder, UInt64, values, size), - DataType::Utf8 => build_list!(StringBuilder, Utf8, values, size), - DataType::Float32 => build_list!(Float32Builder, Float32, values, size), - DataType::Float64 => build_list!(Float64Builder, Float64, values, size), - DataType::Timestamp(unit, tz) => { - build_timestamp_list!(unit.clone(), tz.clone(), values, size) - } - DataType::LargeUtf8 => { - build_list!(LargeStringBuilder, LargeUtf8, values, size) + ScalarValue::List(values, data_type) => { + if let Some(values) = values { + let length = values.len(); + let refs = std::iter::repeat(values.as_ref()) + .take(size) + .collect::>(); + let values = + arrow2::compute::concat::concatenate(&refs).unwrap().into(); + let offsets: arrow2::buffer::Buffer = + (0..=size).map(|i| (i * length) as i32).collect(); + Arc::new(ListArray::::from_data( + data_type.clone(), + offsets, + values, + None, + )) + } else { + new_null_array(self.get_datatype(), size).into() } - dt => panic!("Unexpected DataType for list {:?}", dt), - }), - ScalarValue::Date32(e) => match e { - Some(value) => Arc::new(Date32Array::from_value(*value, size)), - None => new_null_array(&DataType::Date32, size), - }, - ScalarValue::Date64(e) => match e { - Some(value) => Arc::new(Date64Array::from_value(*value, size)), - None => new_null_array(&DataType::Date64, size), - }, + } ScalarValue::IntervalDayTime(e) => match e { - Some(value) => Arc::new(IntervalDayTimeArray::from_value(*value, size)), - None => new_null_array(&DataType::Interval(IntervalUnit::DayTime), size), - }, - ScalarValue::IntervalYearMonth(e) => match e { - Some(value) => Arc::new(IntervalYearMonthArray::from_value(*value, size)), - None => { - new_null_array(&DataType::Interval(IntervalUnit::YearMonth), size) + Some(value) => { + Arc::new(PrimitiveArray::::from_trusted_len_values_iter( + std::iter::repeat(*value).take(size), + )) } + None => new_null_array(self.get_datatype(), size).into(), }, } } @@ -456,67 +323,45 @@ impl ScalarValue { DataType::Utf8 => typed_cast!(array, index, StringArray, Utf8), DataType::LargeUtf8 => typed_cast!(array, index, LargeStringArray, LargeUtf8), DataType::List(nested_type) => { - let list_array = - array.as_any().downcast_ref::().ok_or_else(|| { + let list_array = array + .as_any() + .downcast_ref::>() + .ok_or_else(|| { DataFusionError::Internal( "Failed to downcast ListArray".to_string(), ) })?; - let value = match list_array.is_null(index) { - true => None, - false => { - let nested_array = 
list_array.value(index); - let scalar_vec = (0..nested_array.len()) - .map(|i| ScalarValue::try_from_array(&nested_array, i)) - .collect::>>()?; - Some(scalar_vec) - } - }; - ScalarValue::List(value, nested_type.data_type().clone()) + let is_valid = list_array.is_valid(index); + let value = list_array.value(index).into(); + ScalarValue::List(Some(value), nested_type.data_type().clone()) } DataType::Date32 => { - typed_cast!(array, index, Date32Array, Date32) + typed_cast!(array, index, Int32Array, Date32) } DataType::Date64 => { - typed_cast!(array, index, Date64Array, Date64) + typed_cast!(array, index, Int64Array, Date64) } DataType::Timestamp(TimeUnit::Second, _) => { - typed_cast!(array, index, TimestampSecondArray, TimestampSecond) + typed_cast!(array, index, Int64Array, TimestampSecond) } DataType::Timestamp(TimeUnit::Millisecond, _) => { - typed_cast!( - array, - index, - TimestampMillisecondArray, - TimestampMillisecond - ) + typed_cast!(array, index, Int64Array, TimestampMillisecond) } DataType::Timestamp(TimeUnit::Microsecond, _) => { - typed_cast!( - array, - index, - TimestampMicrosecondArray, - TimestampMicrosecond - ) + typed_cast!(array, index, Int64Array, TimestampMicrosecond) } DataType::Timestamp(TimeUnit::Nanosecond, _) => { - typed_cast!(array, index, TimestampNanosecondArray, TimestampNanosecond) + typed_cast!(array, index, Int64Array, TimestampNanosecond) } DataType::Dictionary(index_type, _) => match **index_type { - DataType::Int8 => Self::try_from_dict_array::(array, index)?, - DataType::Int16 => Self::try_from_dict_array::(array, index)?, - DataType::Int32 => Self::try_from_dict_array::(array, index)?, - DataType::Int64 => Self::try_from_dict_array::(array, index)?, - DataType::UInt8 => Self::try_from_dict_array::(array, index)?, - DataType::UInt16 => { - Self::try_from_dict_array::(array, index)? - } - DataType::UInt32 => { - Self::try_from_dict_array::(array, index)? - } - DataType::UInt64 => { - Self::try_from_dict_array::(array, index)? 
- } + DataType::Int8 => Self::try_from_dict_array::(array, index)?, + DataType::Int16 => Self::try_from_dict_array::(array, index)?, + DataType::Int32 => Self::try_from_dict_array::(array, index)?, + DataType::Int64 => Self::try_from_dict_array::(array, index)?, + DataType::UInt8 => Self::try_from_dict_array::(array, index)?, + DataType::UInt16 => Self::try_from_dict_array::(array, index)?, + DataType::UInt32 => Self::try_from_dict_array::(array, index)?, + DataType::UInt64 => Self::try_from_dict_array::(array, index)?, _ => { return Err(DataFusionError::Internal(format!( "Index type not supported while creating scalar from dictionary: {}", @@ -533,14 +378,14 @@ impl ScalarValue { }) } - fn try_from_dict_array( + fn try_from_dict_array( array: &ArrayRef, index: usize, ) -> Result { let dict_array = array.as_any().downcast_ref::>().unwrap(); // look up the index in the values dictionary - let keys_col = dict_array.keys_array(); + let keys_col = dict_array.keys(); let values_index = keys_col.value(index).to_usize().ok_or_else(|| { DataFusionError::Internal(format!( "Can not convert index to usize in dictionary of type creating group by value {:?}", @@ -551,71 +396,33 @@ impl ScalarValue { } } -impl From for ScalarValue { - fn from(value: f64) -> Self { - ScalarValue::Float64(Some(value)) - } -} - -impl From for ScalarValue { - fn from(value: f32) -> Self { - ScalarValue::Float32(Some(value)) - } -} - -impl From for ScalarValue { - fn from(value: i8) -> Self { - ScalarValue::Int8(Some(value)) - } -} - -impl From for ScalarValue { - fn from(value: i16) -> Self { - ScalarValue::Int16(Some(value)) - } -} - -impl From for ScalarValue { - fn from(value: i32) -> Self { - ScalarValue::Int32(Some(value)) - } -} - -impl From for ScalarValue { - fn from(value: i64) -> Self { - ScalarValue::Int64(Some(value)) - } -} - -impl From for ScalarValue { - fn from(value: bool) -> Self { - ScalarValue::Boolean(Some(value)) - } -} - -impl From for ScalarValue { - fn from(value: u8) -> Self { - ScalarValue::UInt8(Some(value)) - } -} - -impl From for ScalarValue { - fn from(value: u16) -> Self { - ScalarValue::UInt16(Some(value)) - } -} +macro_rules! impl_scalar { + ($ty:ty, $scalar:tt) => { + impl From<$ty> for ScalarValue { + fn from(value: $ty) -> Self { + ScalarValue::$scalar(Some(value)) + } + } -impl From for ScalarValue { - fn from(value: u32) -> Self { - ScalarValue::UInt32(Some(value)) - } + impl From> for ScalarValue { + fn from(value: Option<$ty>) -> Self { + ScalarValue::$scalar(value) + } + } + }; } -impl From for ScalarValue { - fn from(value: u64) -> Self { - ScalarValue::UInt64(Some(value)) - } -} +impl_scalar!(f64, Float64); +impl_scalar!(f32, Float32); +impl_scalar!(i8, Int8); +impl_scalar!(i16, Int16); +impl_scalar!(i32, Int32); +impl_scalar!(i64, Int64); +impl_scalar!(bool, Boolean); +impl_scalar!(u8, UInt8); +impl_scalar!(u16, UInt16); +impl_scalar!(u32, UInt32); +impl_scalar!(u64, UInt64); macro_rules! 
impl_try_from { ($SCALAR:ident, $NATIVE:ident) => { @@ -711,9 +518,7 @@ impl TryFrom<&DataType> for ScalarValue { DataType::Timestamp(TimeUnit::Nanosecond, _) => { ScalarValue::TimestampNanosecond(None) } - DataType::List(ref nested_type) => { - ScalarValue::List(None, nested_type.data_type().clone()) - } + DataType::List(ref nested_type) => ScalarValue::List(None, datatype.clone()), _ => { return Err(DataFusionError::NotImplemented(format!( "Can't create a scalar of type \"{:?}\"", @@ -775,17 +580,13 @@ impl fmt::Display for ScalarValue { )?, None => write!(f, "NULL")?, }, - ScalarValue::List(e, _) => match e { - Some(l) => write!( - f, - "{}", - l.iter() - .map(|v| format!("{}", v)) - .collect::>() - .join(",") - )?, - None => write!(f, "NULL")?, - }, + ScalarValue::List(e, _) => { + if let Some(e) = e { + write!(f, "{}", e)? + } else { + write!(f, "NULL")? + } + } ScalarValue::Date32(e) => format_option!(f, e)?, ScalarValue::Date64(e) => format_option!(f, e)?, ScalarValue::IntervalDayTime(e) => format_option!(f, e)?, @@ -840,42 +641,6 @@ impl fmt::Debug for ScalarValue { } } -/// Trait used to map a NativeTime to a ScalarType. -pub trait ScalarType { - /// returns a scalar from an optional T - fn scalar(r: Option) -> ScalarValue; -} - -impl ScalarType for Float32Type { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::Float32(r) - } -} - -impl ScalarType for TimestampSecondType { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimestampSecond(r) - } -} - -impl ScalarType for TimestampMillisecondType { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimestampMillisecond(r) - } -} - -impl ScalarType for TimestampMicrosecondType { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimestampMicrosecond(r) - } -} - -impl ScalarType for TimestampNanosecondType { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimestampNanosecond(r) - } -} - #[cfg(test)] mod tests { use super::*; @@ -883,37 +648,13 @@ mod tests { #[test] fn scalar_list_null_to_array() { let list_array_ref = ScalarValue::List(None, DataType::UInt64).to_array(); - let list_array = list_array_ref.as_any().downcast_ref::().unwrap(); + let list_array = list_array_ref + .as_any() + .downcast_ref::>() + .unwrap(); assert!(list_array.is_null(0)); assert_eq!(list_array.len(), 1); assert_eq!(list_array.values().len(), 0); } - - #[test] - fn scalar_list_to_array() { - let list_array_ref = ScalarValue::List( - Some(vec![ - ScalarValue::UInt64(Some(100)), - ScalarValue::UInt64(None), - ScalarValue::UInt64(Some(101)), - ]), - DataType::UInt64, - ) - .to_array(); - - let list_array = list_array_ref.as_any().downcast_ref::().unwrap(); - assert_eq!(list_array.len(), 1); - assert_eq!(list_array.values().len(), 3); - - let prim_array_ref = list_array.value(0); - let prim_array = prim_array_ref - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(prim_array.len(), 3); - assert_eq!(prim_array.value(0), 100); - assert!(prim_array.is_null(1)); - assert_eq!(prim_array.value(2), 101); - } } diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 34c5901b450a2..7a440aae77931 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -39,7 +39,9 @@ use crate::{ sql::parser::{CreateExternalTable, FileType, Statement as DFStatement}, }; -use arrow::datatypes::*; +use arrow2::datatypes::*; +use arrow2::types::days_ms; + use hashbrown::HashMap; use crate::prelude::JoinType; @@ -1305,7 +1307,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { )))); } - let result: i64 = 
(result_days << 32) | result_millis; + let result = days_ms::new(result_days as i32, result_millis as i32); Ok(Expr::Literal(ScalarValue::IntervalDayTime(Some(result)))) } diff --git a/datafusion/src/test/exec.rs b/datafusion/src/test/exec.rs index 04cd29530c016..3ec49548ea8c4 100644 --- a/datafusion/src/test/exec.rs +++ b/datafusion/src/test/exec.rs @@ -17,13 +17,16 @@ //! Simple iterator over batches for use in testing +use std::sync::Arc; use std::task::{Context, Poll}; -use arrow::{ - datatypes::SchemaRef, error::Result as ArrowResult, record_batch::RecordBatch, +use arrow2::{ + datatypes::Schema, error::Result as ArrowResult, record_batch::RecordBatch, }; use futures::Stream; +type SchemaRef = Arc; + use crate::physical_plan::RecordBatchStream; /// Index into the data that has been returned so far @@ -97,6 +100,6 @@ impl Stream for TestStream { impl RecordBatchStream for TestStream { /// Get the schema fn schema(&self) -> SchemaRef { - self.data[0].schema() + self.data[0].schema().clone() } } diff --git a/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs index 926a692261691..4a2b90d26f5ea 100644 --- a/datafusion/src/test/mod.rs +++ b/datafusion/src/test/mod.rs @@ -17,22 +17,22 @@ //! Common unit test utility methods -use crate::datasource::{MemTable, TableProvider}; -use crate::error::Result; -use crate::logical_plan::{LogicalPlan, LogicalPlanBuilder}; -use array::{ - Array, ArrayRef, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, -}; -use arrow::array::{self, Int32Array}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; use std::fs::File; use std::io::prelude::*; use std::io::{BufReader, BufWriter}; use std::sync::Arc; +use std::{env, error::Error, path::PathBuf}; + use tempfile::TempDir; +use arrow2::array::*; +use arrow2::datatypes::*; +use arrow2::record_batch::RecordBatch; + +use crate::datasource::{MemTable, TableProvider}; +use crate::error::Result; +use crate::logical_plan::{LogicalPlan, LogicalPlanBuilder}; + pub fn create_table_dual() -> Arc { let dual_schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), @@ -41,8 +41,8 @@ pub fn create_table_dual() -> Arc { let batch = RecordBatch::try_new( dual_schema.clone(), vec![ - Arc::new(array::Int32Array::from(vec![1])), - Arc::new(array::StringArray::from(vec!["a"])), + Arc::new(Int32Array::from_slice(&[1])), + Arc::new(Utf8Array::::from_slice(&["a"])), ], ) .unwrap(); @@ -52,7 +52,7 @@ pub fn create_table_dual() -> Arc { /// Generated partitioned copy of a CSV file pub fn create_partitioned_csv(filename: &str, partitions: usize) -> Result { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow_test_data(); let path = format!("{}/csv/{}", testdata, filename); let tmp_dir = TempDir::new()?; @@ -92,7 +92,7 @@ pub fn create_partitioned_csv(filename: &str, partitions: usize) -> Result SchemaRef { +pub fn aggr_test_schema() -> Arc { Arc::new(Schema::new(vec![ Field::new("c1", DataType::Utf8, false), Field::new("c2", DataType::UInt32, false), @@ -145,9 +145,9 @@ pub fn build_table_i32( RecordBatch::try_new( Arc::new(schema), vec![ - Arc::new(Int32Array::from(a.1.clone())), - Arc::new(Int32Array::from(b.1.clone())), - Arc::new(Int32Array::from(c.1.clone())), + Arc::new(Int32Array::from_slice(a.1)), + Arc::new(Int32Array::from_slice(b.1)), + Arc::new(Int32Array::from_slice(c.1)), ], ) .unwrap() @@ -165,11 +165,10 @@ pub fn table_with_sequence( seq_end: i32, ) -> 
Result<Arc<dyn TableProvider>> { let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let arr = Arc::new(Int32Array::from((seq_start..=seq_end).collect::<Vec<i32>>())); - let partitions = vec![vec![RecordBatch::try_new( - schema.clone(), - vec![arr as ArrayRef], - )?]]; + let arr = Arc::new(Int32Array::from_slice( + &(seq_start..=seq_end).collect::<Vec<i32>>(), + )); + let partitions = vec![vec![RecordBatch::try_new(schema.clone(), vec![arr])?]]; Ok(Arc::new(MemTable::try_new(schema, partitions)?)) } @@ -179,8 +178,7 @@ pub fn make_partition(sz: i32) -> RecordBatch { let seq_end = sz; let values = (seq_start..seq_end).collect::<Vec<i32>>(); let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let arr = Arc::new(Int32Array::from(values)); - let arr = arr as ArrayRef; + let arr = Arc::new(Int32Array::from_slice(&values)); RecordBatch::try_new(schema, vec![arr]).unwrap() } @@ -188,7 +186,7 @@ pub fn make_partition(sz: i32) -> RecordBatch { /// Return a new table provider containing all of the supported timestamp types pub fn table_with_timestamps() -> Arc<dyn TableProvider> { let batch = make_timestamps(); - let schema = batch.schema(); + let schema = batch.schema().clone(); let partitions = vec![vec![batch]]; Arc::new(MemTable::try_new(schema, partitions).unwrap()) } @@ -242,13 +240,17 @@ pub fn make_timestamps() -> RecordBatch { .map(|(i, _)| format!("Row {}", i)) .collect::<Vec<_>>(); - let arr_nanos = TimestampNanosecondArray::from_opt_vec(ts_nanos, None); - let arr_micros = TimestampMicrosecondArray::from_opt_vec(ts_micros, None); - let arr_millis = TimestampMillisecondArray::from_opt_vec(ts_millis, None); - let arr_secs = TimestampSecondArray::from_opt_vec(ts_secs, None); + let arr_nanos = Primitive::<i64>::from(ts_nanos) + .to(DataType::Timestamp(TimeUnit::Nanosecond, None)); + let arr_micros = Primitive::<i64>::from(ts_micros) + .to(DataType::Timestamp(TimeUnit::Microsecond, None)); + let arr_millis = Primitive::<i64>::from(ts_millis) + .to(DataType::Timestamp(TimeUnit::Millisecond, None)); + let arr_secs = + Primitive::<i64>::from(ts_secs).to(DataType::Timestamp(TimeUnit::Second, None)); let names = names.iter().map(|s| s.as_str()).collect::<Vec<&str>>(); - let arr_names = StringArray::from(names); + let arr_names = Utf8Array::<i32>::from(&names); let schema = Schema::new(vec![ Field::new("nanos", arr_nanos.data_type().clone(), false), @@ -292,7 +294,7 @@ macro_rules! assert_batches_eq { let expected_lines: Vec<String> = $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); - let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap(); + let formatted = arrow2::util::pretty::pretty_format_batches($CHUNKS).unwrap(); let actual_lines: Vec<&str> = formatted.trim().lines().collect(); @@ -326,7 +328,7 @@ macro_rules! assert_batches_sorted_eq { expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() } - let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap(); + let formatted = arrow2::util::pretty::pretty_format_batches($CHUNKS).unwrap(); // fix for windows: \r\n --> let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); @@ -344,3 +346,75 @@ macro_rules! assert_batches_sorted_eq { ); }; } + +/// Returns the arrow test data directory, which is by default stored +/// in a git submodule rooted at `arrow/testing/data`. +/// +/// The default can be overridden by the optional environment +/// variable `ARROW_TEST_DATA` +/// +/// panics when the directory can not be found. 
+/// +/// Example: +/// ``` +/// let testdata = crate::test::arrow_test_data(); +/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata); +/// assert!(std::path::PathBuf::from(csvdata).exists()); +/// ``` +pub fn arrow_test_data() -> String { + match get_data_dir("ARROW_TEST_DATA", "testing/arrow-testing/data") { + Ok(pb) => pb.display().to_string(), + Err(err) => panic!("failed to get arrow data dir: {}", err), + } +} + +/// Returns a directory path for finding test data. +/// +/// udf_env: name of an environment variable +/// +/// submodule_dir: fallback path (relative to CARGO_MANIFEST_DIR) +/// +/// Returns either: +/// The path referred to in `udf_env` if that variable is set and refers to a directory +/// The submodule_data directory relative to CARGO_MANIFEST_PATH +fn get_data_dir( + udf_env: &str, + submodule_data: &str, +) -> std::result::Result<PathBuf, Box<dyn Error>> { + // Try user defined env. + if let Ok(dir) = env::var(udf_env) { + let trimmed = dir.trim().to_string(); + if !trimmed.is_empty() { + let pb = PathBuf::from(trimmed); + if pb.is_dir() { + return Ok(pb); + } else { + return Err(format!( + "the data dir `{}` defined by env {} not found", + pb.display().to_string(), + udf_env + ) + .into()); + } + } + } + + // The env is undefined or its value is trimmed to empty, let's try default dir. + + // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package", + // set by `cargo run` or `cargo test`, see: + // https://doc.rust-lang.org/cargo/reference/environment-variables.html + let dir = env!("CARGO_MANIFEST_DIR"); + + let pb = PathBuf::from(dir).join(submodule_data); + if pb.is_dir() { + Ok(pb) + } else { + Err(format!( + "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\ + HINT: try running `git submodule update --init`", + udf_env, + pb.display().to_string(), + ).into()) + } +} diff --git a/datafusion/tests/custom_sources.rs b/datafusion/tests/custom_sources.rs index b39f47bba07b1..851640b2b2997 100644 --- a/datafusion/tests/custom_sources.rs +++ b/datafusion/tests/custom_sources.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::Int32Array; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::array::Int32Array; +use arrow2::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use datafusion::{ datasource::{datasource::Statistics, TableProvider}, diff --git a/datafusion/tests/dataframe.rs b/datafusion/tests/dataframe.rs index b93e21f4ababb..b6465bcb41c09 100644 --- a/datafusion/tests/dataframe.rs +++ b/datafusion/tests/dataframe.rs @@ -17,8 +17,8 @@ use std::sync::Arc; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::{ +use arrow2::datatypes::{DataType, Field, Schema}; +use arrow2::{ array::{Int32Array, StringArray}, record_batch::RecordBatch, }; diff --git a/datafusion/tests/provider_filter_pushdown.rs b/datafusion/tests/provider_filter_pushdown.rs index 0bf67bea8b9d4..1696dcb15dc7a 100644 --- a/datafusion/tests/provider_filter_pushdown.rs +++ b/datafusion/tests/provider_filter_pushdown.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
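// A minimal usage sketch of the `arrow_test_data` / `get_data_dir` helpers added in
// datafusion/src/test/mod.rs above, assuming the code lives in that module and the
// `testing/arrow-testing/data` submodule is checked out (or `ARROW_TEST_DATA` points
// at a copy of the arrow testing data). The test name below is illustrative only;
// the csv file is the one the existing tests already read.
#[test]
fn arrow_test_data_locates_aggregate_csv() {
    // Resolves ARROW_TEST_DATA first, then falls back to the submodule path,
    // panicking with a `git submodule update --init` hint otherwise.
    let testdata = arrow_test_data();
    let csv = format!("{}/csv/aggregate_test_100.csv", testdata);
    assert!(std::path::PathBuf::from(csv).exists());
}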
-use arrow::array::{as_primitive_array, Int32Builder, UInt64Array}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; +use arrow2::array::{as_primitive_array, Int32Builder, UInt64Array}; +use arrow2::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::record_batch::RecordBatch; use async_trait::async_trait; use datafusion::datasource::datasource::{ Statistics, TableProvider, TableProviderFilterPushDown, diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 17e0f13609a38..aafa5e667afd3 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -21,15 +21,7 @@ use std::sync::Arc; use chrono::prelude::*; use chrono::Duration; -extern crate arrow; -extern crate datafusion; - -use arrow::{array::*, datatypes::TimeUnit}; -use arrow::{datatypes::Int32Type, datatypes::Int64Type, record_batch::RecordBatch}; -use arrow::{ - datatypes::{DataType, Field, Schema, SchemaRef}, - util::display::array_value_to_string, -}; +use arrow2::{array::*, datatypes::*, record_batch::RecordBatch}; use datafusion::logical_plan::LogicalPlan; use datafusion::prelude::*; @@ -126,7 +118,7 @@ async fn parquet_query() { #[tokio::test] async fn parquet_single_nan_schema() { let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = crate::test::parquet_test_data(); ctx.register_parquet("single_nan", &format!("{}/single_nan.parquet", testdata)) .unwrap(); let sql = "SELECT mycol FROM single_nan"; @@ -144,7 +136,7 @@ async fn parquet_single_nan_schema() { #[ignore = "Test ignored, will be enabled as part of the nested Parquet reader"] async fn parquet_list_columns() { let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = crate::test::parquet_test_data(); ctx.register_parquet( "list_columns", &format!("{}/list_columns.parquet", testdata), @@ -1610,7 +1602,7 @@ fn aggr_test_schema() -> SchemaRef { } async fn register_aggregate_csv_by_sql(ctx: &mut ExecutionContext) { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); // TODO: The following c9 should be migrated to UInt32 and c10 should be UInt64 once // unsigned is supported. 
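// A hedged sketch of the array-construction pattern this patch applies throughout
// the test helpers: arrow's `Int32Array::from(vec![..])` / `StringArray::from(..)`
// become arrow2 `from_slice` constructors, with an explicit offset type parameter
// for string arrays. Only calls already used elsewhere in this patch appear here;
// the function and variable names are illustrative only.
use arrow2::array::{Int32Array, Utf8Array};

fn example_test_columns() -> (Int32Array, Utf8Array<i32>) {
    // Build fixed-size test columns directly from slices.
    let ints = Int32Array::from_slice(&[1, 2, 3]);
    let names = Utf8Array::<i32>::from_slice(&["a", "b", "c"]);
    (ints, names)
}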
@@ -1650,7 +1642,7 @@ async fn register_aggregate_csv_by_sql(ctx: &mut ExecutionContext) { } fn register_aggregate_csv(ctx: &mut ExecutionContext) -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test::arrow_test_data(); let schema = aggr_test_schema(); ctx.register_csv( "aggregate_test_100", @@ -1677,7 +1669,7 @@ fn register_aggregate_simple_csv(ctx: &mut ExecutionContext) -> Result<()> { } fn register_alltypes_parquet(ctx: &mut ExecutionContext) { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = crate::test::parquet_test_data(); ctx.register_parquet( "alltypes_plain", &format!("{}/alltypes_plain.parquet", testdata), @@ -2936,7 +2928,7 @@ async fn test_cast_expressions_error() -> Result<()> { Ok(_) => panic!("expected error"), Err(e) => { assert!(e.to_string().contains( - "Cast error: Cannot cast string 'c' to value of arrow::datatypes::types::Int32Type type" + "Cast error: Cannot cast string 'c' to value of arrow2::datatypes::types::Int32Type type" )) } } @@ -2975,7 +2967,7 @@ async fn test_physical_plan_display_indent() { " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", ]; - let data_path = arrow::util::test_util::arrow_test_data(); + let data_path = crate::test::arrow_test_data(); let actual = format!("{}", displayable(physical_plan.as_ref()).indent()) .trim() .lines() diff --git a/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs index 8914c05e8f88f..9ea7248c786f3 100644 --- a/datafusion/tests/user_defined_plan.rs +++ b/datafusion/tests/user_defined_plan.rs @@ -60,7 +60,7 @@ use futures::{Stream, StreamExt}; -use arrow::{ +use arrow2::{ array::{Int64Array, StringArray}, datatypes::SchemaRef, error::ArrowError,