From fe46a1ed9833f2f9ea4c4ccd4d77718e5c371ab1 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 7 Feb 2022 21:27:14 +0800 Subject: [PATCH] [split/1] split datafusion-common module (#1751) * split datafusion-common module * pyarrow * Update datafusion-common/README.md Co-authored-by: Andy Grove * Update datafusion/Cargo.toml * include publishing Co-authored-by: Andy Grove --- Cargo.toml | 1 + datafusion-common/Cargo.toml | 44 +++++++ datafusion-common/README.md | 24 ++++ datafusion-common/src/error.rs | 209 +++++++++++++++++++++++++++++++++ datafusion-common/src/lib.rs | 20 ++++ datafusion/Cargo.toml | 5 +- datafusion/src/error.rs | 182 +--------------------------- datafusion/src/pyarrow.rs | 8 -- 8 files changed, 302 insertions(+), 191 deletions(-) create mode 100644 datafusion-common/Cargo.toml create mode 100644 datafusion-common/README.md create mode 100644 datafusion-common/src/error.rs create mode 100644 datafusion-common/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index ea1acc04e687..81f6bb59f2d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ [workspace] members = [ "datafusion", + "datafusion-common", "datafusion-cli", "datafusion-examples", "benchmarks", diff --git a/datafusion-common/Cargo.toml b/datafusion-common/Cargo.toml new file mode 100644 index 000000000000..9c05d8095caf --- /dev/null +++ b/datafusion-common/Cargo.toml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-common" +description = "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model" +version = "6.0.0" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" +readme = "README.md" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = [ "arrow", "query", "sql" ] +edition = "2021" +rust-version = "1.58" + +[lib] +name = "datafusion_common" +path = "src/lib.rs" + +[features] +avro = ["avro-rs"] +pyarrow = ["pyo3"] + +[dependencies] +arrow = { version = "8.0.0", features = ["prettyprint"] } +parquet = { version = "8.0.0", features = ["arrow"] } +avro-rs = { version = "0.13", features = ["snappy"], optional = true } +pyo3 = { version = "0.15", optional = true } +sqlparser = "0.13" diff --git a/datafusion-common/README.md b/datafusion-common/README.md new file mode 100644 index 000000000000..8c44d78ef47f --- /dev/null +++ b/datafusion-common/README.md @@ -0,0 +1,24 @@ + + +# DataFusion Common + +This is an internal module for the most fundamental types of [DataFusion][df]. + +[df]: https://crates.io/crates/datafusion diff --git a/datafusion-common/src/error.rs b/datafusion-common/src/error.rs new file mode 100644 index 000000000000..ee2e61892fd4 --- /dev/null +++ b/datafusion-common/src/error.rs @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! DataFusion error types + +use std::error; +use std::fmt::{Display, Formatter}; +use std::io; +use std::result; + +use arrow::error::ArrowError; +#[cfg(feature = "avro")] +use avro_rs::Error as AvroError; +use parquet::errors::ParquetError; +#[cfg(feature = "pyarrow")] +use pyo3::exceptions::PyException; +#[cfg(feature = "pyarrow")] +use pyo3::prelude::PyErr; +use sqlparser::parser::ParserError; + +/// Result type for operations that could result in an [DataFusionError] +pub type Result = result::Result; + +/// Error type for generic operations that could result in DataFusionError::External +pub type GenericError = Box; + +/// DataFusion error +#[derive(Debug)] +pub enum DataFusionError { + /// Error returned by arrow. + ArrowError(ArrowError), + /// Wraps an error from the Parquet crate + ParquetError(ParquetError), + /// Wraps an error from the Avro crate + #[cfg(feature = "avro")] + AvroError(AvroError), + /// Error associated to I/O operations and associated traits. + IoError(io::Error), + /// Error returned when SQL is syntactically incorrect. + SQL(ParserError), + /// Error returned on a branch that we know it is possible + /// but to which we still have no implementation for. + /// Often, these errors are tracked in our issue tracker. + NotImplemented(String), + /// Error returned as a consequence of an error in DataFusion. + /// This error should not happen in normal usage of DataFusion. + // DataFusions has internal invariants that we are unable to ask the compiler to check for us. + // This error is raised when one of those invariants is not verified during execution. + Internal(String), + /// This error happens whenever a plan is not valid. Examples include + /// impossible casts, schema inference not possible and non-unique column names. + Plan(String), + /// Error returned during execution of the query. + /// Examples include files not found, errors in parsing certain types. + Execution(String), + /// This error is thrown when a consumer cannot acquire memory from the Memory Manager + /// we can just cancel the execution of the partition. + ResourcesExhausted(String), + /// Errors originating from outside DataFusion's core codebase. + /// For example, a custom S3Error from the crate datafusion-objectstore-s3 + External(GenericError), +} + +impl From for DataFusionError { + fn from(e: io::Error) -> Self { + DataFusionError::IoError(e) + } +} + +impl From for DataFusionError { + fn from(e: ArrowError) -> Self { + DataFusionError::ArrowError(e) + } +} + +#[cfg(feature = "pyarrow")] +impl From for PyErr { + fn from(err: DataFusionError) -> PyErr { + PyException::new_err(err.to_string()) + } +} + +impl From for ArrowError { + fn from(e: DataFusionError) -> Self { + match e { + DataFusionError::ArrowError(e) => e, + DataFusionError::External(e) => ArrowError::ExternalError(e), + other => ArrowError::ExternalError(Box::new(other)), + } + } +} + +impl From for DataFusionError { + fn from(e: ParquetError) -> Self { + DataFusionError::ParquetError(e) + } +} + +#[cfg(feature = "avro")] +impl From for DataFusionError { + fn from(e: AvroError) -> Self { + DataFusionError::AvroError(e) + } +} + +impl From for DataFusionError { + fn from(e: ParserError) -> Self { + DataFusionError::SQL(e) + } +} + +impl From for DataFusionError { + fn from(err: GenericError) -> Self { + DataFusionError::External(err) + } +} + +impl Display for DataFusionError { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + match *self { + DataFusionError::ArrowError(ref desc) => write!(f, "Arrow error: {}", desc), + DataFusionError::ParquetError(ref desc) => { + write!(f, "Parquet error: {}", desc) + } + #[cfg(feature = "avro")] + DataFusionError::AvroError(ref desc) => { + write!(f, "Avro error: {}", desc) + } + DataFusionError::IoError(ref desc) => write!(f, "IO error: {}", desc), + DataFusionError::SQL(ref desc) => { + write!(f, "SQL error: {:?}", desc) + } + DataFusionError::NotImplemented(ref desc) => { + write!(f, "This feature is not implemented: {}", desc) + } + DataFusionError::Internal(ref desc) => { + write!(f, "Internal error: {}. This was likely caused by a bug in DataFusion's \ + code and we would welcome that you file an bug report in our issue tracker", desc) + } + DataFusionError::Plan(ref desc) => { + write!(f, "Error during planning: {}", desc) + } + DataFusionError::Execution(ref desc) => { + write!(f, "Execution error: {}", desc) + } + DataFusionError::ResourcesExhausted(ref desc) => { + write!(f, "Resources exhausted: {}", desc) + } + DataFusionError::External(ref desc) => { + write!(f, "External error: {}", desc) + } + } + } +} + +impl error::Error for DataFusionError {} + +#[cfg(test)] +mod test { + use crate::error::DataFusionError; + use arrow::error::ArrowError; + + #[test] + fn arrow_error_to_datafusion() { + let res = return_arrow_error().unwrap_err(); + assert_eq!( + res.to_string(), + "External error: Error during planning: foo" + ); + } + + #[test] + fn datafusion_error_to_arrow() { + let res = return_datafusion_error().unwrap_err(); + assert_eq!(res.to_string(), "Arrow error: Schema error: bar"); + } + + /// Model what happens when implementing SendableRecrordBatchStream: + /// DataFusion code needs to return an ArrowError + #[allow(clippy::try_err)] + fn return_arrow_error() -> arrow::error::Result<()> { + // Expect the '?' to work + let _foo = Err(DataFusionError::Plan("foo".to_string()))?; + Ok(()) + } + + /// Model what happens when using arrow kernels in DataFusion + /// code: need to turn an ArrowError into a DataFusionError + #[allow(clippy::try_err)] + fn return_datafusion_error() -> crate::error::Result<()> { + // Expect the '?' to work + let _bar = Err(ArrowError::SchemaError("bar".to_string()))?; + Ok(()) + } +} diff --git a/datafusion-common/src/lib.rs b/datafusion-common/src/lib.rs new file mode 100644 index 000000000000..ac8ef623b0cd --- /dev/null +++ b/datafusion-common/src/lib.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod error; + +pub use error::{DataFusionError, Result}; diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 81e2bb14877b..6df852257e16 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -43,13 +43,14 @@ simd = ["arrow/simd"] crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] regex_expressions = ["regex"] unicode_expressions = ["unicode-segmentation"] -pyarrow = ["pyo3", "arrow/pyarrow"] +pyarrow = ["pyo3", "arrow/pyarrow", "datafusion-common/pyarrow"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) force_hash_collisions = [] # Used to enable the avro format -avro = ["avro-rs", "num-traits"] +avro = ["avro-rs", "num-traits", "datafusion-common/avro"] [dependencies] +datafusion-common = { path = "../datafusion-common", version = "6.0.0" } ahash = { version = "0.7", default-features = false } hashbrown = { version = "0.12", features = ["raw"] } arrow = { version = "8.0.0", features = ["prettyprint"] } diff --git a/datafusion/src/error.rs b/datafusion/src/error.rs index 248f24350356..c2c80b48781e 100644 --- a/datafusion/src/error.rs +++ b/datafusion/src/error.rs @@ -16,184 +16,4 @@ // under the License. //! DataFusion error types - -use std::error; -use std::fmt::{Display, Formatter}; -use std::io; -use std::result; - -use arrow::error::ArrowError; -#[cfg(feature = "avro")] -use avro_rs::Error as AvroError; -use parquet::errors::ParquetError; -use sqlparser::parser::ParserError; - -/// Result type for operations that could result in an [DataFusionError] -pub type Result = result::Result; - -/// Error type for generic operations that could result in DataFusionError::External -pub type GenericError = Box; - -/// DataFusion error -#[derive(Debug)] -#[allow(missing_docs)] -pub enum DataFusionError { - /// Error returned by arrow. - ArrowError(ArrowError), - /// Wraps an error from the Parquet crate - ParquetError(ParquetError), - /// Wraps an error from the Avro crate - #[cfg(feature = "avro")] - AvroError(AvroError), - /// Error associated to I/O operations and associated traits. - IoError(io::Error), - /// Error returned when SQL is syntactically incorrect. - SQL(ParserError), - /// Error returned on a branch that we know it is possible - /// but to which we still have no implementation for. - /// Often, these errors are tracked in our issue tracker. - NotImplemented(String), - /// Error returned as a consequence of an error in DataFusion. - /// This error should not happen in normal usage of DataFusion. - // DataFusions has internal invariants that we are unable to ask the compiler to check for us. - // This error is raised when one of those invariants is not verified during execution. - Internal(String), - /// This error happens whenever a plan is not valid. Examples include - /// impossible casts, schema inference not possible and non-unique column names. - Plan(String), - /// Error returned during execution of the query. - /// Examples include files not found, errors in parsing certain types. - Execution(String), - /// This error is thrown when a consumer cannot acquire memory from the Memory Manager - /// we can just cancel the execution of the partition. - ResourcesExhausted(String), - /// Errors originating from outside DataFusion's core codebase. - /// For example, a custom S3Error from the crate datafusion-objectstore-s3 - External(GenericError), -} - -impl From for DataFusionError { - fn from(e: io::Error) -> Self { - DataFusionError::IoError(e) - } -} - -impl From for DataFusionError { - fn from(e: ArrowError) -> Self { - DataFusionError::ArrowError(e) - } -} - -impl From for ArrowError { - fn from(e: DataFusionError) -> Self { - match e { - DataFusionError::ArrowError(e) => e, - DataFusionError::External(e) => ArrowError::ExternalError(e), - other => ArrowError::ExternalError(Box::new(other)), - } - } -} - -impl From for DataFusionError { - fn from(e: ParquetError) -> Self { - DataFusionError::ParquetError(e) - } -} - -#[cfg(feature = "avro")] -impl From for DataFusionError { - fn from(e: AvroError) -> Self { - DataFusionError::AvroError(e) - } -} - -impl From for DataFusionError { - fn from(e: ParserError) -> Self { - DataFusionError::SQL(e) - } -} - -impl From for DataFusionError { - fn from(err: GenericError) -> Self { - DataFusionError::External(err) - } -} - -impl Display for DataFusionError { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - match *self { - DataFusionError::ArrowError(ref desc) => write!(f, "Arrow error: {}", desc), - DataFusionError::ParquetError(ref desc) => { - write!(f, "Parquet error: {}", desc) - } - #[cfg(feature = "avro")] - DataFusionError::AvroError(ref desc) => { - write!(f, "Avro error: {}", desc) - } - DataFusionError::IoError(ref desc) => write!(f, "IO error: {}", desc), - DataFusionError::SQL(ref desc) => { - write!(f, "SQL error: {:?}", desc) - } - DataFusionError::NotImplemented(ref desc) => { - write!(f, "This feature is not implemented: {}", desc) - } - DataFusionError::Internal(ref desc) => { - write!(f, "Internal error: {}. This was likely caused by a bug in DataFusion's \ - code and we would welcome that you file an bug report in our issue tracker", desc) - } - DataFusionError::Plan(ref desc) => { - write!(f, "Error during planning: {}", desc) - } - DataFusionError::Execution(ref desc) => { - write!(f, "Execution error: {}", desc) - } - DataFusionError::ResourcesExhausted(ref desc) => { - write!(f, "Resources exhausted: {}", desc) - } - DataFusionError::External(ref desc) => { - write!(f, "External error: {}", desc) - } - } - } -} - -impl error::Error for DataFusionError {} - -#[cfg(test)] -mod test { - use crate::error::DataFusionError; - use arrow::error::ArrowError; - - #[test] - fn arrow_error_to_datafusion() { - let res = return_arrow_error().unwrap_err(); - assert_eq!( - res.to_string(), - "External error: Error during planning: foo" - ); - } - - #[test] - fn datafusion_error_to_arrow() { - let res = return_datafusion_error().unwrap_err(); - assert_eq!(res.to_string(), "Arrow error: Schema error: bar"); - } - - /// Model what happens when implementing SendableRecrordBatchStream: - /// DataFusion code needs to return an ArrowError - #[allow(clippy::try_err)] - fn return_arrow_error() -> arrow::error::Result<()> { - // Expect the '?' to work - let _foo = Err(DataFusionError::Plan("foo".to_string()))?; - Ok(()) - } - - /// Model what happens when using arrow kernels in DataFusion - /// code: need to turn an ArrowError into a DataFusionError - #[allow(clippy::try_err)] - fn return_datafusion_error() -> crate::error::Result<()> { - // Expect the '?' to work - let _bar = Err(ArrowError::SchemaError("bar".to_string()))?; - Ok(()) - } -} +pub use datafusion_common::{DataFusionError, Result}; diff --git a/datafusion/src/pyarrow.rs b/datafusion/src/pyarrow.rs index d819b2b41154..46eb6b4437b5 100644 --- a/datafusion/src/pyarrow.rs +++ b/datafusion/src/pyarrow.rs @@ -15,21 +15,13 @@ // specific language governing permissions and limitations // under the License. -use pyo3::exceptions::PyException; use pyo3::prelude::*; use pyo3::types::PyList; use crate::arrow::array::ArrayData; use crate::arrow::pyarrow::PyArrowConvert; -use crate::error::DataFusionError; use crate::scalar::ScalarValue; -impl From for PyErr { - fn from(err: DataFusionError) -> PyErr { - PyException::new_err(err.to_string()) - } -} - impl PyArrowConvert for ScalarValue { fn from_pyarrow(value: &PyAny) -> PyResult { let py = value.py();