From 9afb1ac73851f4ea4262d16910f48d18d44d9be9 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Tue, 11 Jan 2022 18:49:07 +0100 Subject: [PATCH] Added nullif against scalar (#753) --- src/compute/comparison/mod.rs | 5 +- src/compute/comparison/primitive.rs | 49 +++--- src/compute/nullif.rs | 244 ++++++++++++++-------------- src/compute/utils.rs | 11 -- 4 files changed, 150 insertions(+), 159 deletions(-) diff --git a/src/compute/comparison/mod.rs b/src/compute/comparison/mod.rs index 8da298375bf..836eab497b4 100644 --- a/src/compute/comparison/mod.rs +++ b/src/compute/comparison/mod.rs @@ -56,7 +56,10 @@ pub mod utf8; mod simd; pub use simd::{Simd8, Simd8Lanes, Simd8PartialEq, Simd8PartialOrd}; -pub(crate) use primitive::compare_values_op as primitive_compare_values_op; +pub(crate) use primitive::{ + compare_values_op as primitive_compare_values_op, + compare_values_op_scalar as primitive_compare_values_op_scalar, +}; macro_rules! match_eq_ord {( $key_type:expr, | $_:tt $T:ident | $($body:tt)* diff --git a/src/compute/comparison/primitive.rs b/src/compute/comparison/primitive.rs index c7949379634..c5a36c5d2a5 100644 --- a/src/compute/comparison/primitive.rs +++ b/src/compute/comparison/primitive.rs @@ -1,7 +1,7 @@ //! 
Comparison functions for [`PrimitiveArray`] use crate::{ array::{BooleanArray, PrimitiveArray}, - bitmap::{Bitmap, MutableBitmap}, + bitmap::MutableBitmap, datatypes::DataType, types::NativeType, }; @@ -37,6 +37,31 @@ where MutableBitmap::from_vec(values, lhs.len()) } +pub(crate) fn compare_values_op_scalar<T, F>(lhs: &[T], rhs: T, op: F) -> MutableBitmap +where + T: NativeType + Simd8, + F: Fn(T::Simd, T::Simd) -> u8, +{ + let rhs = T::Simd::from_chunk(&[rhs; 8]); + + let lhs_chunks_iter = lhs.chunks_exact(8); + let lhs_remainder = lhs_chunks_iter.remainder(); + + let mut values = Vec::with_capacity((lhs.len() + 7) / 8); + let iterator = lhs_chunks_iter.map(|lhs| { + let lhs = T::Simd::from_chunk(lhs); + op(lhs, rhs) + }); + values.extend(iterator); + + if !lhs_remainder.is_empty() { + let lhs = T::Simd::from_incomplete_chunk(lhs_remainder, T::default()); + values.push(op(lhs, rhs)) + }; + + MutableBitmap::from_vec(values, lhs.len()) +} + /// Evaluate `op(lhs, rhs)` for [`PrimitiveArray`]s using a specified /// comparison function. fn compare_op<T, F>(lhs: &PrimitiveArray<T>, rhs: &PrimitiveArray<T>, op: F) -> BooleanArray @@ -59,28 +84,10 @@ where F: Fn(T::Simd, T::Simd) -> u8, { let validity = lhs.validity().cloned(); - let rhs = T::Simd::from_chunk(&[rhs; 8]); - let lhs_chunks_iter = lhs.values().chunks_exact(8); - let lhs_remainder = lhs_chunks_iter.remainder(); + let values = compare_values_op_scalar(lhs.values(), rhs, op); - let mut values = Vec::with_capacity((lhs.len() + 7) / 8); - let iterator = lhs_chunks_iter.map(|lhs| { - let lhs = T::Simd::from_chunk(lhs); - op(lhs, rhs) - }); - values.extend(iterator); - - if !lhs_remainder.is_empty() { - let lhs = T::Simd::from_incomplete_chunk(lhs_remainder, T::default()); - values.push(op(lhs, rhs)) - }; - - BooleanArray::from_data( - DataType::Boolean, - Bitmap::from_u8_vec(values, lhs.len()), - validity, - ) + BooleanArray::from_data(DataType::Boolean, values.into(), validity) } /// Perform `lhs == rhs` operation on two arrays. 
diff --git a/src/compute/nullif.rs b/src/compute/nullif.rs index a83e096806c..88053ef4ad0 100644 --- a/src/compute/nullif.rs +++ b/src/compute/nullif.rs @@ -1,176 +1,168 @@ //! Contains the operator [`nullif`]. use crate::array::PrimitiveArray; use crate::bitmap::Bitmap; -use crate::compute::comparison::{primitive_compare_values_op, Simd8, Simd8PartialEq}; -use crate::compute::utils::check_same_type; +use crate::compute::comparison::{ + primitive_compare_values_op, primitive_compare_values_op_scalar, Simd8, Simd8PartialEq, +}; use crate::datatypes::DataType; -use crate::error::{ArrowError, Result}; +use crate::scalar::PrimitiveScalar; +use crate::scalar::Scalar; use crate::{array::Array, types::NativeType}; use super::utils::combine_validities; /// Returns an array whose validity is null iff `lhs == rhs` or `lhs` is null. -/// This has the same semantics as postgres. +/// This has the same semantics as postgres - the validity of the rhs is ignored. +/// # Panics +/// This function panics iff +/// * The arguments do not have the same logical type +/// * The arguments do not have the same length /// # Example /// ```rust /// # use arrow2::array::Int32Array; /// # use arrow2::datatypes::DataType; -/// # use arrow2::error::Result; -/// # use arrow2::compute::nullif::nullif_primitive; -/// # fn main() -> Result<()> { +/// # use arrow2::compute::nullif::primitive_nullif; +/// # fn main() { /// let lhs = Int32Array::from(&[None, None, Some(1), Some(1), Some(1)]); /// let rhs = Int32Array::from(&[None, Some(1), None, Some(1), Some(0)]); -/// let result = nullif_primitive(&lhs, &rhs)?; +/// let result = primitive_nullif(&lhs, &rhs); /// /// let expected = Int32Array::from(&[None, None, Some(1), None, Some(1)]); /// /// assert_eq!(expected, result); -/// Ok(()) /// # } /// ``` -/// # Errors -/// This function errors iff -/// * The arguments do not have the same logical type -/// * The arguments do not have the same length -pub fn nullif_primitive<T>( - lhs: &PrimitiveArray<T>, - 
rhs: &PrimitiveArray<T>, -) -> Result<PrimitiveArray<T>> +pub fn primitive_nullif<T>(lhs: &PrimitiveArray<T>, rhs: &PrimitiveArray<T>) -> PrimitiveArray<T> where T: NativeType + Simd8, T::Simd: Simd8PartialEq, { - check_same_type(lhs, rhs)?; - let equal = primitive_compare_values_op(lhs.values(), rhs.values(), |lhs, rhs| lhs.neq(rhs)); let equal: Option<Bitmap> = equal.into(); let validity = combine_validities(lhs.validity(), equal.as_ref()); - Ok(PrimitiveArray::<T>::from_data( - lhs.data_type().clone(), - lhs.values().clone(), - validity, - )) + PrimitiveArray::<T>::from_data(lhs.data_type().clone(), lhs.values().clone(), validity) } -/// Returns whether [`nullif`] is implemented for the datatypes. -pub fn can_nullif(lhs: &DataType, rhs: &DataType) -> bool { - if lhs != rhs { - return false; - }; - use DataType::*; - matches!( - lhs, - UInt8 - | UInt16 - | UInt32 - | UInt64 - | Int8 - | Int16 - | Int32 - | Int64 - | Float32 - | Float64 - | Time32(_) - | Time64(_) - | Date32 - | Date64 - | Timestamp(_, _) - | Duration(_) - ) +/// Returns a [`PrimitiveArray`] whose validity is null iff `lhs == rhs` or `lhs` is null. +/// +/// This has the same semantics as postgres. 
+/// # Panics +/// This function panics iff +/// * The arguments do not have the same logical type +/// # Example +/// ```rust +/// # use arrow2::array::Int32Array; +/// # use arrow2::datatypes::DataType; +/// # use arrow2::compute::nullif::primitive_nullif_scalar; +/// # fn main() { +/// let lhs = Int32Array::from(&[None, None, Some(1), Some(0), Some(1)]); +/// let result = primitive_nullif_scalar(&lhs, 0); +/// +/// let expected = Int32Array::from(&[None, None, Some(1), None, Some(1)]); +/// +/// assert_eq!(expected, result); +/// # } +/// ``` +pub fn primitive_nullif_scalar<T>(lhs: &PrimitiveArray<T>, rhs: T) -> PrimitiveArray<T> +where + T: NativeType + Simd8, + T::Simd: Simd8PartialEq, +{ + let equal = primitive_compare_values_op_scalar(lhs.values(), rhs, |lhs, rhs| lhs.neq(rhs)); + let equal: Option<Bitmap> = equal.into(); + + let validity = combine_validities(lhs.validity(), equal.as_ref()); + + PrimitiveArray::<T>::from_data(lhs.data_type().clone(), lhs.values().clone(), validity) } -/// Returns an array whose validity is null iff `lhs == rhs` or `lhs` is null. -/// This has the same semantics as postgres. +/// Returns an [`Array`] with the same type as `lhs` and whose validity +/// is null iff either `lhs == rhs` or `lhs` is null. +/// +/// This has the same semantics as postgres - the validity of the rhs is ignored. 
+/// # Panics +/// This function panics iff +/// * The arguments do not have the same logical type +/// * The arguments do not have the same length +/// * The physical type is not supported for this operation (use [`can_nullif`] to check) /// # Example /// ```rust /// # use arrow2::array::Int32Array; /// # use arrow2::datatypes::DataType; -/// # use arrow2::error::Result; /// # use arrow2::compute::nullif::nullif; -/// # fn main() -> Result<()> { +/// # fn main() { /// let lhs = Int32Array::from(&[None, None, Some(1), Some(1), Some(1)]); /// let rhs = Int32Array::from(&[None, Some(1), None, Some(1), Some(0)]); -/// let result = nullif(&lhs, &rhs)?; +/// let result = nullif(&lhs, &rhs); /// /// let expected = Int32Array::from(&[None, None, Some(1), None, Some(1)]); /// /// assert_eq!(expected, result.as_ref()); -/// Ok(()) /// # } /// ``` -/// # Errors -/// This function errors iff -/// * The arguments do not have the same logical type -/// * The arguments do not have the same length -/// * The logical type is not supported -pub fn nullif(lhs: &dyn Array, rhs: &dyn Array) -> Result<Box<dyn Array>> { - if lhs.data_type() != rhs.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Nullif expects arrays of the the same logical type".to_string(), - )); - } - if lhs.len() != rhs.len() { - return Err(ArrowError::InvalidArgumentError( - "Nullif expects arrays of the the same length".to_string(), - )); +pub fn nullif(lhs: &dyn Array, rhs: &dyn Array) -> Box<dyn Array> { + assert_eq!(lhs.data_type(), rhs.data_type()); + assert_eq!(lhs.len(), rhs.len()); + + use crate::datatypes::PhysicalType::*; + match lhs.data_type().to_physical_type() { + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + Box::new(primitive_nullif::<$T>( + lhs.as_any().downcast_ref().unwrap(), + rhs.as_any().downcast_ref().unwrap(), + )) + }), + other => unimplemented!("Nullif is not implemented for physical type {:?}", other), } - use crate::datatypes::DataType::*; - match lhs.data_type() { - UInt8 
=> nullif_primitive::<u8>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - UInt16 => nullif_primitive::<u16>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - UInt32 => nullif_primitive::<u32>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - UInt64 => nullif_primitive::<u64>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - Int8 => nullif_primitive::<i8>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - Int16 => nullif_primitive::<i16>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - Int32 | Time32(_) | Date32 => nullif_primitive::<i32>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - Int64 | Time64(_) | Date64 | Timestamp(_, _) | Duration(_) => nullif_primitive::<i64>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - Float32 => nullif_primitive::<f32>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - Float64 => nullif_primitive::<f64>( - lhs.as_any().downcast_ref().unwrap(), - rhs.as_any().downcast_ref().unwrap(), - ) - .map(|x| Box::new(x) as Box<dyn Array>), - other => Err(ArrowError::NotYetImplemented(format!( - "Nullif is not implemented for logical datatype {:?}", - other - ))), +} + +/// Returns an [`Array`] with the same type as `lhs` and whose validity +/// is null iff either `lhs == rhs` or `lhs` is null. 
+/// # Panics +/// This function panics iff +/// * Scalar is null +/// * lhs and rhs do not have the same type +/// * The physical type is not supported for this operation (use [`can_nullif`] to check) +/// # Example +/// ```rust +/// # use arrow2::array::Int32Array; +/// # use arrow2::scalar::PrimitiveScalar; +/// # use arrow2::datatypes::DataType; +/// # use arrow2::compute::nullif::nullif_scalar; +/// # fn main() { +/// let lhs = Int32Array::from(&[None, None, Some(1), Some(0), Some(1)]); +/// let rhs = PrimitiveScalar::<i32>::from(Some(0)); +/// let result = nullif_scalar(&lhs, &rhs); +/// +/// let expected = Int32Array::from(&[None, None, Some(1), None, Some(1)]); +/// +/// assert_eq!(expected, result.as_ref()); +/// # } +/// ``` +pub fn nullif_scalar(lhs: &dyn Array, rhs: &dyn Scalar) -> Box<dyn Array> { + assert_eq!(lhs.data_type(), rhs.data_type()); + use crate::datatypes::PhysicalType::*; + match lhs.data_type().to_physical_type() { + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + let scalar = rhs.as_any().downcast_ref::<PrimitiveScalar<$T>>().unwrap(); + let scalar = scalar.value().expect("Scalar to be non-null"); + + Box::new(primitive_nullif_scalar::<$T>( + lhs.as_any().downcast_ref().unwrap(), + scalar, + )) + }), + other => unimplemented!("Nullif is not implemented for physical type {:?}", other), } } + +/// Returns whether [`nullif`] and [`nullif_scalar`] are implemented for the datatypes. +pub fn can_nullif(lhs: &DataType, rhs: &DataType) -> bool { + if lhs != rhs { + return false; + }; + use crate::datatypes::PhysicalType; + matches!(lhs.to_physical_type(), PhysicalType::Primitive(_)) +} diff --git a/src/compute/utils.rs b/src/compute/utils.rs index 69ed5b7a5a8..269a9c9ce37 100644 --- a/src/compute/utils.rs +++ b/src/compute/utils.rs @@ -48,14 +48,3 @@ pub fn check_same_len(lhs: &dyn Array, rhs: &dyn Array) -> Result<()> { } Ok(()) } - -// Errors iff the two arrays have a different data_type. 
-#[inline] -pub fn check_same_type(lhs: &dyn Array, rhs: &dyn Array) -> Result<()> { - if lhs.data_type() != rhs.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Arrays must have the same logical type".to_string(), - )); - } - Ok(()) -}