From d20c8ed194893db5f554b9ec02ac56b1fdac5815 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Sat, 30 Oct 2021 22:50:25 +0800 Subject: [PATCH 1/5] Add partial_lexer option to cast --- src/compute/cast/binary_to.rs | 21 ++++-- src/compute/cast/dictionary_to.rs | 11 ++- src/compute/cast/mod.rs | 110 ++++++++++++++++++------------ src/compute/cast/utf8_to.rs | 19 +++++- tests/it/compute/cast.rs | 13 +++- 5 files changed, 120 insertions(+), 54 deletions(-) diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index f0bf65af098..246c07717e1 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -3,6 +3,8 @@ use std::convert::TryFrom; use crate::error::{ArrowError, Result}; use crate::{array::*, buffer::Buffer, datatypes::DataType, types::NativeType}; +use super::CastOptions; + /// Conversion of binary pub fn binary_to_large_binary(from: &BinaryArray, to_data_type: DataType) -> BinaryArray { let values = from.values().clone(); @@ -31,13 +33,21 @@ pub fn binary_large_to_binary( } /// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. -pub fn binary_to_primitive(from: &BinaryArray, to: &DataType) -> PrimitiveArray +pub fn binary_to_primitive( + from: &BinaryArray, + to: &DataType, + options: CastOptions, +) -> PrimitiveArray where T: NativeType + lexical_core::FromLexical, { - let iter = from - .iter() - .map(|x| x.and_then::(|x| lexical_core::parse(x).ok())); + let parse_fn = if options.partial { + |x| lexical_core::parse(x).ok() + } else { + |x| lexical_core::parse_partial(x).ok().map(|x| x.0) + }; + + let iter = from.iter().map(|x| x.and_then::(parse_fn)); PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } @@ -45,12 +55,13 @@ where pub(super) fn binary_to_primitive_dyn( from: &dyn Array, to: &DataType, + options: CastOptions, ) -> Result> where T: NativeType + lexical_core::FromLexical, { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(binary_to_primitive::(from, to))) + Ok(Box::new(binary_to_primitive::(from, to, options))) } /// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing. diff --git a/src/compute/cast/dictionary_to.rs b/src/compute/cast/dictionary_to.rs index 7b149a3906f..d047ebb93d7 100644 --- a/src/compute/cast/dictionary_to.rs +++ b/src/compute/cast/dictionary_to.rs @@ -44,8 +44,15 @@ pub fn wrapping_dictionary_to_dictionary_values( let keys = from.keys(); let values = from.values(); - let values = - cast_with_options(values.as_ref(), values_type, CastOptions { wrapped: true })?.into(); + let values = cast_with_options( + values.as_ref(), + values_type, + CastOptions { + wrapped: true, + partial: false, + }, + )? + .into(); Ok(DictionaryArray::from_data(keys.clone(), values)) } diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 95975c42e26..31f00636bbf 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -21,11 +21,14 @@ pub use utf8_to::*; /// options defining how Cast kernels behave #[derive(Clone, Copy, Debug, Default)] -struct CastOptions { +pub struct CastOptions { /// default to false /// whether an overflowing cast should be converted to `None` (default), or be wrapped (i.e. `256i16 as u8 = 0` vectorized). /// Settings this to `true` is 5-6x faster for numeric types. wrapped: bool, + /// default to false + /// whether to cast to an integer at the best-effort + partial: bool, } impl CastOptions { @@ -324,14 +327,35 @@ fn cast_large_to_list(array: &ListArray, to_type: &DataType) -> ListArray Result> { - cast_with_options(array, to_type, CastOptions { wrapped: false }) + cast_with_options(array, to_type, CastOptions::default()) } /// Similar to [`cast`], but overflowing cast is wrapped /// Behavior: /// * PrimitiveArray to PrimitiveArray: overflowing cast will be wrapped (i.e. `256i16 as u8 = 0` vectorized). pub fn wrapping_cast(array: &dyn Array, to_type: &DataType) -> Result> { - cast_with_options(array, to_type, CastOptions { wrapped: true }) + cast_with_options( + array, + to_type, + CastOptions { + wrapped: true, + partial: false, + }, + ) +} + +/// Similar to [`cast`], but parse the utf8/binary into integer at the best-effort. +/// Behavior: +/// * PrimitiveArray to PrimitiveArray: overflowing cast will be wrapped (i.e. `256i16 as u8 = 0` vectorized). +pub fn partial_cast(array: &dyn Array, to_type: &DataType) -> Result> { + cast_with_options( + array, + to_type, + CastOptions { + wrapped: false, + partial: true, + }, + ) } #[inline] @@ -451,16 +475,16 @@ fn cast_with_options( }, (Utf8, _) => match to_type { - UInt8 => utf8_to_primitive_dyn::(array, to_type), - UInt16 => utf8_to_primitive_dyn::(array, to_type), - UInt32 => utf8_to_primitive_dyn::(array, to_type), - UInt64 => utf8_to_primitive_dyn::(array, to_type), - Int8 => utf8_to_primitive_dyn::(array, to_type), - Int16 => utf8_to_primitive_dyn::(array, to_type), - Int32 => utf8_to_primitive_dyn::(array, to_type), - Int64 => utf8_to_primitive_dyn::(array, to_type), - Float32 => utf8_to_primitive_dyn::(array, to_type), - Float64 => utf8_to_primitive_dyn::(array, to_type), + UInt8 => utf8_to_primitive_dyn::(array, to_type, options), + UInt16 => utf8_to_primitive_dyn::(array, to_type, options), + UInt32 => utf8_to_primitive_dyn::(array, to_type, options), + UInt64 => utf8_to_primitive_dyn::(array, to_type, options), + Int8 => utf8_to_primitive_dyn::(array, to_type, options), + Int16 => utf8_to_primitive_dyn::(array, to_type, options), + Int32 => utf8_to_primitive_dyn::(array, to_type, options), + Int64 => utf8_to_primitive_dyn::(array, to_type, options), + Float32 => utf8_to_primitive_dyn::(array, to_type, options), + Float64 => utf8_to_primitive_dyn::(array, to_type, options), Date32 => utf8_to_date32_dyn::(array), Date64 => utf8_to_date64_dyn::(array), LargeUtf8 => Ok(Box::new(utf8_to_large_utf8( @@ -476,16 +500,16 @@ fn cast_with_options( ))), }, (LargeUtf8, _) => match to_type { - UInt8 => utf8_to_primitive_dyn::(array, to_type), - UInt16 => utf8_to_primitive_dyn::(array, to_type), - UInt32 => utf8_to_primitive_dyn::(array, to_type), - UInt64 => utf8_to_primitive_dyn::(array, to_type), - Int8 => utf8_to_primitive_dyn::(array, to_type), - Int16 => utf8_to_primitive_dyn::(array, to_type), - Int32 => utf8_to_primitive_dyn::(array, to_type), - Int64 => utf8_to_primitive_dyn::(array, to_type), - Float32 => utf8_to_primitive_dyn::(array, to_type), - Float64 => utf8_to_primitive_dyn::(array, to_type), + UInt8 => utf8_to_primitive_dyn::(array, to_type, options), + UInt16 => utf8_to_primitive_dyn::(array, to_type, options), + UInt32 => utf8_to_primitive_dyn::(array, to_type, options), + UInt64 => utf8_to_primitive_dyn::(array, to_type, options), + Int8 => utf8_to_primitive_dyn::(array, to_type, options), + Int16 => utf8_to_primitive_dyn::(array, to_type, options), + Int32 => utf8_to_primitive_dyn::(array, to_type, options), + Int64 => utf8_to_primitive_dyn::(array, to_type, options), + Float32 => utf8_to_primitive_dyn::(array, to_type, options), + Float64 => utf8_to_primitive_dyn::(array, to_type, options), Date32 => utf8_to_date32_dyn::(array), Date64 => utf8_to_date64_dyn::(array), Utf8 => utf8_large_to_utf8(array.as_any().downcast_ref().unwrap()) @@ -573,16 +597,16 @@ fn cast_with_options( }, (Binary, _) => match to_type { - UInt8 => binary_to_primitive_dyn::(array, to_type), - UInt16 => binary_to_primitive_dyn::(array, to_type), - UInt32 => binary_to_primitive_dyn::(array, to_type), - UInt64 => binary_to_primitive_dyn::(array, to_type), - Int8 => binary_to_primitive_dyn::(array, to_type), - Int16 => binary_to_primitive_dyn::(array, to_type), - Int32 => binary_to_primitive_dyn::(array, to_type), - Int64 => binary_to_primitive_dyn::(array, to_type), - Float32 => binary_to_primitive_dyn::(array, to_type), - Float64 => binary_to_primitive_dyn::(array, to_type), + UInt8 => binary_to_primitive_dyn::(array, to_type, options), + UInt16 => binary_to_primitive_dyn::(array, to_type, options), + UInt32 => binary_to_primitive_dyn::(array, to_type, options), + UInt64 => binary_to_primitive_dyn::(array, to_type, options), + Int8 => binary_to_primitive_dyn::(array, to_type, options), + Int16 => binary_to_primitive_dyn::(array, to_type, options), + Int32 => binary_to_primitive_dyn::(array, to_type, options), + Int64 => binary_to_primitive_dyn::(array, to_type, options), + Float32 => binary_to_primitive_dyn::(array, to_type, options), + Float64 => binary_to_primitive_dyn::(array, to_type, options), LargeBinary => Ok(Box::new(binary_to_large_binary( array.as_any().downcast_ref().unwrap(), to_type.clone(), @@ -594,16 +618,16 @@ fn cast_with_options( }, (LargeBinary, _) => match to_type { - UInt8 => binary_to_primitive_dyn::(array, to_type), - UInt16 => binary_to_primitive_dyn::(array, to_type), - UInt32 => binary_to_primitive_dyn::(array, to_type), - UInt64 => binary_to_primitive_dyn::(array, to_type), - Int8 => binary_to_primitive_dyn::(array, to_type), - Int16 => binary_to_primitive_dyn::(array, to_type), - Int32 => binary_to_primitive_dyn::(array, to_type), - Int64 => binary_to_primitive_dyn::(array, to_type), - Float32 => binary_to_primitive_dyn::(array, to_type), - Float64 => binary_to_primitive_dyn::(array, to_type), + UInt8 => binary_to_primitive_dyn::(array, to_type, options), + UInt16 => binary_to_primitive_dyn::(array, to_type, options), + UInt32 => binary_to_primitive_dyn::(array, to_type, options), + UInt64 => binary_to_primitive_dyn::(array, to_type, options), + Int8 => binary_to_primitive_dyn::(array, to_type, options), + Int16 => binary_to_primitive_dyn::(array, to_type, options), + Int32 => binary_to_primitive_dyn::(array, to_type, options), + Int64 => binary_to_primitive_dyn::(array, to_type, options), + Float32 => binary_to_primitive_dyn::(array, to_type, options), + Float64 => binary_to_primitive_dyn::(array, to_type, options), Binary => { binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone()) .map(|x| Box::new(x) as Box) diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 6c042d3de48..8877236f9e6 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -11,16 +11,28 @@ use crate::{ }, }; +use super::CastOptions; + const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; /// Casts a [`Utf8Array`] to a [`PrimitiveArray`], making any uncastable value a Null. -pub fn utf8_to_primitive(from: &Utf8Array, to: &DataType) -> PrimitiveArray +pub fn utf8_to_primitive( + from: &Utf8Array, + to: &DataType, + options: CastOptions, +) -> PrimitiveArray where T: NativeType + lexical_core::FromLexical, { + let parse_fn = if options.partial { + |x| lexical_core::parse(x).ok() + } else { + |x| lexical_core::parse_partial(x).ok().map(|x| x.0) + }; + let iter = from .iter() - .map(|x| x.and_then::(|x| lexical_core::parse(x.as_bytes()).ok())); + .map(|x| x.and_then::(|x| parse_fn(x.as_bytes()))); PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } @@ -28,12 +40,13 @@ where pub(super) fn utf8_to_primitive_dyn( from: &dyn Array, to: &DataType, + options: CastOptions, ) -> Result> where T: NativeType + lexical_core::FromLexical, { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_primitive::(from, to))) + Ok(Box::new(utf8_to_primitive::(from, to, options))) } /// Casts a [`Utf8Array`] to a Date32 primitive, making any uncastable value a Null. diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index 70cba0455fa..fbae8a3efa4 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -1,5 +1,5 @@ use arrow2::array::*; -use arrow2::compute::cast::{can_cast_types, cast, wrapping_cast}; +use arrow2::compute::cast::{can_cast_types, cast, partial_cast, wrapping_cast}; use arrow2::datatypes::*; use arrow2::types::NativeType; @@ -173,6 +173,17 @@ fn binary_to_i32() { assert_eq!(c, &expected); } +#[test] +fn binary_to_i32_partial() { + let array = BinaryArray::::from_slice(&["5", "6", "123 abseven", "aaa", "9.1"]); + let b = partial_cast(&array, &DataType::Int32).unwrap(); + let c = b.as_any().downcast_ref::>().unwrap(); + + let expected = &[Some(5), Some(6), Some(123), Some(0), None]; + let expected = Int32Array::from(expected); + assert_eq!(c, &expected); +} + #[test] fn utf8_to_i32() { let array = Utf8Array::::from_slice(&["5", "6", "seven", "8", "9.1"]); From c05f0a4bb8cf8b782315fd008973887f15920a86 Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Sun, 31 Oct 2021 09:31:31 +0800 Subject: [PATCH 2/5] add partial cast test --- src/compute/cast/binary_to.rs | 2 +- src/compute/cast/utf8_to.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index 246c07717e1..f4a768f38e1 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -41,7 +41,7 @@ pub fn binary_to_primitive( where T: NativeType + lexical_core::FromLexical, { - let parse_fn = if options.partial { + let parse_fn = if !options.partial { |x| lexical_core::parse(x).ok() } else { |x| lexical_core::parse_partial(x).ok().map(|x| x.0) diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 8877236f9e6..2b1d8e40213 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -24,7 +24,7 @@ pub fn utf8_to_primitive( where T: NativeType + lexical_core::FromLexical, { - let parse_fn = if options.partial { + let parse_fn = if !options.partial { |x| lexical_core::parse(x).ok() } else { |x| lexical_core::parse_partial(x).ok().map(|x| x.0) From 19e61fcd2e3cd7a00e6d454bafd544b9c2bf1b53 Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Sun, 31 Oct 2021 09:54:31 +0800 Subject: [PATCH 3/5] add partial cast test --- tests/it/compute/cast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index fbae8a3efa4..f0a66843572 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -179,7 +179,7 @@ fn binary_to_i32_partial() { let b = partial_cast(&array, &DataType::Int32).unwrap(); let c = b.as_any().downcast_ref::>().unwrap(); - let expected = &[Some(5), Some(6), Some(123), Some(0), None]; + let expected = &[Some(5), Some(6), Some(123), Some(0), Some(9)]; let expected = Int32Array::from(expected); assert_eq!(c, &expected); } From 049461054ed0b02459aa8e6fc1821aeaa6a929a9 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 3 Nov 2021 11:21:55 +0800 Subject: [PATCH 4/5] Apply comments --- src/compute/cast/binary_to.rs | 31 ++++--- src/compute/cast/dictionary_to.rs | 10 +-- src/compute/cast/mod.rs | 49 ++--------- src/compute/cast/utf8_to.rs | 33 +++++--- tests/it/compute/cast.rs | 135 ++++++++++++++++++++++-------- 5 files changed, 154 insertions(+), 104 deletions(-) diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index f4a768f38e1..4183fbfe593 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -32,22 +32,29 @@ pub fn binary_large_to_binary( )) } -/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. -pub fn binary_to_primitive( +/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], as best-effort using `lexical_core::parse_partial`, making any uncastable value as zero. +pub fn partial_binary_to_primitive( from: &BinaryArray, to: &DataType, - options: CastOptions, ) -> PrimitiveArray where T: NativeType + lexical_core::FromLexical, { - let parse_fn = if !options.partial { - |x| lexical_core::parse(x).ok() - } else { - |x| lexical_core::parse_partial(x).ok().map(|x| x.0) - }; + let iter = from + .iter() + .map(|x| x.and_then::(|x| lexical_core::parse_partial(x).ok().map(|x| x.0))); - let iter = from.iter().map(|x| x.and_then::(parse_fn)); + PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) +} + +/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. +pub fn binary_to_primitive(from: &BinaryArray, to: &DataType) -> PrimitiveArray +where + T: NativeType + lexical_core::FromLexical, +{ + let iter = from + .iter() + .map(|x| x.and_then::(|x| lexical_core::parse(x).ok())); PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } @@ -61,7 +68,11 @@ where T: NativeType + lexical_core::FromLexical, { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(binary_to_primitive::(from, to, options))) + if options.partial { + Ok(Box::new(partial_binary_to_primitive::(from, to))) + } else { + Ok(Box::new(binary_to_primitive::(from, to))) + } } /// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing. diff --git a/src/compute/cast/dictionary_to.rs b/src/compute/cast/dictionary_to.rs index d047ebb93d7..77c696ec915 100644 --- a/src/compute/cast/dictionary_to.rs +++ b/src/compute/cast/dictionary_to.rs @@ -1,7 +1,7 @@ use super::{primitive_as_primitive, primitive_to_primitive, CastOptions}; use crate::{ array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}, - compute::{cast::cast_with_options, take::take}, + compute::{cast::cast, take::take}, datatypes::DataType, error::{ArrowError, Result}, }; @@ -32,7 +32,7 @@ pub fn dictionary_to_dictionary_values( let keys = from.keys(); let values = from.values(); - let values = cast_with_options(values.as_ref(), values_type, CastOptions::default())?.into(); + let values = cast(values.as_ref(), values_type, CastOptions::default())?.into(); Ok(DictionaryArray::from_data(keys.clone(), values)) } @@ -44,7 +44,7 @@ pub fn wrapping_dictionary_to_dictionary_values( let keys = from.keys(); let values = from.values(); - let values = cast_with_options( + let values = cast( values.as_ref(), values_type, CastOptions { @@ -111,7 +111,7 @@ pub(super) fn dictionary_cast_dyn( match to_type { DataType::Dictionary(to_keys_type, to_values_type) => { - let values = cast_with_options(values.as_ref(), to_values_type, options)?.into(); + let values = cast(values.as_ref(), to_values_type, options)?.into(); // create the appropriate array type with_match_dictionary_key_type!(to_keys_type.as_ref(), |$T| { @@ -134,7 +134,7 @@ where { // attempt to cast the dict values to the target type // use the take kernel to expand out the dictionary - let values = cast_with_options(values, to_type, options)?; + let values = cast(values, to_type, options)?; // take requires first casting i32 let indices = primitive_to_primitive::<_, i32>(keys, &DataType::Int32); diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 31f00636bbf..99c1542e792 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -25,10 +25,10 @@ pub struct CastOptions { /// default to false /// whether an overflowing cast should be converted to `None` (default), or be wrapped (i.e. `256i16 as u8 = 0` vectorized). /// Settings this to `true` is 5-6x faster for numeric types. - wrapped: bool, + pub wrapped: bool, /// default to false /// whether to cast to an integer at the best-effort - partial: bool, + pub partial: bool, } impl CastOptions { @@ -265,7 +265,7 @@ fn cast_list( options: CastOptions, ) -> Result> { let values = array.values(); - let new_values = cast_with_options( + let new_values = cast( values.as_ref(), ListArray::::get_child_type(to_type), options, @@ -326,44 +326,7 @@ fn cast_large_to_list(array: &ListArray, to_type: &DataType) -> ListArray Result> { - cast_with_options(array, to_type, CastOptions::default()) -} - -/// Similar to [`cast`], but overflowing cast is wrapped -/// Behavior: -/// * PrimitiveArray to PrimitiveArray: overflowing cast will be wrapped (i.e. `256i16 as u8 = 0` vectorized). -pub fn wrapping_cast(array: &dyn Array, to_type: &DataType) -> Result> { - cast_with_options( - array, - to_type, - CastOptions { - wrapped: true, - partial: false, - }, - ) -} - -/// Similar to [`cast`], but parse the utf8/binary into integer at the best-effort. -/// Behavior: -/// * PrimitiveArray to PrimitiveArray: overflowing cast will be wrapped (i.e. `256i16 as u8 = 0` vectorized). -pub fn partial_cast(array: &dyn Array, to_type: &DataType) -> Result> { - cast_with_options( - array, - to_type, - CastOptions { - wrapped: false, - partial: true, - }, - ) -} - -#[inline] -fn cast_with_options( - array: &dyn Array, - to_type: &DataType, - options: CastOptions, -) -> Result> { +pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Result> { use DataType::*; let from_type = array.data_type(); @@ -402,7 +365,7 @@ fn cast_with_options( (_, List(to)) => { // cast primitive to list's primitive - let values = cast_with_options(array, to.data_type(), options)?.into(); + let values = cast(array, to.data_type(), options)?.into(); // create offsets, where if array.len() = 2, we have [0,1,2] let offsets = unsafe { Buffer::from_trusted_len_iter_unchecked(0..=array.len() as i32) }; @@ -845,7 +808,7 @@ fn cast_to_dictionary( dict_value_type: &DataType, options: CastOptions, ) -> Result> { - let array = cast_with_options(array, dict_value_type, options)?; + let array = cast(array, dict_value_type, options)?; let array = array.as_ref(); match *dict_value_type { DataType::Int8 => primitive_to_dictionary_dyn::(array), diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 2b1d8e40213..0bc421262be 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -16,23 +16,28 @@ use super::CastOptions; const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; /// Casts a [`Utf8Array`] to a [`PrimitiveArray`], making any uncastable value a Null. -pub fn utf8_to_primitive( +pub fn utf8_to_primitive(from: &Utf8Array, to: &DataType) -> PrimitiveArray +where + T: NativeType + lexical_core::FromLexical, +{ + let iter = from + .iter() + .map(|x| x.and_then::(|x| lexical_core::parse(x.as_bytes()).ok())); + + PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) +} + +/// Casts a [`Utf8Array`] to a [`PrimitiveArray`] as best-effort using `lexical_core::parse_partial`, making any uncastable value as zero. +pub fn partial_utf8_to_primitive( from: &Utf8Array, to: &DataType, - options: CastOptions, ) -> PrimitiveArray where T: NativeType + lexical_core::FromLexical, { - let parse_fn = if !options.partial { - |x| lexical_core::parse(x).ok() - } else { - |x| lexical_core::parse_partial(x).ok().map(|x| x.0) - }; - - let iter = from - .iter() - .map(|x| x.and_then::(|x| parse_fn(x.as_bytes()))); + let iter = from.iter().map(|x| { + x.and_then::(|x| lexical_core::parse_partial(x.as_bytes()).ok().map(|x| x.0)) + }); PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } @@ -46,7 +51,11 @@ where T: NativeType + lexical_core::FromLexical, { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_primitive::(from, to, options))) + if options.partial { + Ok(Box::new(partial_utf8_to_primitive::(from, to))) + } else { + Ok(Box::new(utf8_to_primitive::(from, to))) + } } /// Casts a [`Utf8Array`] to a Date32 primitive, making any uncastable value a Null. diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index f0a66843572..c1156553ee5 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -1,12 +1,12 @@ use arrow2::array::*; -use arrow2::compute::cast::{can_cast_types, cast, partial_cast, wrapping_cast}; +use arrow2::compute::cast::{can_cast_types, cast, CastOptions}; use arrow2::datatypes::*; use arrow2::types::NativeType; #[test] fn i32_to_f64() { let array = Int32Array::from_slice(&[5, 6, 7, 8, 9]); - let b = cast(&array, &DataType::Float64).unwrap(); + let b = cast(&array, &DataType::Float64, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert!((5.0 - c.value(0)).abs() < f64::EPSILON); assert!((6.0 - c.value(1)).abs() < f64::EPSILON); @@ -18,7 +18,15 @@ fn i32_to_f64() { #[test] fn i32_as_f64_no_overflow() { let array = Int32Array::from_slice(&[5, 6, 7, 8, 9]); - let b = wrapping_cast(&array, &DataType::Float64).unwrap(); + let b = cast( + &array, + &DataType::Float64, + CastOptions { + wrapped: true, + ..Default::default() + }, + ) + .unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert!((5.0 - c.value(0)).abs() < f64::EPSILON); assert!((6.0 - c.value(1)).abs() < f64::EPSILON); @@ -30,7 +38,15 @@ fn i32_as_f64_no_overflow() { #[test] fn u16_as_u8_overflow() { let array = UInt16Array::from_slice(&[255, 256, 257, 258, 259]); - let b = wrapping_cast(&array, &DataType::UInt8).unwrap(); + let b = cast( + &array, + &DataType::UInt8, + CastOptions { + wrapped: true, + ..Default::default() + }, + ) + .unwrap(); let c = b.as_any().downcast_ref::().unwrap(); let values = c.values().as_slice(); @@ -40,7 +56,15 @@ fn u16_as_u8_overflow() { #[test] fn u16_as_u8_no_overflow() { let array = UInt16Array::from_slice(&[1, 2, 3, 4, 5]); - let b = wrapping_cast(&array, &DataType::UInt8).unwrap(); + let b = cast( + &array, + &DataType::UInt8, + CastOptions { + wrapped: true, + ..Default::default() + }, + ) + .unwrap(); let c = b.as_any().downcast_ref::().unwrap(); let values = c.values().as_slice(); assert_eq!(values, &[1, 2, 3, 4, 5]) @@ -49,11 +73,19 @@ fn u16_as_u8_no_overflow() { #[test] fn f32_as_u8_overflow() { let array = Float32Array::from_slice(&[1.1, 5000.0]); - let b = cast(&array, &DataType::UInt8).unwrap(); + let b = cast(&array, &DataType::UInt8, CastOptions::default()).unwrap(); let expected = UInt8Array::from(&[Some(1), None]); assert_eq!(expected, b.as_ref()); - let b = wrapping_cast(&array, &DataType::UInt8).unwrap(); + let b = cast( + &array, + &DataType::UInt8, + CastOptions { + wrapped: true, + ..Default::default() + }, + ) + .unwrap(); let expected = UInt8Array::from(&[Some(1), Some(255)]); assert_eq!(expected, b.as_ref()); } @@ -61,7 +93,7 @@ fn f32_as_u8_overflow() { #[test] fn i32_to_u8() { let array = Int32Array::from_slice(&[-5, 6, -7, 8, 100000000]); - let b = cast(&array, &DataType::UInt8).unwrap(); + let b = cast(&array, &DataType::UInt8, CastOptions::default()).unwrap(); let expected = UInt8Array::from(&[None, Some(6), None, Some(8), None]); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(c, &expected); @@ -71,7 +103,7 @@ fn i32_to_u8() { fn i32_to_u8_sliced() { let array = Int32Array::from_slice(&[-5, 6, -7, 8, 100000000]); let array = array.slice(2, 3); - let b = cast(&array, &DataType::UInt8).unwrap(); + let b = cast(&array, &DataType::UInt8, CastOptions::default()).unwrap(); let expected = UInt8Array::from(&[None, Some(8), None]); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(c, &expected); @@ -80,7 +112,7 @@ fn i32_to_u8_sliced() { #[test] fn i32_to_i32() { let array = Int32Array::from_slice(&[5, 6, 7, 8, 9]); - let b = cast(&array, &DataType::Int32).unwrap(); + let b = cast(&array, &DataType::Int32, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); let expected = &[5, 6, 7, 8, 9]; @@ -94,6 +126,7 @@ fn i32_to_list_i32() { let b = cast( &array, &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + CastOptions::default(), ) .unwrap(); @@ -117,6 +150,7 @@ fn i32_to_list_i32_nullable() { let b = cast( &array, &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + CastOptions::default(), ) .unwrap(); @@ -140,6 +174,7 @@ fn i32_to_list_f64_nullable_sliced() { let b = cast( &array, &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + CastOptions::default(), ) .unwrap(); @@ -156,7 +191,7 @@ fn i32_to_list_f64_nullable_sliced() { #[test] fn i32_to_binary() { let array = Int32Array::from_slice(&[5, 6, 7]); - let b = cast(&array, &DataType::Binary).unwrap(); + let b = cast(&array, &DataType::Binary, CastOptions::default()).unwrap(); let expected = BinaryArray::::from(&[Some(b"5"), Some(b"6"), Some(b"7")]); let c = b.as_any().downcast_ref::>().unwrap(); assert_eq!(c, &expected); @@ -165,7 +200,7 @@ fn i32_to_binary() { #[test] fn binary_to_i32() { let array = BinaryArray::::from_slice(&["5", "6", "seven", "8", "9.1"]); - let b = cast(&array, &DataType::Int32).unwrap(); + let b = cast(&array, &DataType::Int32, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::>().unwrap(); let expected = &[Some(5), Some(6), None, Some(8), None]; @@ -176,7 +211,15 @@ fn binary_to_i32() { #[test] fn binary_to_i32_partial() { let array = BinaryArray::::from_slice(&["5", "6", "123 abseven", "aaa", "9.1"]); - let b = partial_cast(&array, &DataType::Int32).unwrap(); + let b = cast( + &array, + &DataType::Int32, + CastOptions { + partial: true, + ..Default::default() + }, + ) + .unwrap(); let c = b.as_any().downcast_ref::>().unwrap(); let expected = &[Some(5), Some(6), Some(123), Some(0), Some(9)]; @@ -187,7 +230,7 @@ fn binary_to_i32_partial() { #[test] fn utf8_to_i32() { let array = Utf8Array::::from_slice(&["5", "6", "seven", "8", "9.1"]); - let b = cast(&array, &DataType::Int32).unwrap(); + let b = cast(&array, &DataType::Int32, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::>().unwrap(); let expected = &[Some(5), Some(6), None, Some(8), None]; @@ -195,10 +238,29 @@ fn utf8_to_i32() { assert_eq!(c, &expected); } +#[test] +fn utf8_to_i32_partial() { + let array = Utf8Array::::from_slice(&["5", "6", "seven", "8aa", "9.1aa"]); + let b = cast( + &array, + &DataType::Int32, + CastOptions { + partial: true, + ..Default::default() + }, + ) + .unwrap(); + let c = b.as_any().downcast_ref::>().unwrap(); + + let expected = &[Some(5), Some(6), Some(0), Some(8), Some(9)]; + let expected = Int32Array::from(expected); + assert_eq!(c, &expected); +} + #[test] fn bool_to_i32() { let array = BooleanArray::from(vec![Some(true), Some(false), None]); - let b = cast(&array, &DataType::Int32).unwrap(); + let b = cast(&array, &DataType::Int32, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); let expected = &[Some(1), Some(0), None]; @@ -209,7 +271,7 @@ fn bool_to_i32() { #[test] fn bool_to_f64() { let array = BooleanArray::from(vec![Some(true), Some(false), None]); - let b = cast(&array, &DataType::Float64).unwrap(); + let b = cast(&array, &DataType::Float64, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); let expected = &[Some(1.0), Some(0.0), None]; @@ -220,7 +282,7 @@ fn bool_to_f64() { #[test] fn bool_to_utf8() { let array = BooleanArray::from(vec![Some(true), Some(false), None]); - let b = cast(&array, &DataType::Utf8).unwrap(); + let b = cast(&array, &DataType::Utf8, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::>().unwrap(); let expected = Utf8Array::::from(&[Some("1"), Some("0"), Some("0")]); @@ -230,7 +292,7 @@ fn bool_to_utf8() { #[test] fn bool_to_binary() { let array = BooleanArray::from(vec![Some(true), Some(false), None]); - let b = cast(&array, &DataType::Binary).unwrap(); + let b = cast(&array, &DataType::Binary, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::>().unwrap(); let expected = BinaryArray::::from(&[Some("1"), Some("0"), Some("0")]); @@ -240,7 +302,12 @@ fn bool_to_binary() { #[test] fn int32_to_timestamp() { let array = Int32Array::from(&[Some(2), Some(10), None]); - assert!(cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).is_err()); + assert!(cast( + &array, + &DataType::Timestamp(TimeUnit::Microsecond, None), + CastOptions::default() + ) + .is_err()); } #[test] @@ -285,13 +352,13 @@ fn consistency() { for d2 in &datatypes { let array = new_null_array(d1.clone(), 10); if can_cast_types(d1, d2) { - let result = cast(array.as_ref(), d2); + let result = cast(array.as_ref(), d2, CastOptions::default()); if let Ok(result) = result { assert_eq!(result.data_type(), d2, "type not equal: {:?} {:?}", d1, d2); } else { panic!("Cast should have not failed {:?} {:?}", d1, d2); } - } else if cast(array.as_ref(), d2).is_ok() { + } else if cast(array.as_ref(), d2, CastOptions::default()).is_ok() { panic!("Cast should have failed {:?} {:?}", d1, d2); } } @@ -305,7 +372,7 @@ fn test_primitive_to_primitive( expected_type: DataType, ) { let a = PrimitiveArray::::from_slice(lhs).to(lhs_type); - let b = cast(&a, &expected_type).unwrap(); + let b = cast(&a, &expected_type, CastOptions::default()).unwrap(); let b = b.as_any().downcast_ref::>().unwrap(); let expected = PrimitiveArray::::from_slice(expected).to(expected_type); assert_eq!(b, &expected); @@ -417,7 +484,7 @@ fn utf8_to_dict() { // Cast to a dictionary (same value type, Utf8) let cast_type = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)); - let result = cast(&array, &cast_type).expect("cast failed"); + let result = cast(&array, &cast_type, CastOptions::default()).expect("cast failed"); let mut expected = MutableDictionaryArray::>::new(); expected @@ -435,7 +502,7 @@ fn dict_to_utf8() { .unwrap(); let array: DictionaryArray = array.into(); - let result = cast(&array, &DataType::Utf8).expect("cast failed"); + let result = cast(&array, &DataType::Utf8, CastOptions::default()).expect("cast failed"); let expected = Utf8Array::::from(&[Some("one"), None, Some("three"), Some("one")]); @@ -448,7 +515,7 @@ fn i32_to_dict() { // Cast to a dictionary (same value type, Utf8) let cast_type = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Int32)); - let result = cast(&array, &cast_type).expect("cast failed"); + let result = cast(&array, &cast_type, CastOptions::default()).expect("cast failed"); let mut expected = MutableDictionaryArray::>::new(); expected @@ -478,7 +545,7 @@ fn list_to_list() { expected.try_extend(expected_data).unwrap(); let expected: ListArray = expected.into(); - let result = cast(&array, expected.data_type()).unwrap(); + let result = cast(&array, expected.data_type(), CastOptions::default()).unwrap(); assert_eq!(expected, result.as_ref()); } @@ -490,7 +557,7 @@ fn timestamp_with_tz_to_utf8() { let array = Int64Array::from_slice(&[851020797000000000, 851024397000000000]) .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))); - let result = cast(&array, expected.data_type()).expect("cast failed"); + let result = cast(&array, expected.data_type(), CastOptions::default()).expect("cast failed"); assert_eq!(expected, result.as_ref()); } @@ -503,7 +570,7 @@ fn utf8_to_timestamp_with_tz() { let expected = Int64Array::from_slice(&[851020797000000000, 851024397000000000]) .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))); - let result = cast(&array, expected.data_type()).expect("cast failed"); + let result = cast(&array, expected.data_type(), CastOptions::default()).expect("cast failed"); assert_eq!(expected, result.as_ref()); } @@ -515,7 +582,7 @@ fn utf8_to_naive_timestamp() { let expected = Int64Array::from_slice(&[851013597000000000, 851017197000000000]) .to(DataType::Timestamp(TimeUnit::Nanosecond, None)); - let result = cast(&array, expected.data_type()).expect("cast failed"); + let result = cast(&array, expected.data_type(), CastOptions::default()).expect("cast failed"); assert_eq!(expected, result.as_ref()); } @@ -526,7 +593,7 @@ fn naive_timestamp_to_utf8() { let expected = Utf8Array::::from_slice(&["1996-12-19 16:39:57", "1996-12-19 17:39:57"]); - let result = cast(&array, expected.data_type()).expect("cast failed"); + let result = cast(&array, expected.data_type(), CastOptions::default()).expect("cast failed"); assert_eq!(expected, result.as_ref()); } @@ -551,7 +618,7 @@ fn dict_to_dict_bad_index_value_primitive() { let array: ArrayRef = Arc::new(builder.finish()); let cast_type = Dictionary(Box::new(Int8), Box::new(Utf8)); - let res = cast_with_options(&array, &cast_type); + let res = cast(&array, &cast_type, CastOptions::default()); assert, CastOptions::default())!(res.is_err()); let actual_error = format!("{:?}", res); let expected_error = "Could not convert 72 dictionary indexes from Int32 to Int8"; @@ -583,7 +650,7 @@ fn dict_to_dict_bad_index_value_utf8() { let array: ArrayRef = Arc::new(builder.finish()); let cast_type = Dictionary(Box::new(Int8), Box::new(Utf8)); - let res = cast_with_options(&array, &cast_type); + let res = cast(&array, &cast_type, CastOptions::default()); assert, CastOptions::default())!(res.is_err()); let actual_error = format!("{:?}", res); let expected_error = "Could not convert 72 dictionary indexes from Int32 to Int8"; @@ -609,7 +676,7 @@ fn utf8_to_date32() { "2000", // just a year is invalid ]); let array = Arc::new(a) as ArrayRef; - let b = cast_with_options(&array, &DataType::Date32, CastOptions::default()).unwrap(); + let b = cast(&array, &DataType::Date32, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); // test valid inputs @@ -640,7 +707,7 @@ fn utf8_to_date64() { "2000-01-01", // just a date is invalid ]); let array = Arc::new(a) as ArrayRef; - let b = cast_with_options(&array, &DataType::Date64, CastOptions::default()).unwrap(); + let b = cast(&array, &DataType::Date64, CastOptions::default()).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); // test valid inputs From 5d38f2239f4b36f3735818bf58755fdc12c5f677 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 3 Nov 2021 11:29:18 +0800 Subject: [PATCH 5/5] Apply comments --- src/compute/cast/binary_to.rs | 2 +- src/compute/cast/utf8_to.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index 4183fbfe593..538825f8965 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -32,7 +32,7 @@ pub fn binary_large_to_binary( )) } -/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], as best-effort using `lexical_core::parse_partial`, making any uncastable value as zero. +/// Casts a [`BinaryArray`] to a [`PrimitiveArray`] at best-effort using `lexical_core::parse_partial`, making any uncastable value as zero. pub fn partial_binary_to_primitive( from: &BinaryArray, to: &DataType, diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 0bc421262be..20711261fb2 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -27,7 +27,7 @@ where PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } -/// Casts a [`Utf8Array`] to a [`PrimitiveArray`] as best-effort using `lexical_core::parse_partial`, making any uncastable value as zero. +/// Casts a [`Utf8Array`] to a [`PrimitiveArray`] at best-effort using `lexical_core::parse_partial`, making any uncastable value as zero. pub fn partial_utf8_to_primitive( from: &Utf8Array, to: &DataType,