From c9815c3aa6b8177aa83c4f82428c12d2bb6ac81a Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Sat, 27 Nov 2021 20:07:01 +0800 Subject: [PATCH 1/7] compute: Add lower support Signed-off-by: Xuanwo --- Cargo.toml | 4 +- src/compute/lower.rs | 79 ++++++++++++++ src/compute/mod.rs | 3 + tests/it/compute/lower.rs | 213 ++++++++++++++++++++++++++++++++++++++ tests/it/compute/mod.rs | 2 + 5 files changed, 300 insertions(+), 1 deletion(-) create mode 100644 src/compute/lower.rs create mode 100644 tests/it/compute/lower.rs diff --git a/Cargo.toml b/Cargo.toml index f38a7ca5c4d..19a8b8ac97f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -171,6 +171,7 @@ compute_substring = [] compute_take = [] compute_temporal = [] compute_window = ["compute_concatenate"] +compute_lower = [] compute = [ "compute_aggregate", "compute_arithmetics", @@ -196,6 +197,7 @@ compute = [ "compute_take", "compute_temporal", "compute_window", + "compute_lower", ] # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format. io_parquet = ["parquet2", "io_ipc", "base64", "futures"] @@ -298,4 +300,4 @@ harness = false [[bench]] name = "bitwise" -harness = false \ No newline at end of file +harness = false diff --git a/src/compute/lower.rs b/src/compute/lower.rs new file mode 100644 index 00000000000..e635248bc0c --- /dev/null +++ b/src/compute/lower.rs @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines kernel to extract a lower case of a \[Large\]StringArray + +use crate::array::*; +use crate::{ + datatypes::DataType, + error::{ArrowError, Result}, +}; + +fn utf8_lower(array: &Utf8Array) -> Utf8Array { + let iter = array.values_iter().map(str::to_lowercase); + + let new = Utf8Array::::from_trusted_len_values_iter(iter); + new.with_validity(array.validity().cloned()) +} + +/// Returns an ArrayRef with lowercase of each of the elements in `array`. +/// this function errors when the passed array is not a \[Large\]String array. +pub fn lower(array: &dyn Array) -> Result> { + match array.data_type() { + // For binary and large binary, lower is no-op. + DataType::Binary | DataType::LargeBinary => unsafe { + // Safety: we will use the whole slice directly, so we don't need to check it. + Ok(array.slice_unchecked(0, array.len())) + }, + DataType::LargeUtf8 => Ok(Box::new(utf8_lower( + array + .as_any() + .downcast_ref::>() + .expect("A large string is expected"), + ))), + DataType::Utf8 => Ok(Box::new(utf8_lower( + array + .as_any() + .downcast_ref::>() + .expect("A large string is expected"), + ))), + _ => Err(ArrowError::InvalidArgumentError(format!( + "lower does not support type {:?}", + array.data_type() + ))), + } +} + +/// Checks if an array of type `datatype` can perform lower operation +/// +/// # Examples +/// ``` +/// use arrow2::compute::lower::can_lower; +/// use arrow2::datatypes::{DataType}; +/// +/// let data_type = DataType::Utf8; +/// assert_eq!(can_lower(&data_type), true); +/// +/// let data_type = DataType::Null; +/// assert_eq!(can_lower(&data_type), false); +/// ``` +pub fn can_lower(data_type: &DataType) -> bool { + matches!( + data_type, + DataType::LargeUtf8 | DataType::Utf8 | DataType::LargeBinary | DataType::Binary + ) +} diff --git a/src/compute/mod.rs b/src/compute/mod.rs index b44433b9ef7..9ee59af794f 100644 --- a/src/compute/mod.rs +++ b/src/compute/mod.rs @@ -86,3 +86,6 @@ mod utils; #[cfg(feature = "compute_window")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_window")))] pub mod window; +#[cfg(feature = "compute_lower")] +#[cfg_attr(docsrs, doc(cfg(feature = "compute_lower")))] +pub mod lower; diff --git a/tests/it/compute/lower.rs b/tests/it/compute/lower.rs new file mode 100644 index 00000000000..5a3de12f142 --- /dev/null +++ b/tests/it/compute/lower.rs @@ -0,0 +1,213 @@ +use arrow2::{array::*, compute::lower::*, error::Result}; + +fn with_nulls_utf8() -> Result<()> { + let cases = vec![ + // identity + ( + vec![Some("hello"), None, Some("world")], + vec![Some("hello"), None, Some("world")], + ), + // part of input + ( + vec![Some("Hello"), None, Some("wOrld")], + vec![Some("hello"), None, Some("world")], + ), + // all input + ( + vec![Some("HELLO"), None, Some("WORLD")], + vec![Some("hello"), None, Some("world")], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = Utf8Array::::from(&array); + let result = lower(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = Utf8Array::::from(&expected); + + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn with_nulls_string() -> Result<()> { + with_nulls_utf8::() +} + +#[test] +fn with_nulls_large_string() -> Result<()> { + with_nulls_utf8::() +} + +fn without_nulls_utf8() -> Result<()> { + let cases = vec![ + // identity + (vec!["hello", "world"], vec!["hello", "world"]), + // part of input + (vec!["Hello", "wOrld"], vec!["hello", "world"]), + // all input + (vec!["HELLO", "WORLD"], vec!["hello", "world"]), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = Utf8Array::::from_slice(&array); + let result = lower(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = Utf8Array::::from_slice(&expected); + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn without_nulls_string() -> Result<()> { + without_nulls_utf8::() +} + +#[test] +fn without_nulls_large_string() -> Result<()> { + without_nulls_utf8::() +} + +fn with_null_binarys() -> Result<()> { + let cases = vec![ + // identity + ( + vec![Some(b"hello"), None, Some(b"world")], + vec![Some(b"hello"), None, Some(b"world")], + ), + // part of input + ( + vec![Some(b"Hello"), None, Some(b"wOrld")], + vec![Some(b"Hello"), None, Some(b"wOrld")], + ), + // all input + ( + vec![Some(b"HELLO"), None, Some(b"WORLD")], + vec![Some(b"HELLO"), None, Some(b"WORLD")], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = BinaryArray::::from(&array); + let result = lower(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = BinaryArray::::from(&expected); + + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn with_nulls_binary() -> Result<()> { + with_null_binarys::() +} + +#[test] +fn with_nulls_large_binary() -> Result<()> { + with_null_binarys::() +} + +fn without_null_binarys() -> Result<()> { + let cases = vec![ + // identity + (vec![b"hello", b"world"], vec![b"hello", b"world"]), + // part of input + (vec![b"Hello", b"wOrld"], vec![b"Hello", b"wOrld"]), + // all input + (vec![b"HELLO", b"WORLD"], vec![b"HELLO", b"WORLD"]), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = BinaryArray::::from_slice(&array); + let result = lower(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = BinaryArray::::from_slice(&expected); + + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn without_nulls_binary() -> Result<()> { + without_null_binarys::() +} + +#[test] +fn without_nulls_large_binary() -> Result<()> { + without_null_binarys::() +} + +#[test] +fn consistency() { + use arrow2::datatypes::DataType::*; + use arrow2::datatypes::TimeUnit; + let datatypes = vec![ + Null, + Boolean, + UInt8, + UInt16, + UInt32, + UInt64, + Int8, + Int16, + Int32, + Int64, + Float32, + Float64, + Timestamp(TimeUnit::Second, None), + Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Microsecond, None), + Timestamp(TimeUnit::Nanosecond, None), + Time64(TimeUnit::Microsecond), + Time64(TimeUnit::Nanosecond), + Date32, + Time32(TimeUnit::Second), + Time32(TimeUnit::Millisecond), + Date64, + Utf8, + LargeUtf8, + Binary, + LargeBinary, + Duration(TimeUnit::Second), + Duration(TimeUnit::Millisecond), + Duration(TimeUnit::Microsecond), + Duration(TimeUnit::Nanosecond), + ]; + + datatypes.into_iter().for_each(|d1| { + let array = new_null_array(d1.clone(), 10); + if can_lower(&d1) { + assert!(lower(array.as_ref()).is_ok()); + } else { + assert!(lower(array.as_ref()).is_err()); + } + }); +} diff --git a/tests/it/compute/mod.rs b/tests/it/compute/mod.rs index ff55ab74d7b..2a570a06c9a 100644 --- a/tests/it/compute/mod.rs +++ b/tests/it/compute/mod.rs @@ -44,3 +44,5 @@ mod take; mod temporal; #[cfg(feature = "compute_window")] mod window; +#[cfg(feature = "compute_lower")] +mod lower; From 5be7fa579e84a58e72aef72de259fb2b6bd81f5b Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Sat, 27 Nov 2021 20:21:06 +0800 Subject: [PATCH 2/7] Format code Signed-off-by: Xuanwo --- src/compute/mod.rs | 6 +++--- tests/it/compute/mod.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/compute/mod.rs b/src/compute/mod.rs index 9ee59af794f..5608e3ebfe5 100644 --- a/src/compute/mod.rs +++ b/src/compute/mod.rs @@ -58,6 +58,9 @@ pub mod like; #[cfg(feature = "compute_limit")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_limit")))] pub mod limit; +#[cfg(feature = "compute_lower")] +#[cfg_attr(docsrs, doc(cfg(feature = "compute_lower")))] +pub mod lower; #[cfg(feature = "compute_merge_sort")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_merge_sort")))] pub mod merge_sort; @@ -86,6 +89,3 @@ mod utils; #[cfg(feature = "compute_window")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_window")))] pub mod window; -#[cfg(feature = "compute_lower")] -#[cfg_attr(docsrs, doc(cfg(feature = "compute_lower")))] -pub mod lower; diff --git a/tests/it/compute/mod.rs b/tests/it/compute/mod.rs index 2a570a06c9a..d4bd0b008eb 100644 --- a/tests/it/compute/mod.rs +++ b/tests/it/compute/mod.rs @@ -28,6 +28,8 @@ mod length; mod like; #[cfg(feature = "compute_limit")] mod limit; +#[cfg(feature = "compute_lower")] +mod lower; #[cfg(feature = "compute_merge_sort")] mod merge_sort; #[cfg(feature = "compute_partition")] @@ -44,5 +46,3 @@ mod take; mod temporal; #[cfg(feature = "compute_window")] mod window; -#[cfg(feature = "compute_lower")] -mod lower; From 458f6aad09db6a4b7939cfd7720f6fcd7c01c679 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Sun, 28 Nov 2021 14:47:00 +0800 Subject: [PATCH 3/7] Update src/compute/lower.rs Co-authored-by: Jorge Leitao --- src/compute/lower.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compute/lower.rs b/src/compute/lower.rs index e635248bc0c..015a723bd2e 100644 --- a/src/compute/lower.rs +++ b/src/compute/lower.rs @@ -30,7 +30,7 @@ fn utf8_lower(array: &Utf8Array) -> Utf8Array { new.with_validity(array.validity().cloned()) } -/// Returns an ArrayRef with lowercase of each of the elements in `array`. +/// Returns a new `Array` where each of each of the elements is lower-cased. /// this function errors when the passed array is not a \[Large\]String array. pub fn lower(array: &dyn Array) -> Result> { match array.data_type() { From 6eba31a0b2a37225d5bd46bd6ebdf858995f7a40 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 29 Nov 2021 13:32:42 +0800 Subject: [PATCH 4/7] Update src/compute/lower.rs --- src/compute/lower.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compute/lower.rs b/src/compute/lower.rs index 015a723bd2e..4b882099f6d 100644 --- a/src/compute/lower.rs +++ b/src/compute/lower.rs @@ -49,7 +49,7 @@ pub fn lower(array: &dyn Array) -> Result> { array .as_any() .downcast_ref::>() - .expect("A large string is expected"), + .expect("A string is expected"), ))), _ => Err(ArrowError::InvalidArgumentError(format!( "lower does not support type {:?}", From 82906c28351eab603e61e6ccb093b8eaef1afd98 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 6 Dec 2021 14:18:13 +0800 Subject: [PATCH 5/7] Remove binary support Signed-off-by: Xuanwo --- src/compute/lower.rs | 10 +---- tests/it/compute/lower.rs | 83 --------------------------------------- 2 files changed, 1 insertion(+), 92 deletions(-) diff --git a/src/compute/lower.rs b/src/compute/lower.rs index 4b882099f6d..cd903bba992 100644 --- a/src/compute/lower.rs +++ b/src/compute/lower.rs @@ -34,11 +34,6 @@ fn utf8_lower(array: &Utf8Array) -> Utf8Array { /// this function errors when the passed array is not a \[Large\]String array. pub fn lower(array: &dyn Array) -> Result> { match array.data_type() { - // For binary and large binary, lower is no-op. - DataType::Binary | DataType::LargeBinary => unsafe { - // Safety: we will use the whole slice directly, so we don't need to check it. - Ok(array.slice_unchecked(0, array.len())) - }, DataType::LargeUtf8 => Ok(Box::new(utf8_lower( array .as_any() @@ -72,8 +67,5 @@ pub fn lower(array: &dyn Array) -> Result> { /// assert_eq!(can_lower(&data_type), false); /// ``` pub fn can_lower(data_type: &DataType) -> bool { - matches!( - data_type, - DataType::LargeUtf8 | DataType::Utf8 | DataType::LargeBinary | DataType::Binary - ) + matches!(data_type, DataType::LargeUtf8 | DataType::Utf8) } diff --git a/tests/it/compute/lower.rs b/tests/it/compute/lower.rs index 5a3de12f142..8ca4ad8f5fb 100644 --- a/tests/it/compute/lower.rs +++ b/tests/it/compute/lower.rs @@ -82,89 +82,6 @@ fn without_nulls_large_string() -> Result<()> { without_nulls_utf8::() } -fn with_null_binarys() -> Result<()> { - let cases = vec![ - // identity - ( - vec![Some(b"hello"), None, Some(b"world")], - vec![Some(b"hello"), None, Some(b"world")], - ), - // part of input - ( - vec![Some(b"Hello"), None, Some(b"wOrld")], - vec![Some(b"Hello"), None, Some(b"wOrld")], - ), - // all input - ( - vec![Some(b"HELLO"), None, Some(b"WORLD")], - vec![Some(b"HELLO"), None, Some(b"WORLD")], - ), - ]; - - cases - .into_iter() - .try_for_each::<_, Result<()>>(|(array, expected)| { - let array = BinaryArray::::from(&array); - let result = lower(&array)?; - assert_eq!(array.len(), result.len()); - - let result = result.as_any().downcast_ref::>().unwrap(); - let expected = BinaryArray::::from(&expected); - - assert_eq!(&expected, result); - Ok(()) - })?; - - Ok(()) -} - -#[test] -fn with_nulls_binary() -> Result<()> { - with_null_binarys::() -} - -#[test] -fn with_nulls_large_binary() -> Result<()> { - with_null_binarys::() -} - -fn without_null_binarys() -> Result<()> { - let cases = vec![ - // identity - (vec![b"hello", b"world"], vec![b"hello", b"world"]), - // part of input - (vec![b"Hello", b"wOrld"], vec![b"Hello", b"wOrld"]), - // all input - (vec![b"HELLO", b"WORLD"], vec![b"HELLO", b"WORLD"]), - ]; - - cases - .into_iter() - .try_for_each::<_, Result<()>>(|(array, expected)| { - let array = BinaryArray::::from_slice(&array); - let result = lower(&array)?; - assert_eq!(array.len(), result.len()); - - let result = result.as_any().downcast_ref::>().unwrap(); - let expected = BinaryArray::::from_slice(&expected); - - assert_eq!(&expected, result); - Ok(()) - })?; - - Ok(()) -} - -#[test] -fn without_nulls_binary() -> Result<()> { - without_null_binarys::() -} - -#[test] -fn without_nulls_large_binary() -> Result<()> { - without_null_binarys::() -} - #[test] fn consistency() { use arrow2::datatypes::DataType::*; From f6347b593d4ab77f4a018f5d2376bf1ac1e4b43c Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 6 Dec 2021 14:29:54 +0800 Subject: [PATCH 6/7] Add real UTF-8 lower test Signed-off-by: Xuanwo --- tests/it/compute/lower.rs | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/it/compute/lower.rs b/tests/it/compute/lower.rs index 8ca4ad8f5fb..d8f594174fb 100644 --- a/tests/it/compute/lower.rs +++ b/tests/it/compute/lower.rs @@ -17,6 +17,35 @@ fn with_nulls_utf8() -> Result<()> { vec![Some("HELLO"), None, Some("WORLD")], vec![Some("hello"), None, Some("world")], ), + // UTF8 characters + ( + vec![ + None, + Some("السلام عليكم"), + Some("Dobrý den"), + Some("שָׁלוֹם"), + Some("नमस्ते"), + Some("こんにちは"), + Some("안녕하세요"), + Some("你好"), + Some("Olá"), + Some("Здравствуйте"), + Some("Hola"), + ], + vec![ + None, + Some("السلام عليكم"), + Some("dobrý den"), + Some("שָׁלוֹם"), + Some("नमस्ते"), + Some("こんにちは"), + Some("안녕하세요"), + Some("你好"), + Some("olá"), + Some("здравствуйте"), + Some("hola"), + ], + ), ]; cases @@ -54,6 +83,33 @@ fn without_nulls_utf8() -> Result<()> { (vec!["Hello", "wOrld"], vec!["hello", "world"]), // all input (vec!["HELLO", "WORLD"], vec!["hello", "world"]), + // UTF8 characters + ( + vec![ + "السلام عليكم", + "Dobrý den", + "שָׁלוֹם", + "नमस्ते", + "こんにちは", + "안녕하세요", + "你好", + "Olá", + "Здравствуйте", + "Hola", + ], + vec![ + "السلام عليكم", + "dobrý den", + "שָׁלוֹם", + "नमस्ते", + "こんにちは", + "안녕하세요", + "你好", + "olá", + "здравствуйте", + "hola", + ], + ), ]; cases From fa9693b27b7e988b659fbc17995a67ad433dadcd Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 6 Dec 2021 14:43:50 +0800 Subject: [PATCH 7/7] Implement generic over the function applied to the utf8 array Signed-off-by: Xuanwo --- src/compute/lower.rs | 14 +++++--------- src/compute/utils.rs | 8 ++++++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/compute/lower.rs b/src/compute/lower.rs index cd903bba992..5a9978179b4 100644 --- a/src/compute/lower.rs +++ b/src/compute/lower.rs @@ -17,30 +17,26 @@ //! Defines kernel to extract a lower case of a \[Large\]StringArray +use super::utils::utf8_apply; use crate::array::*; use crate::{ datatypes::DataType, error::{ArrowError, Result}, }; -fn utf8_lower(array: &Utf8Array) -> Utf8Array { - let iter = array.values_iter().map(str::to_lowercase); - - let new = Utf8Array::::from_trusted_len_values_iter(iter); - new.with_validity(array.validity().cloned()) -} - /// Returns a new `Array` where each of each of the elements is lower-cased. /// this function errors when the passed array is not a \[Large\]String array. pub fn lower(array: &dyn Array) -> Result> { match array.data_type() { - DataType::LargeUtf8 => Ok(Box::new(utf8_lower( + DataType::LargeUtf8 => Ok(Box::new(utf8_apply( + str::to_lowercase, array .as_any() .downcast_ref::>() .expect("A large string is expected"), ))), - DataType::Utf8 => Ok(Box::new(utf8_lower( + DataType::Utf8 => Ok(Box::new(utf8_apply( + str::to_lowercase, array .as_any() .downcast_ref::>() diff --git a/src/compute/utils.rs b/src/compute/utils.rs index 864eb27d40d..69ed5b7a5a8 100644 --- a/src/compute/utils.rs +++ b/src/compute/utils.rs @@ -30,6 +30,14 @@ pub fn unary_utf8_boolean bool>( BooleanArray::from_data(DataType::Boolean, values, validity) } +/// utf8_apply will apply `Fn(&str) -> String` to every value in Utf8Array. +pub fn utf8_apply String>(f: F, array: &Utf8Array) -> Utf8Array { + let iter = array.values_iter().map(f); + + let new = Utf8Array::::from_trusted_len_values_iter(iter); + new.with_validity(array.validity().cloned()) +} + // Errors iff the two arrays have a different length. #[inline] pub fn check_same_len(lhs: &dyn Array, rhs: &dyn Array) -> Result<()> {