This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added lower #641

Merged
merged 7 commits into from
Dec 6, 2021
Changes from 2 commits
4 changes: 3 additions & 1 deletion Cargo.toml
@@ -171,6 +171,7 @@ compute_substring = []
compute_take = []
compute_temporal = []
compute_window = ["compute_concatenate"]
compute_lower = []
compute = [
"compute_aggregate",
"compute_arithmetics",
@@ -196,6 +197,7 @@ compute = [
"compute_take",
"compute_temporal",
"compute_window",
"compute_lower",
]
# base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
@@ -298,4 +300,4 @@ harness = false

[[bench]]
name = "bitwise"
harness = false
harness = false
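
The new `compute_lower` flag follows the crate's one-feature-per-kernel convention and is also pulled in by the umbrella `compute` feature. Below is a minimal sketch of how a downstream crate might gate its own code on the kernel; the `lowercase` feature name and the helper function are illustrative assumptions, not part of this PR:

```rust
// Hypothetical downstream helper (names are assumptions, not from the PR),
// assuming the consumer defines a `lowercase` feature that in turn enables
// arrow2's `compute_lower` feature.
#[cfg(feature = "lowercase")]
pub fn lowercase_column(
    array: &dyn arrow2::array::Array,
) -> arrow2::error::Result<Box<dyn arrow2::array::Array>> {
    // Delegate to the kernel added in this PR.
    arrow2::compute::lower::lower(array)
}
```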
79 changes: 79 additions & 0 deletions src/compute/lower.rs
@@ -0,0 +1,79 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines a kernel to lowercase the elements of a \[Large\]StringArray

use crate::array::*;
use crate::{
datatypes::DataType,
error::{ArrowError, Result},
};

fn utf8_lower<O: Offset>(array: &Utf8Array<O>) -> Utf8Array<O> {
let iter = array.values_iter().map(str::to_lowercase);

let new = Utf8Array::<O>::from_trusted_len_values_iter(iter);
new.with_validity(array.validity().cloned())
}

/// Returns a `Box<dyn Array>` with each of the elements in `array` lowercased.
/// This function errors when the passed array is not a \[Large\]String or \[Large\]Binary array.
pub fn lower(array: &dyn Array) -> Result<Box<dyn Array>> {
match array.data_type() {
// For binary and large binary, lower is a no-op.
DataType::Binary | DataType::LargeBinary => unsafe {
// Safety: we will use the whole slice directly, so we don't need to check it.
Ok(array.slice_unchecked(0, array.len()))
},
DataType::LargeUtf8 => Ok(Box::new(utf8_lower(
array
.as_any()
.downcast_ref::<Utf8Array<i64>>()
.expect("A large string is expected"),
))),
DataType::Utf8 => Ok(Box::new(utf8_lower(
array
.as_any()
.downcast_ref::<Utf8Array<i32>>()
.expect("A large string is expected"),
Xuanwo marked this conversation as resolved.
Show resolved Hide resolved
))),
_ => Err(ArrowError::InvalidArgumentError(format!(
"lower does not support type {:?}",
array.data_type()
))),
}
}

/// Checks if an array of type `datatype` can perform lower operation
///
/// # Examples
/// ```
/// use arrow2::compute::lower::can_lower;
/// use arrow2::datatypes::{DataType};
///
/// let data_type = DataType::Utf8;
/// assert_eq!(can_lower(&data_type), true);
///
/// let data_type = DataType::Null;
/// assert_eq!(can_lower(&data_type), false);
/// ```
pub fn can_lower(data_type: &DataType) -> bool {
matches!(
data_type,
DataType::LargeUtf8 | DataType::Utf8 | DataType::LargeBinary | DataType::Binary
)
}
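
As a rough usage sketch (assuming arrow2 is built with the `compute_lower` feature), `can_lower` and `lower` combine as below; the calls mirror the ones exercised by the tests added in this PR:

```rust
use arrow2::array::{Array, Utf8Array};
use arrow2::compute::lower::{can_lower, lower};
use arrow2::datatypes::DataType;
use arrow2::error::Result;

fn main() -> Result<()> {
    // Check support first, then run the kernel; validity (nulls) is preserved.
    assert!(can_lower(&DataType::Utf8));

    let array = Utf8Array::<i32>::from(&vec![Some("Hello"), None, Some("WORLD")]);
    let lowered = lower(&array)?;
    let lowered = lowered
        .as_any()
        .downcast_ref::<Utf8Array<i32>>()
        .expect("lower on Utf8 yields Utf8");

    assert_eq!(
        lowered,
        &Utf8Array::<i32>::from(&vec![Some("hello"), None, Some("world")])
    );
    Ok(())
}
```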
3 changes: 3 additions & 0 deletions src/compute/mod.rs
@@ -58,6 +58,9 @@ pub mod like;
#[cfg(feature = "compute_limit")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_limit")))]
pub mod limit;
#[cfg(feature = "compute_lower")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_lower")))]
pub mod lower;
#[cfg(feature = "compute_merge_sort")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_merge_sort")))]
pub mod merge_sort;
213 changes: 213 additions & 0 deletions tests/it/compute/lower.rs
@@ -0,0 +1,213 @@
use arrow2::{array::*, compute::lower::*, error::Result};

fn with_nulls_utf8<O: Offset>() -> Result<()> {
let cases = vec![
// identity
(
vec![Some("hello"), None, Some("world")],
vec![Some("hello"), None, Some("world")],
),
// part of input
(
vec![Some("Hello"), None, Some("wOrld")],
vec![Some("hello"), None, Some("world")],
),
// all input
(
vec![Some("HELLO"), None, Some("WORLD")],
vec![Some("hello"), None, Some("world")],
),
];

cases
.into_iter()
.try_for_each::<_, Result<()>>(|(array, expected)| {
let array = Utf8Array::<O>::from(&array);
let result = lower(&array)?;
assert_eq!(array.len(), result.len());

let result = result.as_any().downcast_ref::<Utf8Array<O>>().unwrap();
let expected = Utf8Array::<O>::from(&expected);

assert_eq!(&expected, result);
Ok(())
})?;

Ok(())
}

#[test]
fn with_nulls_string() -> Result<()> {
with_nulls_utf8::<i32>()
}

#[test]
fn with_nulls_large_string() -> Result<()> {
with_nulls_utf8::<i64>()
}

fn without_nulls_utf8<O: Offset>() -> Result<()> {
let cases = vec![
// identity
(vec!["hello", "world"], vec!["hello", "world"]),
// part of input
(vec!["Hello", "wOrld"], vec!["hello", "world"]),
// all input
(vec!["HELLO", "WORLD"], vec!["hello", "world"]),
];

cases
.into_iter()
.try_for_each::<_, Result<()>>(|(array, expected)| {
let array = Utf8Array::<O>::from_slice(&array);
let result = lower(&array)?;
assert_eq!(array.len(), result.len());

let result = result.as_any().downcast_ref::<Utf8Array<O>>().unwrap();
let expected = Utf8Array::<O>::from_slice(&expected);
assert_eq!(&expected, result);
Ok(())
})?;

Ok(())
}

#[test]
fn without_nulls_string() -> Result<()> {
without_nulls_utf8::<i32>()
}

#[test]
fn without_nulls_large_string() -> Result<()> {
without_nulls_utf8::<i64>()
}

fn with_null_binarys<O: Offset>() -> Result<()> {
let cases = vec![
// identity
(
vec![Some(b"hello"), None, Some(b"world")],
vec![Some(b"hello"), None, Some(b"world")],
),
// part of input
(
vec![Some(b"Hello"), None, Some(b"wOrld")],
vec![Some(b"Hello"), None, Some(b"wOrld")],
),
// all input
(
vec![Some(b"HELLO"), None, Some(b"WORLD")],
vec![Some(b"HELLO"), None, Some(b"WORLD")],
),
];

cases
.into_iter()
.try_for_each::<_, Result<()>>(|(array, expected)| {
let array = BinaryArray::<O>::from(&array);
let result = lower(&array)?;
assert_eq!(array.len(), result.len());

let result = result.as_any().downcast_ref::<BinaryArray<O>>().unwrap();
let expected = BinaryArray::<O>::from(&expected);

assert_eq!(&expected, result);
Ok(())
})?;

Ok(())
}

#[test]
fn with_nulls_binary() -> Result<()> {
with_null_binarys::<i32>()
}

#[test]
fn with_nulls_large_binary() -> Result<()> {
with_null_binarys::<i64>()
}

fn without_null_binarys<O: Offset>() -> Result<()> {
let cases = vec![
// identity
(vec![b"hello", b"world"], vec![b"hello", b"world"]),
// part of input
(vec![b"Hello", b"wOrld"], vec![b"Hello", b"wOrld"]),
// all input
(vec![b"HELLO", b"WORLD"], vec![b"HELLO", b"WORLD"]),
];

cases
.into_iter()
.try_for_each::<_, Result<()>>(|(array, expected)| {
let array = BinaryArray::<O>::from_slice(&array);
let result = lower(&array)?;
assert_eq!(array.len(), result.len());

let result = result.as_any().downcast_ref::<BinaryArray<O>>().unwrap();
let expected = BinaryArray::<O>::from_slice(&expected);

assert_eq!(&expected, result);
Ok(())
})?;

Ok(())
}

#[test]
fn without_nulls_binary() -> Result<()> {
without_null_binarys::<i32>()
}

#[test]
fn without_nulls_large_binary() -> Result<()> {
without_null_binarys::<i64>()
}

#[test]
fn consistency() {
use arrow2::datatypes::DataType::*;
use arrow2::datatypes::TimeUnit;
let datatypes = vec![
Null,
Boolean,
UInt8,
UInt16,
UInt32,
UInt64,
Int8,
Int16,
Int32,
Int64,
Float32,
Float64,
Timestamp(TimeUnit::Second, None),
Timestamp(TimeUnit::Millisecond, None),
Timestamp(TimeUnit::Microsecond, None),
Timestamp(TimeUnit::Nanosecond, None),
Time64(TimeUnit::Microsecond),
Time64(TimeUnit::Nanosecond),
Date32,
Time32(TimeUnit::Second),
Time32(TimeUnit::Millisecond),
Date64,
Utf8,
LargeUtf8,
Binary,
LargeBinary,
Duration(TimeUnit::Second),
Duration(TimeUnit::Millisecond),
Duration(TimeUnit::Microsecond),
Duration(TimeUnit::Nanosecond),
];

datatypes.into_iter().for_each(|d1| {
let array = new_null_array(d1.clone(), 10);
if can_lower(&d1) {
assert!(lower(array.as_ref()).is_ok());
} else {
assert!(lower(array.as_ref()).is_err());
}
});
}
2 changes: 2 additions & 0 deletions tests/it/compute/mod.rs
@@ -28,6 +28,8 @@ mod length;
mod like;
#[cfg(feature = "compute_limit")]
mod limit;
#[cfg(feature = "compute_lower")]
mod lower;
#[cfg(feature = "compute_merge_sort")]
mod merge_sort;
#[cfg(feature = "compute_partition")]