This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added lower #641

Merged
merged 7 commits into from
Dec 6, 2021
Changes from 2 commits
4 changes: 3 additions & 1 deletion Cargo.toml
@@ -171,6 +171,7 @@ compute_substring = []
compute_take = []
compute_temporal = []
compute_window = ["compute_concatenate"]
compute_lower = []
compute = [
"compute_aggregate",
"compute_arithmetics",
@@ -196,6 +197,7 @@ compute = [
"compute_take",
"compute_temporal",
"compute_window",
"compute_lower",
]
# base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
@@ -298,4 +300,4 @@ harness = false

[[bench]]
name = "bitwise"
harness = false
harness = false
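
The new `compute_lower` flag follows the crate's one-feature-per-kernel convention and is also pulled in by the umbrella `compute` feature. Below is a minimal sketch of how a downstream crate might gate its own code on the kernel; the `lowercase` feature name and the helper function are illustrative assumptions, not part of this PR:

```rust
// Hypothetical downstream helper (names are assumptions, not from the PR),
// assuming the consumer defines a `lowercase` feature that in turn enables
// arrow2's `compute_lower` feature.
#[cfg(feature = "lowercase")]
pub fn lowercase_column(
    array: &dyn arrow2::array::Array,
) -> arrow2::error::Result<Box<dyn arrow2::array::Array>> {
    // Delegate to the kernel added in this PR.
    arrow2::compute::lower::lower(array)
}
```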
79 changes: 79 additions & 0 deletions src/compute/lower.rs
@@ -0,0 +1,79 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines a kernel to lowercase the elements of a \[Large\]StringArray

use crate::array::*;
use crate::{
datatypes::DataType,
error::{ArrowError, Result},
};

fn utf8_lower<O: Offset>(array: &Utf8Array<O>) -> Utf8Array<O> {
let iter = array.values_iter().map(str::to_lowercase);

let new = Utf8Array::<O>::from_trusted_len_values_iter(iter);
new.with_validity(array.validity().cloned())
}

/// Returns a `Box<dyn Array>` with each of the elements in `array` lowercased.
/// This function errors when the passed array is not a \[Large\]String or \[Large\]Binary array.
pub fn lower(array: &dyn Array) -> Result<Box<dyn Array>> {
match array.data_type() {
// For binary and large binary, lower is a no-op.
DataType::Binary | DataType::LargeBinary => unsafe {
// Safety: we will use the whole slice directly, so we don't need to check it.
Ok(array.slice_unchecked(0, array.len()))
},
DataType::LargeUtf8 => Ok(Box::new(utf8_lower(
array
.as_any()
.downcast_ref::<Utf8Array<i64>>()
.expect("A large string is expected"),
))),
DataType::Utf8 => Ok(Box::new(utf8_lower(
array
.as_any()
.downcast_ref::<Utf8Array<i32>>()
.expect("A large string is expected"),
Xuanwo marked this conversation as resolved.
Show resolved Hide resolved
))),
_ => Err(ArrowError::InvalidArgumentError(format!(
"lower does not support type {:?}",
array.data_type()
))),
}
}

/// Checks if an array of type `datatype` can perform lower operation
///
/// # Examples
/// ```
/// use arrow2::compute::lower::can_lower;
/// use arrow2::datatypes::{DataType};
///
/// let data_type = DataType::Utf8;
/// assert_eq!(can_lower(&data_type), true);
///
/// let data_type = DataType::Null;
/// assert_eq!(can_lower(&data_type), false);
/// ```
pub fn can_lower(data_type: &DataType) -> bool {
matches!(
data_type,
DataType::LargeUtf8 | DataType::Utf8 | DataType::LargeBinary | DataType::Binary
)
}
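
As a rough usage sketch (assuming arrow2 is built with the `compute_lower` feature), `can_lower` and `lower` combine as below; the calls mirror the ones exercised by the tests added in this PR:

```rust
use arrow2::array::{Array, Utf8Array};
use arrow2::compute::lower::{can_lower, lower};
use arrow2::datatypes::DataType;
use arrow2::error::Result;

fn main() -> Result<()> {
    // Check support first, then run the kernel; validity (nulls) is preserved.
    assert!(can_lower(&DataType::Utf8));

    let array = Utf8Array::<i32>::from(&vec![Some("Hello"), None, Some("WORLD")]);
    let lowered = lower(&array)?;
    let lowered = lowered
        .as_any()
        .downcast_ref::<Utf8Array<i32>>()
        .expect("lower on Utf8 yields Utf8");

    assert_eq!(
        lowered,
        &Utf8Array::<i32>::from(&vec![Some("hello"), None, Some("world")])
    );
    Ok(())
}
```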
3 changes: 3 additions & 0 deletions src/compute/mod.rs
@@ -58,6 +58,9 @@ pub mod like;
#[cfg(feature = "compute_limit")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_limit")))]
pub mod limit;
#[cfg(feature = "compute_lower")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_lower")))]
pub mod lower;
#[cfg(feature = "compute_merge_sort")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_merge_sort")))]
pub mod merge_sort;
213 changes: 213 additions & 0 deletions tests/it/compute/lower.rs
@@ -0,0 +1,213 @@
use arrow2::{array::*, compute::lower::*, error::Result};

fn with_nulls_utf8<O: Offset>() -> Result<()> {
let cases = vec![
// identity
(
vec![Some("hello"), None, Some("world")],
vec![Some("hello"), None, Some("world")],
),
// part of input
(
vec![Some("Hello"), None, Some("wOrld")],
vec![Some("hello"), None, Some("world")],
),
// all input
(
vec![Some("HELLO"), None, Some("WORLD")],
vec![Some("hello"), None, Some("world")],
),
];

cases
.into_iter()
.try_for_each::<_, Result<()>>(|(array, expected)| {
let array = Utf8Array::<O>::from(&array);
let result = lower(&array)?;
assert_eq!(array.len(), result.len());

let result = result.as_any().downcast_ref::<Utf8Array<O>>().unwrap();
let expected = Utf8Array::<O>::from(&expected);

assert_eq!(&expected, result);
Ok(())
})?;

Ok(())
}

#[test]
fn with_nulls_string() -> Result<()> {
with_nulls_utf8::<i32>()
}

#[test]
fn with_nulls_large_string() -> Result<()> {
with_nulls_utf8::<i64>()
}

fn without_nulls_utf8<O: Offset>() -> Result<()> {
let cases = vec![
// identity
(vec!["hello", "world"], vec!["hello", "world"]),
// part of input
(vec!["Hello", "wOrld"], vec!["hello", "world"]),
// all input
(vec!["HELLO", "WORLD"], vec!["hello", "world"]),
];

cases
.into_iter()
.try_for_each::<_, Result<()>>(|(array, expected)| {
let array = Utf8Array::<O>::from_slice(&array);
let result = lower(&array)?;
assert_eq!(array.len(), result.len());

let result = result.as_any().downcast_ref::<Utf8Array<O>>().unwrap();
let expected = Utf8Array::<O>::from_slice(&expected);
assert_eq!(&expected, result);
Ok(())
})?;

Ok(())
}

#[test]
fn without_nulls_string() -> Result<()> {
without_nulls_utf8::<i32>()
}

#[test]
fn without_nulls_large_string() -> Result<()> {
without_nulls_utf8::<i64>()
}

fn with_null_binarys<O: Offset>() -> Result<()> {
let cases = vec![
// identity
(
vec![Some(b"hello"), None, Some(b"world")],
vec![Some(b"hello"), None, Some(b"world")],
),
// part of input
(
vec![Some(b"Hello"), None, Some(b"wOrld")],
vec![Some(b"Hello"), None, Some(b"wOrld")],
),
// all input
(
vec![Some(b"HELLO"), None, Some(b"WORLD")],
vec![Some(b"HELLO"), None, Some(b"WORLD")],
),
];

cases
.into_iter()
.try_for_each::<_, Result<()>>(|(array, expected)| {
let array = BinaryArray::<O>::from(&array);
let result = lower(&array)?;
assert_eq!(array.len(), result.len());

let result = result.as_any().downcast_ref::<BinaryArray<O>>().unwrap();
let expected = BinaryArray::<O>::from(&expected);

assert_eq!(&expected, result);
Ok(())
})?;

Ok(())
}

#[test]
fn with_nulls_binary() -> Result<()> {
with_null_binarys::<i32>()
}

#[test]
fn with_nulls_large_binary() -> Result<()> {
with_null_binarys::<i64>()
}

fn without_null_binarys<O: Offset>() -> Result<()> {
let cases = vec![
// identity
(vec![b"hello", b"world"], vec![b"hello", b"world"]),
// part of input
(vec![b"Hello", b"wOrld"], vec![b"Hello", b"wOrld"]),
// all input
(vec![b"HELLO", b"WORLD"], vec![b"HELLO", b"WORLD"]),
];

cases
.into_iter()
.try_for_each::<_, Result<()>>(|(array, expected)| {
let array = BinaryArray::<O>::from_slice(&array);
let result = lower(&array)?;
assert_eq!(array.len(), result.len());

let result = result.as_any().downcast_ref::<BinaryArray<O>>().unwrap();
let expected = BinaryArray::<O>::from_slice(&expected);

assert_eq!(&expected, result);
Ok(())
})?;

Ok(())
}

#[test]
fn without_nulls_binary() -> Result<()> {
without_null_binarys::<i32>()
}

#[test]
fn without_nulls_large_binary() -> Result<()> {
without_null_binarys::<i64>()
}

#[test]
fn consistency() {
use arrow2::datatypes::DataType::*;
use arrow2::datatypes::TimeUnit;
let datatypes = vec![
Null,
Boolean,
UInt8,
UInt16,
UInt32,
UInt64,
Int8,
Int16,
Int32,
Int64,
Float32,
Float64,
Timestamp(TimeUnit::Second, None),
Timestamp(TimeUnit::Millisecond, None),
Timestamp(TimeUnit::Microsecond, None),
Timestamp(TimeUnit::Nanosecond, None),
Time64(TimeUnit::Microsecond),
Time64(TimeUnit::Nanosecond),
Date32,
Time32(TimeUnit::Second),
Time32(TimeUnit::Millisecond),
Date64,
Utf8,
LargeUtf8,
Binary,
LargeBinary,
Duration(TimeUnit::Second),
Duration(TimeUnit::Millisecond),
Duration(TimeUnit::Microsecond),
Duration(TimeUnit::Nanosecond),
];

datatypes.into_iter().for_each(|d1| {
let array = new_null_array(d1.clone(), 10);
if can_lower(&d1) {
assert!(lower(array.as_ref()).is_ok());
} else {
assert!(lower(array.as_ref()).is_err());
}
});
}
2 changes: 2 additions & 0 deletions tests/it/compute/mod.rs
@@ -28,6 +28,8 @@ mod length;
mod like;
#[cfg(feature = "compute_limit")]
mod limit;
#[cfg(feature = "compute_lower")]
mod lower;
#[cfg(feature = "compute_merge_sort")]
mod merge_sort;
#[cfg(feature = "compute_partition")]