From 8334a6f12aa562a1a9aed2114a7b9653493d4545 Mon Sep 17 00:00:00 2001 From: zhyass <34016424+zhyass@users.noreply.github.com> Date: Tue, 31 Aug 2021 11:38:59 +0800 Subject: [PATCH] Add support for binary substring --- src/compute/substring.rs | 218 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 211 insertions(+), 7 deletions(-) diff --git a/src/compute/substring.rs b/src/compute/substring.rs index 2e4fd209b6c..7283ffd2997 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -63,11 +63,71 @@ fn utf8_substring(array: &Utf8Array, start: O, length: &Option) Utf8Array::::from_data(new_offsets.into(), new_values.into(), validity.clone()) } +fn binary_substring( + array: &BinaryArray, + start: O, + length: &Option, +) -> BinaryArray { + let validity = array.validity(); + let offsets = array.offsets(); + let values = array.values(); + + let mut new_offsets = MutableBuffer::::with_capacity(array.len() + 1); + let mut new_values = MutableBuffer::::new(); // we have no way to estimate how much this will be. + + let mut length_so_far = O::zero(); + new_offsets.push(length_so_far); + + offsets.windows(2).for_each(|windows| { + let length_i: O = windows[1] - windows[0]; + + // compute where we should start slicing this entry + let start = windows[0] + + if start >= O::zero() { + start + } else { + length_i + start + }; + let start = start.max(windows[0]).min(windows[1]); + + let length: O = length + .unwrap_or(length_i) + // .max(0) is not needed as it is guaranteed + .min(windows[1] - start); // so we do not go beyond this entry + length_so_far += length; + new_offsets.push(length_so_far); + + // we need usize for ranges + let start = start.to_usize(); + let length = length.to_usize(); + + new_values.extend_from_slice(&values[start..start + length]); + }); + + BinaryArray::::from_data(new_offsets.into(), new_values.into(), validity.clone()) +} + /// Returns an ArrayRef with a substring starting from `start` and with optional length `length` of each of the elements in `array`. /// `start` can be negative, in which case the start counts from the end of the string. /// this function errors when the passed array is not a \[Large\]String array. pub fn substring(array: &dyn Array, start: i64, length: &Option) -> Result> { match array.data_type() { + DataType::Binary => Ok(Box::new(binary_substring( + array + .as_any() + .downcast_ref::>() + .expect("A binary is expected"), + start as i32, + &length.map(|e| e as i32), + ))), + DataType::LargeBinary => Ok(Box::new(binary_substring( + array + .as_any() + .downcast_ref::>() + .expect("A large binary is expected"), + start, + &length.map(|e| e as i64), + ))), DataType::LargeUtf8 => Ok(Box::new(utf8_substring( array .as_any() @@ -105,14 +165,14 @@ pub fn substring(array: &dyn Array, start: i64, length: &Option) -> Result< /// assert_eq!(can_substring(&data_type), false); /// ``` pub fn can_substring(data_type: &DataType) -> bool { - matches!(data_type, DataType::LargeUtf8 | DataType::Utf8) + matches!(data_type, DataType::LargeUtf8 | DataType::Utf8 | DataType::LargeBinary | DataType::Binary) } #[cfg(test)] mod tests { use super::*; - fn with_nulls() -> Result<()> { + fn with_nulls_utf8() -> Result<()> { let cases = vec![ // identity ( @@ -169,15 +229,15 @@ mod tests { #[test] fn with_nulls_string() -> Result<()> { - with_nulls::() + with_nulls_utf8::() } #[test] fn with_nulls_large_string() -> Result<()> { - with_nulls::() + with_nulls_utf8::() } - fn without_nulls() -> Result<()> { + fn without_nulls_utf8() -> Result<()> { let cases = vec![ // increase start ( @@ -248,12 +308,156 @@ mod tests { #[test] fn without_nulls_string() -> Result<()> { - without_nulls::() + without_nulls_utf8::() } #[test] fn without_nulls_large_string() -> Result<()> { - without_nulls::() + without_nulls_utf8::() + } + + fn with_null_binarys() -> Result<()> { + let cases = vec![ + // identity + ( + vec![Some(b"hello"), None, Some(b"world")], + 0, + None, + vec![Some("hello"), None, Some("world")], + ), + // 0 length -> Nothing + ( + vec![Some(b"hello"), None, Some(b"world")], + 0, + Some(0), + vec![Some(""), None, Some("")], + ), + // high start -> Nothing + ( + vec![Some(b"hello"), None, Some(b"world")], + 1000, + Some(0), + vec![Some(""), None, Some("")], + ), + // high negative start -> identity + ( + vec![Some(b"hello"), None, Some(b"world")], + -1000, + None, + vec![Some("hello"), None, Some("world")], + ), + // high length -> identity + ( + vec![Some(b"hello"), None, Some(b"world")], + 0, + Some(1000), + vec![Some("hello"), None, Some("world")], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, start, length, expected)| { + let array = BinaryArray::::from(&array); + let result = substring(&array, start, &length)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = BinaryArray::::from(&expected); + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) + } + + #[test] + fn with_nulls_binary() -> Result<()> { + with_null_binarys::() + } + + #[test] + fn with_nulls_large_binary() -> Result<()> { + with_null_binarys::() + } + + fn without_null_binarys() -> Result<()> { + let cases = vec![ + // increase start + ( + vec!["hello", "", "word"], + 0, + None, + vec!["hello", "", "word"], + ), + (vec!["hello", "", "word"], 1, None, vec!["ello", "", "ord"]), + (vec!["hello", "", "word"], 2, None, vec!["llo", "", "rd"]), + (vec!["hello", "", "word"], 3, None, vec!["lo", "", "d"]), + (vec!["hello", "", "word"], 10, None, vec!["", "", ""]), + // increase start negatively + (vec!["hello", "", "word"], -1, None, vec!["o", "", "d"]), + (vec!["hello", "", "word"], -2, None, vec!["lo", "", "rd"]), + (vec!["hello", "", "word"], -3, None, vec!["llo", "", "ord"]), + ( + vec!["hello", "", "word"], + -10, + None, + vec!["hello", "", "word"], + ), + // increase length + (vec!["hello", "", "word"], 1, Some(1), vec!["e", "", "o"]), + (vec!["hello", "", "word"], 1, Some(2), vec!["el", "", "or"]), + ( + vec!["hello", "", "word"], + 1, + Some(3), + vec!["ell", "", "ord"], + ), + ( + vec!["hello", "", "word"], + 1, + Some(4), + vec!["ello", "", "ord"], + ), + (vec!["hello", "", "word"], -3, Some(1), vec!["l", "", "o"]), + (vec!["hello", "", "word"], -3, Some(2), vec!["ll", "", "or"]), + ( + vec!["hello", "", "word"], + -3, + Some(3), + vec!["llo", "", "ord"], + ), + ( + vec!["hello", "", "word"], + -3, + Some(4), + vec!["llo", "", "ord"], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, start, length, expected)| { + let array = BinaryArray::::from_slice(&array); + let result = substring(&array, start, &length)?; + assert_eq!(array.len(), result.len()); + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = BinaryArray::::from_slice(&expected); + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) + } + + #[test] + fn without_nulls_binary() -> Result<()> { + without_null_binarys::() + } + + #[test] + fn without_nulls_large_binary() -> Result<()> { + without_null_binarys::() } #[test]