From 845b1c871a426fa91e94247235777585769ca9a5 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sun, 22 May 2022 14:17:55 +0800 Subject: [PATCH 1/5] replace fold with reduce Signed-off-by: remzi <13716567376yh@gmail.com> --- benches/aggregate.rs | 12 +++++++++ src/compute/aggregate/min_max.rs | 45 ++++++++++++-------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/benches/aggregate.rs b/benches/aggregate.rs index d515fff1281..00675c4c6ce 100644 --- a/benches/aggregate.rs +++ b/benches/aggregate.rs @@ -42,6 +42,18 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function(&format!("min null 2^{} f32", log2_size), |b| { b.iter(|| bench_min(&arr_a)) }); + + let arr_a = create_string_array::(100, size, 0.0, 0); + + c.bench_function(&format!("min 2^{} utf8", log2_size), |b| { + b.iter(|| bench_min(&arr_a)) + }); + + let arr_a = create_string_array::(100, size, 0.1, 0); + + c.bench_function(&format!("min null 2^{} utf8", log2_size), |b| { + b.iter(|| bench_min(&arr_a)) + }); }); } diff --git a/src/compute/aggregate/min_max.rs b/src/compute/aggregate/min_max.rs index 2c6478cff48..f8e91ffacb1 100644 --- a/src/compute/aggregate/min_max.rs +++ b/src/compute/aggregate/min_max.rs @@ -75,39 +75,28 @@ fn min_max_string bool>( array: &Utf8Array, cmp: F, ) -> Option<&str> { - let null_count = array.null_count(); - - if null_count == array.len() || array.len() == 0 { - return None; - } - let value = if array.validity().is_some() { - array.iter().fold(None, |mut acc: Option<&str>, v| { - if let Some(item) = v { - if let Some(acc) = acc.as_mut() { - if cmp(acc, item) { - *acc = item + if array.null_count() == array.len() { + None + } else if array.validity().is_some() { + array + .iter() + .reduce(|v1, v2| match (v1, v2) { + (None, v2) => v2, + (v1, None) => v1, + (Some(v1), Some(v2)) => { + if cmp(v1, v2) { + Some(v2) + } else { + Some(v1) } - } else { - acc = Some(item) } - } - acc - }) + }) + .unwrap_or(None) } else { array .values_iter() - .fold(None, |mut acc: Option<&str>, item| { - if let Some(acc) = acc.as_mut() { - if cmp(acc, item) { - *acc = item - } - } else { - acc = Some(item) - } - acc - }) - }; - value + .reduce(|v1, v2| if cmp(v1, v2) { v2 } else { v1 }) + } } fn nonnull_min_primitive(values: &[T]) -> T From 1965fa6633e2ac4b9c76e5f053181fe4075adb59 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sun, 22 May 2022 14:34:15 +0800 Subject: [PATCH 2/5] set length = 1 Signed-off-by: remzi <13716567376yh@gmail.com> --- benches/aggregate.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benches/aggregate.rs b/benches/aggregate.rs index 00675c4c6ce..f157550d153 100644 --- a/benches/aggregate.rs +++ b/benches/aggregate.rs @@ -43,13 +43,13 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_min(&arr_a)) }); - let arr_a = create_string_array::(100, size, 0.0, 0); + let arr_a = create_string_array::(1, size, 0.0, 0); c.bench_function(&format!("min 2^{} utf8", log2_size), |b| { b.iter(|| bench_min(&arr_a)) }); - let arr_a = create_string_array::(100, size, 0.1, 0); + let arr_a = create_string_array::(1, size, 0.1, 0); c.bench_function(&format!("min null 2^{} utf8", log2_size), |b| { b.iter(|| bench_min(&arr_a)) From 6cb49949cd02bd953043524d932c9ff607461267 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sun, 22 May 2022 16:01:39 +0800 Subject: [PATCH 3/5] add test case for no null Signed-off-by: remzi <13716567376yh@gmail.com> --- tests/it/compute/aggregate/min_max.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/it/compute/aggregate/min_max.rs b/tests/it/compute/aggregate/min_max.rs index 224a245ae79..6a6e244a434 100644 --- a/tests/it/compute/aggregate/min_max.rs +++ b/tests/it/compute/aggregate/min_max.rs @@ -112,12 +112,11 @@ fn min_max_f64_edge_cases() { assert_eq!(Some(f64::INFINITY), max_primitive(&a)); } -// todo: convert me #[test] fn test_string_min_max_with_nulls() { let a = Utf8Array::::from(&[Some("b"), None, None, Some("a"), Some("c")]); - assert_eq!("a", min_string(&a).unwrap()); - assert_eq!("c", max_string(&a).unwrap()); + assert_eq!(Some("a"), min_string(&a)); + assert_eq!(Some("c"), max_string(&a)); } #[test] @@ -127,6 +126,13 @@ fn test_string_min_max_all_nulls() { assert_eq!(None, max_string(&a)); } +#[test] +fn test_string_min_max_no_null() { + let a = Utf8Array::::from(&[Some("abc"), Some("abd"), Some("bac"), Some("bbb")]); + assert_eq!(Some("abc"), min_string(&a)); + assert_eq!(Some("bbb"), max_string(&a)); +} + #[test] fn test_string_min_max_1() { let a = Utf8Array::::from(&[None, None, Some("b"), Some("a")]); From ba6c04a56d0e9df301d6628a5d753d596e0f34ae Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Mon, 23 May 2022 09:31:13 +0800 Subject: [PATCH 4/5] refactor min_max_binary Signed-off-by: remzi <13716567376yh@gmail.com> --- src/compute/aggregate/min_max.rs | 105 ++++++++++--------------------- 1 file changed, 32 insertions(+), 73 deletions(-) diff --git a/src/compute/aggregate/min_max.rs b/src/compute/aggregate/min_max.rs index f8e91ffacb1..54c9e7d5048 100644 --- a/src/compute/aggregate/min_max.rs +++ b/src/compute/aggregate/min_max.rs @@ -30,75 +30,6 @@ pub trait SimdOrd { fn new_max() -> Self; } -/// Helper to compute min/max of [`BinaryArray`] -fn min_max_binary bool>( - array: &BinaryArray, - cmp: F, -) -> Option<&[u8]> { - let null_count = array.null_count(); - - if null_count == array.len() || array.len() == 0 { - return None; - } - let value = if array.validity().is_some() { - array.iter().fold(None, |mut acc: Option<&[u8]>, v| { - if let Some(item) = v { - if let Some(acc) = acc.as_mut() { - if cmp(acc, item) { - *acc = item - } - } else { - acc = Some(item) - } - } - acc - }) - } else { - array - .values_iter() - .fold(None, |mut acc: Option<&[u8]>, item| { - if let Some(acc) = acc.as_mut() { - if cmp(acc, item) { - *acc = item - } - } else { - acc = Some(item) - } - acc - }) - }; - value -} - -/// Helper to compute min/max of [`Utf8Array`] -fn min_max_string bool>( - array: &Utf8Array, - cmp: F, -) -> Option<&str> { - if array.null_count() == array.len() { - None - } else if array.validity().is_some() { - array - .iter() - .reduce(|v1, v2| match (v1, v2) { - (None, v2) => v2, - (v1, None) => v1, - (Some(v1), Some(v2)) => { - if cmp(v1, v2) { - Some(v2) - } else { - Some(v1) - } - } - }) - .unwrap_or(None) - } else { - array - .values_iter() - .reduce(|v1, v2| if cmp(v1, v2) { v2 } else { v1 }) - } -} - fn nonnull_min_primitive(values: &[T]) -> T where T: NativeType + Simd, @@ -267,24 +198,52 @@ where }) } +/// Helper to compute min/max of [`BinaryArray`] and [`Utf8Array`] +macro_rules! min_max_binary_utf8 { + ($array: expr, $cmp: expr) => { + if $array.null_count() == $array.len() { + None + } else if $array.validity().is_some() { + $array + .iter() + .reduce(|v1, v2| match (v1, v2) { + (None, v2) => v2, + (v1, None) => v1, + (Some(v1), Some(v2)) => { + if $cmp(v1, v2) { + Some(v2) + } else { + Some(v1) + } + } + }) + .unwrap_or(None) + } else { + $array + .values_iter() + .reduce(|v1, v2| if $cmp(v1, v2) { v2 } else { v1 }) + } + }; +} + /// Returns the maximum value in the binary array, according to the natural order. pub fn max_binary(array: &BinaryArray) -> Option<&[u8]> { - min_max_binary(array, |a, b| a < b) + min_max_binary_utf8!(array, |a, b| a < b) } /// Returns the minimum value in the binary array, according to the natural order. pub fn min_binary(array: &BinaryArray) -> Option<&[u8]> { - min_max_binary(array, |a, b| a > b) + min_max_binary_utf8!(array, |a, b| a > b) } /// Returns the maximum value in the string array, according to the natural order. pub fn max_string(array: &Utf8Array) -> Option<&str> { - min_max_string(array, |a, b| a < b) + min_max_binary_utf8!(array, |a, b| a < b) } /// Returns the minimum value in the string array, according to the natural order. pub fn min_string(array: &Utf8Array) -> Option<&str> { - min_max_string(array, |a, b| a > b) + min_max_binary_utf8!(array, |a, b| a > b) } /// Returns the minimum value in the boolean array. From c2332f13d98e1c429082a261f4795c226142d497 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Mon, 23 May 2022 09:31:41 +0800 Subject: [PATCH 5/5] add test to cover the no null case for binary Signed-off-by: remzi <13716567376yh@gmail.com> --- tests/it/compute/aggregate/min_max.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/it/compute/aggregate/min_max.rs b/tests/it/compute/aggregate/min_max.rs index 6a6e244a434..3cfc8b675f0 100644 --- a/tests/it/compute/aggregate/min_max.rs +++ b/tests/it/compute/aggregate/min_max.rs @@ -198,8 +198,20 @@ fn test_boolean_min_max_smaller() { #[test] fn test_binary_min_max_with_nulls() { let a = BinaryArray::::from(&[Some(b"b"), None, None, Some(b"a"), Some(b"c")]); - assert_eq!("a".as_bytes(), min_binary(&a).unwrap()); - assert_eq!("c".as_bytes(), max_binary(&a).unwrap()); + assert_eq!(Some("a".as_bytes()), min_binary(&a)); + assert_eq!(Some("c".as_bytes()), max_binary(&a)); +} + +#[test] +fn test_binary_min_max_no_null() { + let a = BinaryArray::::from(&[ + Some("abc".as_bytes()), + Some(b"acd"), + Some(b"aabd"), + Some(b""), + ]); + assert_eq!(Some("".as_bytes()), min_binary(&a)); + assert_eq!(Some("acd".as_bytes()), max_binary(&a)); } #[test]