From f985818012e6d0a56fca49487dbc13c516f4613c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 27 Nov 2022 13:20:47 -0800 Subject: [PATCH] Use SlicesIterator for ArrayData Equality (#3198) * Use SlicesIterator for ArrayData Equality * Use BitSliceIterator --- arrow-data/src/equal/fixed_binary.rs | 72 ++++++++++++++++++-------- arrow-data/src/equal/primitive.rs | 75 ++++++++++++++++++++-------- arrow-select/src/filter.rs | 2 +- arrow/benches/equal.rs | 3 ++ 4 files changed, 110 insertions(+), 42 deletions(-) diff --git a/arrow-data/src/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs index d6af208016fa..17e470b5c47c 100644 --- a/arrow-data/src/equal/fixed_binary.rs +++ b/arrow-data/src/equal/fixed_binary.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::data::{contains_nulls, ArrayData}; +use crate::bit_iterator::BitSliceIterator; +use crate::contains_nulls; +use crate::data::ArrayData; +use crate::equal::primitive::NULL_SLICES_SELECTIVITY_THRESHOLD; use arrow_buffer::bit_util::get_bit; use arrow_schema::DataType; @@ -47,26 +50,55 @@ pub(super) fn fixed_binary_equal( size * len, ) } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; + let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + // with nulls, we need to compare item by item whenever it is not null + (0..len).all(|i| { + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_len( - lhs_values, - rhs_values, - lhs_pos * size, - rhs_pos * size, - size, // 1 * size since we are comparing a single entry - ) - }) + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + + lhs_is_null + || (lhs_is_null == rhs_is_null) + && equal_len( + lhs_values, + rhs_values, + lhs_pos * size, + rhs_pos * size, + size, // 1 * size since we are comparing a single entry + ) + }) + } else { + let lhs_slices_iter = BitSliceIterator::new( + lhs.null_buffer().as_ref().unwrap(), + lhs_start + lhs.offset(), + len, + ); + let rhs_slices_iter = BitSliceIterator::new( + rhs.null_buffer().as_ref().unwrap(), + rhs_start + rhs.offset(), + len, + ); + + lhs_slices_iter.zip(rhs_slices_iter).all( + |((l_start, l_end), (r_start, r_end))| { + l_start == r_start + && l_end == r_end + && equal_len( + lhs_values, + rhs_values, + (lhs_start + l_start) * size, + (rhs_start + r_start) * size, + (l_end - l_start) * size, + ) + }, + ) + } } } diff --git a/arrow-data/src/equal/primitive.rs b/arrow-data/src/equal/primitive.rs index e619375d5314..f52541e2861c 100644 --- a/arrow-data/src/equal/primitive.rs +++ b/arrow-data/src/equal/primitive.rs @@ -15,13 +15,17 @@ // specific language governing permissions and limitations // under the License. +use crate::bit_iterator::BitSliceIterator; +use crate::contains_nulls; +use arrow_buffer::bit_util::get_bit; use std::mem::size_of; -use crate::data::{contains_nulls, ArrayData}; -use arrow_buffer::bit_util::get_bit; +use crate::data::ArrayData; use super::utils::equal_len; +pub(crate) const NULL_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.4; + pub(super) fn primitive_equal( lhs: &ArrayData, rhs: &ArrayData, @@ -45,25 +49,54 @@ pub(super) fn primitive_equal( len * byte_width, ) } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64; + + if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + // with nulls, we need to compare item by item whenever it is not null + (0..len).all(|i| { + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + + lhs_is_null + || (lhs_is_null == rhs_is_null) + && equal_len( + lhs_values, + rhs_values, + lhs_pos * byte_width, + rhs_pos * byte_width, + byte_width, // 1 * byte_width since we are comparing a single entry + ) + }) + } else { + let lhs_slices_iter = BitSliceIterator::new( + lhs.null_buffer().as_ref().unwrap(), + lhs_start + lhs.offset(), + len, + ); + let rhs_slices_iter = BitSliceIterator::new( + rhs.null_buffer().as_ref().unwrap(), + rhs_start + rhs.offset(), + len, + ); - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_len( - lhs_values, - rhs_values, - lhs_pos * byte_width, - rhs_pos * byte_width, - byte_width, // 1 * byte_width since we are comparing a single entry - ) - }) + lhs_slices_iter.zip(rhs_slices_iter).all( + |((l_start, l_end), (r_start, r_end))| { + l_start == r_start + && l_end == r_end + && equal_len( + lhs_values, + rhs_values, + (lhs_start + l_start) * byte_width, + (rhs_start + r_start) * byte_width, + (l_end - l_start) * byte_width, + ) + }, + ) + } } } diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 41d93aefa31b..fde4b41b04cf 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -39,7 +39,7 @@ use arrow_schema::*; const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8; /// An iterator of `(usize, usize)` each representing an interval -/// `[start, end)` whose slots of a [BooleanArray] are true. Each +/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each /// interval corresponds to a contiguous region of memory to be /// "taken" from an array to be filtered. /// diff --git a/arrow/benches/equal.rs b/arrow/benches/equal.rs index f54aff1b5cc7..2f4e2fada9e9 100644 --- a/arrow/benches/equal.rs +++ b/arrow/benches/equal.rs @@ -43,6 +43,9 @@ fn add_benchmark(c: &mut Criterion) { let arr_a_nulls = create_primitive_array::(512, 0.5); c.bench_function("equal_nulls_512", |b| b.iter(|| bench_equal(&arr_a_nulls))); + let arr_a = create_primitive_array::(51200, 0.1); + c.bench_function("equal_51200", |b| b.iter(|| bench_equal(&arr_a))); + let arr_a = create_string_array::(512, 0.0); c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a)));