Skip to content

Commit

Permalink
Use SlicesIterator for ArrayData Equality (#3198)
Browse files Browse the repository at this point in the history
* Use SlicesIterator for ArrayData Equality

* Use BitSliceIterator
  • Loading branch information
viirya authored Nov 27, 2022
1 parent 1daf7d3 commit f985818
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 42 deletions.
72 changes: 52 additions & 20 deletions arrow-data/src/equal/fixed_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
// specific language governing permissions and limitations
// under the License.

use crate::data::{contains_nulls, ArrayData};
use crate::bit_iterator::BitSliceIterator;
use crate::contains_nulls;
use crate::data::ArrayData;
use crate::equal::primitive::NULL_SLICES_SELECTIVITY_THRESHOLD;
use arrow_buffer::bit_util::get_bit;
use arrow_schema::DataType;

Expand Down Expand Up @@ -47,26 +50,55 @@ pub(super) fn fixed_binary_equal(
size * len,
)
} else {
// get a ref of the null buffer bytes, to use in testing for nullness
let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
// with nulls, we need to compare item by item whenever it is not null
(0..len).all(|i| {
let lhs_pos = lhs_start + i;
let rhs_pos = rhs_start + i;
let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64;

let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD {
// get a ref of the null buffer bytes, to use in testing for nullness
let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
// with nulls, we need to compare item by item whenever it is not null
(0..len).all(|i| {
let lhs_pos = lhs_start + i;
let rhs_pos = rhs_start + i;

lhs_is_null
|| (lhs_is_null == rhs_is_null)
&& equal_len(
lhs_values,
rhs_values,
lhs_pos * size,
rhs_pos * size,
size, // 1 * size since we are comparing a single entry
)
})
let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());

lhs_is_null
|| (lhs_is_null == rhs_is_null)
&& equal_len(
lhs_values,
rhs_values,
lhs_pos * size,
rhs_pos * size,
size, // 1 * size since we are comparing a single entry
)
})
} else {
let lhs_slices_iter = BitSliceIterator::new(
lhs.null_buffer().as_ref().unwrap(),
lhs_start + lhs.offset(),
len,
);
let rhs_slices_iter = BitSliceIterator::new(
rhs.null_buffer().as_ref().unwrap(),
rhs_start + rhs.offset(),
len,
);

lhs_slices_iter.zip(rhs_slices_iter).all(
|((l_start, l_end), (r_start, r_end))| {
l_start == r_start
&& l_end == r_end
&& equal_len(
lhs_values,
rhs_values,
(lhs_start + l_start) * size,
(rhs_start + r_start) * size,
(l_end - l_start) * size,
)
},
)
}
}
}
75 changes: 54 additions & 21 deletions arrow-data/src/equal/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,17 @@
// specific language governing permissions and limitations
// under the License.

use crate::bit_iterator::BitSliceIterator;
use crate::contains_nulls;
use arrow_buffer::bit_util::get_bit;
use std::mem::size_of;

use crate::data::{contains_nulls, ArrayData};
use arrow_buffer::bit_util::get_bit;
use crate::data::ArrayData;

use super::utils::equal_len;

pub(crate) const NULL_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.4;

pub(super) fn primitive_equal<T>(
lhs: &ArrayData,
rhs: &ArrayData,
Expand All @@ -45,25 +49,54 @@ pub(super) fn primitive_equal<T>(
len * byte_width,
)
} else {
// get a ref of the null buffer bytes, to use in testing for nullness
let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
// with nulls, we need to compare item by item whenever it is not null
(0..len).all(|i| {
let lhs_pos = lhs_start + i;
let rhs_pos = rhs_start + i;
let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64;

if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD {
// get a ref of the null buffer bytes, to use in testing for nullness
let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
// with nulls, we need to compare item by item whenever it is not null
(0..len).all(|i| {
let lhs_pos = lhs_start + i;
let rhs_pos = rhs_start + i;
let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());

lhs_is_null
|| (lhs_is_null == rhs_is_null)
&& equal_len(
lhs_values,
rhs_values,
lhs_pos * byte_width,
rhs_pos * byte_width,
byte_width, // 1 * byte_width since we are comparing a single entry
)
})
} else {
let lhs_slices_iter = BitSliceIterator::new(
lhs.null_buffer().as_ref().unwrap(),
lhs_start + lhs.offset(),
len,
);
let rhs_slices_iter = BitSliceIterator::new(
rhs.null_buffer().as_ref().unwrap(),
rhs_start + rhs.offset(),
len,
);

lhs_is_null
|| (lhs_is_null == rhs_is_null)
&& equal_len(
lhs_values,
rhs_values,
lhs_pos * byte_width,
rhs_pos * byte_width,
byte_width, // 1 * byte_width since we are comparing a single entry
)
})
lhs_slices_iter.zip(rhs_slices_iter).all(
|((l_start, l_end), (r_start, r_end))| {
l_start == r_start
&& l_end == r_end
&& equal_len(
lhs_values,
rhs_values,
(lhs_start + l_start) * byte_width,
(rhs_start + r_start) * byte_width,
(l_end - l_start) * byte_width,
)
},
)
}
}
}
2 changes: 1 addition & 1 deletion arrow-select/src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ use arrow_schema::*;
const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8;

/// An iterator of `(usize, usize)` each representing an interval
/// `[start, end)` whose slots of a [BooleanArray] are true. Each
/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each
/// interval corresponds to a contiguous region of memory to be
/// "taken" from an array to be filtered.
///
Expand Down
3 changes: 3 additions & 0 deletions arrow/benches/equal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ fn add_benchmark(c: &mut Criterion) {
let arr_a_nulls = create_primitive_array::<Float32Type>(512, 0.5);
c.bench_function("equal_nulls_512", |b| b.iter(|| bench_equal(&arr_a_nulls)));

let arr_a = create_primitive_array::<Float32Type>(51200, 0.1);
c.bench_function("equal_51200", |b| b.iter(|| bench_equal(&arr_a)));

let arr_a = create_string_array::<i32>(512, 0.0);
c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a)));

Expand Down

0 comments on commit f985818

Please sign in to comment.