Skip to content

Commit

Permalink
Improve performance if dictionary kernels, add benchmark and add `tak…
Browse files Browse the repository at this point in the history
…e_iter_unchecked` (#1372)

* Add benchmark and take_iter_unchecked.

* Add Safety section for clippy

* Update arrow/src/compute/kernels/comparison.rs

Co-authored-by: Andrew Lamb <[email protected]>

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
viirya and alamb authored Mar 3, 2022
1 parent ce27bed commit f9e4cf5
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 14 deletions.
21 changes: 20 additions & 1 deletion arrow/benches/comparison_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ extern crate arrow;
use arrow::compute::*;
use arrow::datatypes::{ArrowNumericType, IntervalMonthDayNanoType};
use arrow::util::bench_util::*;
use arrow::{array::*, datatypes::Float32Type};
use arrow::{array::*, datatypes::Float32Type, datatypes::Int32Type};

fn bench_eq<T>(arr_a: &PrimitiveArray<T>, arr_b: &PrimitiveArray<T>)
where
Expand Down Expand Up @@ -133,6 +133,18 @@ fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
.unwrap();
}

fn bench_dict_eq<T>(arr_a: &DictionaryArray<T>, arr_b: &DictionaryArray<T>)
where
T: ArrowNumericType,
{
cmp_dict_utf8::<T, i32, _>(
criterion::black_box(arr_a),
criterion::black_box(arr_b),
|a, b| a == b,
)
.unwrap();
}

fn add_benchmark(c: &mut Criterion) {
let size = 65536;
let arr_a = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 42);
Expand Down Expand Up @@ -249,6 +261,13 @@ fn add_benchmark(c: &mut Criterion) {
c.bench_function("egexp_matches_utf8 scalar ends with", |b| {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
});

let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0);
let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0);

c.bench_function("dict eq string", |b| {
b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
});
}

criterion_group!(benches, add_benchmark);
Expand Down
11 changes: 11 additions & 0 deletions arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,17 @@ impl<OffsetSize: BinaryOffsetSizeTrait> GenericBinaryArray<OffsetSize> {
) -> impl Iterator<Item = Option<&[u8]>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
}

/// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
/// # Safety
///
/// caller must ensure that the offsets in the iterator are less than the array len()
pub unsafe fn take_iter_unchecked<'a>(
&'a self,
indexes: impl Iterator<Item = Option<usize>> + 'a,
) -> impl Iterator<Item = Option<&[u8]>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
}
}

impl<'a, T: BinaryOffsetSizeTrait> GenericBinaryArray<T> {
Expand Down
11 changes: 11 additions & 0 deletions arrow/src/array/array_boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,17 @@ impl BooleanArray {
) -> impl Iterator<Item = Option<bool>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
}

/// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
/// # Safety
///
/// caller must ensure that the offsets in the iterator are less than the array len()
pub unsafe fn take_iter_unchecked<'a>(
&'a self,
indexes: impl Iterator<Item = Option<usize>> + 'a,
) -> impl Iterator<Item = Option<bool>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
}
}

impl Array for BooleanArray {
Expand Down
11 changes: 11 additions & 0 deletions arrow/src/array/array_primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,17 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
) -> impl Iterator<Item = Option<T::Native>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
}

/// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
/// # Safety
///
/// caller must ensure that the offsets in the iterator are less than the array len()
pub unsafe fn take_iter_unchecked<'a>(
&'a self,
indexes: impl Iterator<Item = Option<usize>> + 'a,
) -> impl Iterator<Item = Option<T::Native>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
}
}

impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
Expand Down
11 changes: 11 additions & 0 deletions arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,17 @@ impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> {
) -> impl Iterator<Item = Option<&str>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
}

/// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
/// # Safety
///
/// caller must ensure that the offsets in the iterator are less than the array len()
pub unsafe fn take_iter_unchecked<'a>(
&'a self,
indexes: impl Iterator<Item = Option<usize>> + 'a,
) -> impl Iterator<Item = Option<&str>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
}
}

impl<'a, Ptr, OffsetSize: StringOffsetSizeTrait> FromIterator<&'a Option<Ptr>>
Expand Down
33 changes: 20 additions & 13 deletions arrow/src/compute/kernels/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2214,19 +2214,26 @@ macro_rules! compare_dict_op {
));
}

let left_iter = $left
.values()
.as_any()
.downcast_ref::<$value_ty>()
.unwrap()
.take_iter($left.keys_iter());

let right_iter = $right
.values()
.as_any()
.downcast_ref::<$value_ty>()
.unwrap()
.take_iter($right.keys_iter());
// Safety justification: Since the inputs are valid Arrow arrays, all values are
// valid indexes into the dictionary (which is verified during construction)

let left_iter = unsafe {
$left
.values()
.as_any()
.downcast_ref::<$value_ty>()
.unwrap()
.take_iter_unchecked($left.keys_iter())
};

let right_iter = unsafe {
$right
.values()
.as_any()
.downcast_ref::<$value_ty>()
.unwrap()
.take_iter_unchecked($right.keys_iter())
};

let result = left_iter
.zip(right_iter)
Expand Down

0 comments on commit f9e4cf5

Please sign in to comment.