Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Improved performance of utf8 comparison (1.7x-4x) #322

Merged
merged 2 commits into from
Aug 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions benches/comparison_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@ fn add_benchmark(c: &mut Criterion) {
c.bench_function(&format!("bool scalar 2^{}", log2_size), |b| {
b.iter(|| bench_op_scalar(&arr_a, &BooleanScalar::from(Some(true)), Operator::Eq))
});

let arr_a = create_string_array::<i32>(size, 0.1, 42);
let arr_b = create_string_array::<i32>(size, 0.1, 43);
c.bench_function(&format!("utf8 2^{}", log2_size), |b| {
b.iter(|| bench_op(&arr_a, &arr_b, Operator::Eq))
});

c.bench_function(&format!("utf8 2^{}", log2_size), |b| {
b.iter(|| bench_op_scalar(&arr_a, &Utf8Scalar::<i32>::from(Some("abc")), Operator::Eq))
});
})
}

Expand Down
2 changes: 1 addition & 1 deletion benches/filter_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_built_filter(&sparse_filter, &data_array))
});

let data_array = create_string_array::<i32>(size, 0.5);
let data_array = create_string_array::<i32>(size, 0.5, 42);
c.bench_function("filter context string", |b| {
b.iter(|| bench_built_filter(&filter, &data_array))
});
Expand Down
2 changes: 1 addition & 1 deletion benches/sort_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_lexsort(&arr_a, &arr_b))
});

let arr_a = create_string_array::<i32>(size, 0.1);
let arr_a = create_string_array::<i32>(size, 0.1, 42);
c.bench_function(&format!("sort utf8 null 2^{}", log2_size), |b| {
b.iter(|| bench_sort(&arr_a))
});
Expand Down
12 changes: 6 additions & 6 deletions benches/take_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,36 +91,36 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(512, 0.0);
let values = create_string_array::<i32>(512, 0.0, 42);
let indices = create_random_index(512, 0.0);
c.bench_function("take str 512", |b| b.iter(|| bench_take(&values, &indices)));

let values = create_string_array::<i32>(1024, 0.0);
let values = create_string_array::<i32>(1024, 0.0, 42);
let indices = create_random_index(1024, 0.0);
c.bench_function("take str 1024", |b| {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(512, 0.0);
let values = create_string_array::<i32>(512, 0.0, 42);
let indices = create_random_index(512, 0.5);
c.bench_function("take str null indices 512", |b| {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(1024, 0.0);
let values = create_string_array::<i32>(1024, 0.0, 42);
let indices = create_random_index(1024, 0.5);
c.bench_function("take str null indices 1024", |b| {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(1024, 0.5);
let values = create_string_array::<i32>(1024, 0.5, 42);

let indices = create_random_index(1024, 0.0);
c.bench_function("take str null values 1024", |b| {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(1024, 0.5);
let values = create_string_array::<i32>(1024, 0.5, 42);
let indices = create_random_index(1024, 0.5);
c.bench_function("take str null values null indices 1024", |b| {
b.iter(|| bench_take(&values, &indices))
Expand Down
2 changes: 1 addition & 1 deletion benches/write_ipc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ fn add_benchmark(c: &mut Criterion) {
});

(0..=10).step_by(2).for_each(|i| {
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1);
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1, 42);
let a = format!("write utf8 2^{}", 10 + i);
c.bench_function(&a, |b| b.iter(|| write(array).unwrap()));
});
Expand Down
4 changes: 2 additions & 2 deletions benches/write_parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ fn add_benchmark(c: &mut Criterion) {
});

(0..=10).step_by(2).for_each(|i| {
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1);
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1, 42);
let a = format!("write utf8 2^{}", 10 + i);
c.bench_function(&a, |b| b.iter(|| write(array, Encoding::Plain).unwrap()));
});

(0..=10).step_by(2).for_each(|i| {
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1);
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1, 42);
let a = format!("write utf8 delta 2^{}", 10 + i);
c.bench_function(&a, |b| {
b.iter(|| write(array, Encoding::DeltaLengthByteArray).unwrap())
Expand Down
14 changes: 4 additions & 10 deletions src/compute/comparison/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,9 @@ where
let validity = combine_validities(lhs.validity(), rhs.validity());

let values = lhs
.iter()
.zip(rhs.iter())
.map(|(lhs, rhs)| match (lhs, rhs) {
(Some(lhs), Some(rhs)) => op(lhs, rhs),
_ => false,
});
.values_iter()
.zip(rhs.values_iter())
.map(|(lhs, rhs)| op(lhs, rhs));
let values = Bitmap::from_trusted_len_iter(values);

Ok(BooleanArray::from_data(values, validity))
Expand All @@ -57,10 +54,7 @@ where
{
let validity = lhs.validity().clone();

let values = lhs.iter().map(|lhs| match lhs {
None => false,
Some(lhs) => op(lhs, rhs),
});
let values = lhs.values_iter().map(|lhs| op(lhs, rhs));
let values = Bitmap::from_trusted_len_iter(values);

BooleanArray::from_data(values, validity)
Expand Down
6 changes: 3 additions & 3 deletions src/util/bench_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,15 @@ where
}

/// Creates a random (but fixed-seeded) array of a given size and null density
pub fn create_string_array<O: Offset>(size: usize, null_density: f32) -> Utf8Array<O> {
let rng = &mut seedable_rng();
pub fn create_string_array<O: Offset>(size: usize, null_density: f32, seed: u64) -> Utf8Array<O> {
let mut rng = StdRng::seed_from_u64(seed);

(0..size)
.map(|_| {
if rng.gen::<f32>() < null_density {
None
} else {
let value = rng
let value = (&mut rng)
.sample_iter(&Alphanumeric)
.take(4)
.map(char::from)
Expand Down