Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Optimize hashing using ahash and multiversioning (#428)
Browse files Browse the repository at this point in the history
  • Loading branch information
Dandandan authored Sep 20, 2021
1 parent 7dedd02 commit 38361d2
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 42 deletions.
7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ full = [
"io_avro",
"regex",
"merge_sort",
"ahash",
"compute",
# parses timezones used in timestamp conversions
"chrono-tz",
Expand All @@ -121,7 +120,7 @@ io_avro = ["avro-rs", "streaming-iterator", "serde_json"]
io_json_integration = ["io_json", "serde_derive", "hex"]
io_print = ["comfy-table"]
# the compute kernels. Disabling this significantly reduces compile time.
compute = ["strength_reduce", "multiversion", "lexical-core"]
compute = ["strength_reduce", "multiversion", "lexical-core", "ahash"]
# base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
benchmarks = ["rand"]
Expand Down Expand Up @@ -223,3 +222,7 @@ harness = false
[[bench]]
name = "write_csv"
harness = false

[[bench]]
name = "hash_kernel"
harness = false
2 changes: 1 addition & 1 deletion benches/filter_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use arrow2::array::*;
use arrow2::compute::filter::{build_filter, filter, filter_record_batch, Filter};
use arrow2::datatypes::{DataType, Field, Schema};
use arrow2::record_batch::RecordBatch;
use arrow2::util::bench_util::*;
use arrow2::util::

use criterion::{criterion_group, criterion_main, Criterion};

Expand Down
34 changes: 34 additions & 0 deletions benches/hash_kernel.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
extern crate arrow2;

use arrow2::compute::hash::hash;
use arrow2::datatypes::DataType;
use arrow2::util::bench_util::*;

use criterion::{criterion_group, criterion_main, Criterion};

/// Registers the hash-kernel benchmarks with Criterion.
///
/// For each of `i32`, `i64`, UTF-8 string and boolean data, an array of
/// `2^log2_size` elements is generated with the `bench_util` helpers and a
/// call to `hash` on that array is timed. Benchmark ids have the form
/// `"<kind> 2^<log2_size>"`, e.g. `"i32 2^10"`.
fn add_benchmark(c: &mut Criterion) {
    let log2_size = 10;
    let size = 2usize.pow(log2_size);

    // All benchmark ids share the same size suffix; only the kind label varies.
    let bench_id = |kind: &str| format!("{} 2^{}", kind, log2_size);

    // NOTE(review): the trailing `0.0` arguments are presumably the null density
    // (i.e. no nulls) — confirm against `bench_util`.
    let int32_values = create_primitive_array::<i32>(size, DataType::Int32, 0.0);
    c.bench_function(&bench_id("i32"), |b| {
        b.iter(|| hash(&int32_values))
    });

    let int64_values = create_primitive_array::<i64>(size, DataType::Int64, 0.0);
    c.bench_function(&bench_id("i64"), |b| {
        b.iter(|| hash(&int64_values))
    });

    let string_values = create_string_array::<i32>(size, 5, 0.0, 0);
    c.bench_function(&bench_id("str"), |b| {
        b.iter(|| hash(&string_values))
    });

    let boolean_values = create_boolean_array(size, 0.5, 0.0);
    c.bench_function(&bench_id("bool"), |b| {
        b.iter(|| hash(&boolean_values))
    });
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);

63 changes: 24 additions & 39 deletions src/compute/hash.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,10 @@
use std::hash::{Hash, Hasher};
use ahash::{CallHasher, RandomState};
use multiversion::multiversion;
use std::hash::Hash;

#[cfg(feature = "ahash")]
use ahash::AHasher as DefaultHasher;
#[cfg(not(feature = "ahash"))]
use std::collections::hash_map::DefaultHasher;

#[cfg(feature = "ahash")]
macro_rules! new_hasher {
() => {
DefaultHasher::new_with_keys(0, 0)
};
}
#[cfg(not(feature = "ahash"))]
macro_rules! new_hasher {
macro_rules! new_state {
() => {
DefaultHasher::new()
RandomState::with_seeds(0, 0, 0, 0)
};
}

Expand All @@ -29,47 +19,42 @@ use crate::{
use super::arity::unary;

/// Element-wise hash of a [`PrimitiveArray`]. Validity is preserved.
#[multiversion]
#[clone(target = "x86_64+aes+sse3+ssse3+avx+avx2")]
pub fn hash_primitive<T: NativeType + Hash>(array: &PrimitiveArray<T>) -> PrimitiveArray<u64> {
unary(
array,
|x| {
let mut hasher = new_hasher!();
x.hash(&mut hasher);
hasher.finish()
},
DataType::UInt64,
)
let state = new_state!();

unary(array, |x| T::get_hash(&x, &state), DataType::UInt64)
}

/// Element-wise hash of a [`BooleanArray`]. Validity is preserved.
#[multiversion]
#[clone(target = "x86_64+aes+sse3+ssse3+avx+avx2")]
pub fn hash_boolean(array: &BooleanArray) -> PrimitiveArray<u64> {
let iter = array.values_iter().map(|x| {
let mut hasher = new_hasher!();
x.hash(&mut hasher);
hasher.finish()
});
let state = new_state!();

let iter = array.values_iter().map(|x| u8::get_hash(&x, &state));
let values = Buffer::from_trusted_len_iter(iter);
PrimitiveArray::<u64>::from_data(DataType::UInt64, values, array.validity().clone())
}

#[multiversion]
#[clone(target = "x86_64+aes+sse3+ssse3+avx+avx2")]
/// Element-wise hash of a [`Utf8Array`]. Validity is preserved.
pub fn hash_utf8<O: Offset>(array: &Utf8Array<O>) -> PrimitiveArray<u64> {
let iter = array.values_iter().map(|x| {
let mut hasher = new_hasher!();
x.hash(&mut hasher);
hasher.finish()
});
let state = new_state!();

let iter = array
.values_iter()
.map(|x| <[u8]>::get_hash(&x.as_bytes(), &state));
let values = Buffer::from_trusted_len_iter(iter);
PrimitiveArray::<u64>::from_data(DataType::UInt64, values, array.validity().clone())
}

/// Element-wise hash of a [`BinaryArray`]. Validity is preserved.
pub fn hash_binary<O: Offset>(array: &BinaryArray<O>) -> PrimitiveArray<u64> {
let iter = array.values_iter().map(|x| {
let mut hasher = new_hasher!();
x.hash(&mut hasher);
hasher.finish()
});
let state = new_state!();
let iter = array.values_iter().map(|x| <[u8]>::get_hash(&x, &state));
let values = Buffer::from_trusted_len_iter(iter);
PrimitiveArray::<u64>::from_data(DataType::UInt64, values, array.validity().clone())
}
Expand Down

0 comments on commit 38361d2

Please sign in to comment.