From 38361d248c54d3ec966d654329b9de01879ed486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 20 Sep 2021 17:45:59 +0200 Subject: [PATCH] Optimize hashing using ahash and multiversioning (#428) --- Cargo.toml | 7 +++-- benches/filter_kernels.rs | 2 +- benches/hash_kernel.rs | 34 +++++++++++++++++++++ src/compute/hash.rs | 63 +++++++++++++++------------------------ 4 files changed, 64 insertions(+), 42 deletions(-) create mode 100644 benches/hash_kernel.rs diff --git a/Cargo.toml b/Cargo.toml index b5e0175aa8e..74b127aba46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -96,7 +96,6 @@ full = [ "io_avro", "regex", "merge_sort", - "ahash", "compute", # parses timezones used in timestamp conversions "chrono-tz", @@ -121,7 +120,7 @@ io_avro = ["avro-rs", "streaming-iterator", "serde_json"] io_json_integration = ["io_json", "serde_derive", "hex"] io_print = ["comfy-table"] # the compute kernels. Disabling this significantly reduces compile time. -compute = ["strength_reduce", "multiversion", "lexical-core"] +compute = ["strength_reduce", "multiversion", "lexical-core", "ahash"] # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format. io_parquet = ["parquet2", "io_ipc", "base64", "futures"] benchmarks = ["rand"] @@ -223,3 +222,7 @@ harness = false [[bench]] name = "write_csv" harness = false + +[[bench]] +name = "hash_kernel" +harness = false diff --git a/benches/filter_kernels.rs b/benches/filter_kernels.rs index 13acb942300..4ed1205f721 100644 --- a/benches/filter_kernels.rs +++ b/benches/filter_kernels.rs @@ -22,7 +22,7 @@ use arrow2::array::*; use arrow2::compute::filter::{build_filter, filter, filter_record_batch, Filter}; use arrow2::datatypes::{DataType, Field, Schema}; use arrow2::record_batch::RecordBatch; -use arrow2::util::bench_util::*; +use arrow2::util:: use criterion::{criterion_group, criterion_main, Criterion}; diff --git a/benches/hash_kernel.rs b/benches/hash_kernel.rs new file mode 100644 index 00000000000..4a49100deeb --- /dev/null +++ b/benches/hash_kernel.rs @@ -0,0 +1,34 @@ +extern crate arrow2; + +use arrow2::compute::hash::hash; +use arrow2::datatypes::DataType; +use arrow2::util::bench_util::*; + +use criterion::{criterion_group, criterion_main, Criterion}; + +fn add_benchmark(c: &mut Criterion) { + let log2_size = 10; + let size = 2usize.pow(log2_size); + + let arr_a = create_primitive_array::(size, DataType::Int32, 0.0); + + c.bench_function(&format!("i32 2^{}", log2_size), |b| b.iter(|| hash(&arr_a))); + + let arr_a = create_primitive_array::(size, DataType::Int64, 0.0); + + c.bench_function(&format!("i64 2^{}", log2_size), |b| b.iter(|| hash(&arr_a))); + + let arr_a = create_string_array::(size, 5, 0.0, 0); + + c.bench_function(&format!("str 2^{}", log2_size), |b| b.iter(|| hash(&arr_a))); + + let arr_a = create_boolean_array(size, 0.5, 0.0); + + c.bench_function(&format!("bool 2^{}", log2_size), |b| { + b.iter(|| hash(&arr_a)) + }); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); + \ No newline at end of file diff --git a/src/compute/hash.rs b/src/compute/hash.rs index 50133c7870f..c7f763d79e2 100644 --- a/src/compute/hash.rs +++ b/src/compute/hash.rs @@ -1,20 +1,10 @@ -use std::hash::{Hash, Hasher}; +use ahash::{CallHasher, RandomState}; +use multiversion::multiversion; +use std::hash::Hash; -#[cfg(feature = "ahash")] -use ahash::AHasher as DefaultHasher; -#[cfg(not(feature = "ahash"))] -use std::collections::hash_map::DefaultHasher; - -#[cfg(feature = "ahash")] -macro_rules! new_hasher { - () => { - DefaultHasher::new_with_keys(0, 0) - }; -} -#[cfg(not(feature = "ahash"))] -macro_rules! new_hasher { +macro_rules! new_state { () => { - DefaultHasher::new() + RandomState::with_seeds(0, 0, 0, 0) }; } @@ -29,47 +19,42 @@ use crate::{ use super::arity::unary; /// Element-wise hash of a [`PrimitiveArray`]. Validity is preserved. +#[multiversion] +#[clone(target = "x86_64+aes+sse3+ssse3+avx+avx2")] pub fn hash_primitive(array: &PrimitiveArray) -> PrimitiveArray { - unary( - array, - |x| { - let mut hasher = new_hasher!(); - x.hash(&mut hasher); - hasher.finish() - }, - DataType::UInt64, - ) + let state = new_state!(); + + unary(array, |x| T::get_hash(&x, &state), DataType::UInt64) } /// Element-wise hash of a [`BooleanArray`]. Validity is preserved. +#[multiversion] +#[clone(target = "x86_64+aes+sse3+ssse3+avx+avx2")] pub fn hash_boolean(array: &BooleanArray) -> PrimitiveArray { - let iter = array.values_iter().map(|x| { - let mut hasher = new_hasher!(); - x.hash(&mut hasher); - hasher.finish() - }); + let state = new_state!(); + + let iter = array.values_iter().map(|x| u8::get_hash(&x, &state)); let values = Buffer::from_trusted_len_iter(iter); PrimitiveArray::::from_data(DataType::UInt64, values, array.validity().clone()) } +#[multiversion] +#[clone(target = "x86_64+aes+sse3+ssse3+avx+avx2")] /// Element-wise hash of a [`Utf8Array`]. Validity is preserved. pub fn hash_utf8(array: &Utf8Array) -> PrimitiveArray { - let iter = array.values_iter().map(|x| { - let mut hasher = new_hasher!(); - x.hash(&mut hasher); - hasher.finish() - }); + let state = new_state!(); + + let iter = array + .values_iter() + .map(|x| <[u8]>::get_hash(&x.as_bytes(), &state)); let values = Buffer::from_trusted_len_iter(iter); PrimitiveArray::::from_data(DataType::UInt64, values, array.validity().clone()) } /// Element-wise hash of a [`BinaryArray`]. Validity is preserved. pub fn hash_binary(array: &BinaryArray) -> PrimitiveArray { - let iter = array.values_iter().map(|x| { - let mut hasher = new_hasher!(); - x.hash(&mut hasher); - hasher.finish() - }); + let state = new_state!(); + let iter = array.values_iter().map(|x| <[u8]>::get_hash(&x, &state)); let values = Buffer::from_trusted_len_iter(iter); PrimitiveArray::::from_data(DataType::UInt64, values, array.validity().clone()) }