Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fix: native hashed-hash for MutableDictionaryArray (#1564)
Browse files Browse the repository at this point in the history
  • Loading branch information
aldanor authored Sep 6, 2023
1 parent 87ab844 commit 767834e
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 17 deletions.
82 changes: 65 additions & 17 deletions src/array/dictionary/value_map.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::borrow::Borrow;
use std::fmt::{self, Debug};
use std::hash::{BuildHasher, Hash, Hasher};
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};

use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
Expand All @@ -15,10 +15,54 @@ use crate::{

use super::DictionaryKey;

/// Hasher for pre-hashed values; similar to `hash_hasher` but with native endianness.
///
/// We know that we'll only use it for `u64` values, so we can avoid endian conversion.
///
/// Invariant: hash of a u64 value is always equal to itself.
#[derive(Copy, Clone, Default)]
pub struct PassthroughHasher(u64);

impl Hasher for PassthroughHasher {
#[inline]
fn write_u64(&mut self, value: u64) {
self.0 = value;
}

fn write(&mut self, _: &[u8]) {
unreachable!();
}

#[inline]
fn finish(&self) -> u64 {
self.0
}
}

#[derive(Clone)]
pub struct Hashed<K> {
hash: u64,
key: K,
}

#[inline]
fn ahash_hash<T: Hash + ?Sized>(value: &T) -> u64 {
let mut hasher = BuildHasherDefault::<ahash::AHasher>::default().build_hasher();
value.hash(&mut hasher);
hasher.finish()
}

impl<K> Hash for Hashed<K> {
#[inline]
fn hash<H: Hasher>(&self, state: &mut H) {
self.hash.hash(state)
}
}

#[derive(Clone)]
pub struct ValueMap<K: DictionaryKey, M: MutableArray> {
values: M,
map: HashMap<K, ()>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
pub values: M,
pub map: HashMap<Hashed<K>, (), BuildHasherDefault<PassthroughHasher>>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
}

impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {
Expand All @@ -39,22 +83,28 @@ impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {
M: Indexable,
M::Type: Eq + Hash,
{
let mut map = HashMap::with_capacity(values.len());
let mut map = HashMap::<Hashed<K>, _, _>::with_capacity_and_hasher(
values.len(),
BuildHasherDefault::<PassthroughHasher>::default(),
);
for index in 0..values.len() {
let key = K::try_from(index).map_err(|_| Error::Overflow)?;
// safety: we only iterate within bounds
let value = unsafe { values.value_unchecked_at(index) };
let mut hasher = map.hasher().build_hasher();
value.borrow().hash(&mut hasher);
let hash = hasher.finish();
match map.raw_entry_mut().from_hash(hash, |_| true) {
let hash = ahash_hash(value.borrow());
match map.raw_entry_mut().from_hash(hash, |item| {
// safety: invariant of the struct, it's always in bounds since we maintain it
let stored_value = unsafe { values.value_unchecked_at(item.key.as_usize()) };
stored_value.borrow() == value.borrow()
}) {
RawEntryMut::Occupied(_) => {
return Err(Error::InvalidArgumentError(
"duplicate value in dictionary values array".into(),
))
}
RawEntryMut::Vacant(entry) => {
entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
// NB: don't use .insert() here!
entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ());
}
}
}
Expand Down Expand Up @@ -91,24 +141,22 @@ impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {
V: AsIndexed<M>,
M::Type: Eq + Hash,
{
let mut hasher = self.map.hasher().build_hasher();
value.as_indexed().hash(&mut hasher);
let hash = hasher.finish();

let hash = ahash_hash(value.as_indexed());
Ok(
match self.map.raw_entry_mut().from_hash(hash, |key| {
match self.map.raw_entry_mut().from_hash(hash, |item| {
// safety: we've already checked (the inverse) when we pushed it, so it should be ok?
let index = unsafe { key.as_usize() };
let index = unsafe { item.key.as_usize() };
// safety: invariant of the struct, it's always in bounds since we maintain it
let stored_value = unsafe { self.values.value_unchecked_at(index) };
stored_value.borrow() == value.as_indexed()
}) {
RawEntryMut::Occupied(entry) => *entry.key(),
RawEntryMut::Occupied(entry) => entry.key().key,
RawEntryMut::Vacant(entry) => {
let index = self.values.len();
let key = K::try_from(index).map_err(|_| Error::Overflow)?;
entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ()); // NB: don't use .insert() here!
push(&mut self.values, value)?;
debug_assert_eq!(self.values.len(), index + 1);
key
}
},
Expand Down
17 changes: 17 additions & 0 deletions tests/it/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,20 @@ fn test_push_utf8_ex() {
fn test_push_i64_ex() {
test_push_ex::<MutablePrimitiveArray<i64>, _>(vec![10, 20, 30, 20], |i| 1000 + i as i64);
}

#[test]
fn test_big_dict() {
let n = 10;
let strings = (0..10).map(|i| i.to_string()).collect::<Vec<_>>();
let mut arr = MutableDictionaryArray::<u8, MutableUtf8Array<i32>>::new();
for s in &strings {
arr.try_push(Some(s)).unwrap();
}
assert_eq!(arr.values().len(), n);
for _ in 0..10_000 {
for s in &strings {
arr.try_push(Some(s)).unwrap();
}
}
assert_eq!(arr.values().len(), n);
}

0 comments on commit 767834e

Please sign in to comment.