diff --git a/Cargo.toml b/Cargo.toml index 0f3f9ec27bf..1bb20a69556 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3" simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.14", default-features = false, optional = true } +hashbrown = { version = "0.14", default-features = false, features = ["ahash"] } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"] compute_nullif = ["compute_comparison"] compute_partition = ["compute_sort"] compute_regex_match = ["regex"] -compute_sort = ["compute_take", "hashbrown"] +compute_sort = ["compute_take"] compute_substring = [] compute_take = [] compute_temporal = [] diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index f7d4a0f43d7..3a23e670a1d 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -1,3 +1,4 @@ +use std::hash::Hash; use std::hint::unreachable_unchecked; use crate::{ @@ -20,6 +21,7 @@ mod iterator; mod mutable; use crate::array::specification::check_indexes_unchecked; mod typed_iterator; +mod value_map; use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped}; pub use iterator::*; @@ -33,7 +35,7 @@ use super::{new_null_array, specification::check_indexes}; /// /// Any implementation of this trait must ensure that `always_fits_usize` only /// returns `true` if all values succeeds on `value::try_into::().unwrap()`. -pub unsafe trait DictionaryKey: NativeType + TryInto + TryFrom { +pub unsafe trait DictionaryKey: NativeType + TryInto + TryFrom + Hash { /// The corresponding [`IntegerType`] of this key const KEY_TYPE: IntegerType; diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 444de34bcc4..b48a57a9458 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -1,15 +1,15 @@ -use std::hash::{Hash, Hasher}; -use std::{collections::hash_map::DefaultHasher, sync::Arc}; - -use hash_hasher::HashedMap; +use std::hash::Hash; +use std::sync::Arc; +use crate::array::indexable::{AsIndexed, Indexable}; use crate::{ array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, datatypes::DataType, - error::{Error, Result}, + error::Result, }; +use super::value_map::ValueMap; use super::{DictionaryArray, DictionaryKey}; /// A mutable, strong-typed version of [`DictionaryArray`]. @@ -30,55 +30,29 @@ use super::{DictionaryArray, DictionaryKey}; #[derive(Debug)] pub struct MutableDictionaryArray { data_type: DataType, + map: ValueMap, + // invariant: `max(keys) < map.values().len()` keys: MutablePrimitiveArray, - map: HashedMap, - // invariant: `keys.len() <= values.len()` - values: M, } impl From> for DictionaryArray { - fn from(mut other: MutableDictionaryArray) -> Self { + fn from(other: MutableDictionaryArray) -> Self { // Safety - the invariant of this struct ensures that this is up-held unsafe { DictionaryArray::::try_new_unchecked( other.data_type, other.keys.into(), - other.values.as_box(), + other.map.into_values().as_box(), ) .unwrap() } } } -impl From for MutableDictionaryArray { - fn from(values: M) -> Self { - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } - } -} - impl MutableDictionaryArray { /// Creates an empty [`MutableDictionaryArray`]. pub fn new() -> Self { - let values = M::default(); - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } + Self::try_empty(M::default()).unwrap() } } @@ -89,38 +63,61 @@ impl Default for MutableDictionaryA } impl MutableDictionaryArray { - /// Returns whether the value should be pushed to the values or not - fn try_push_valid(&mut self, value: &T) -> Result { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - match self.map.get(&hash) { - Some(key) => { - self.keys.push(Some(*key)); - Ok(false) - } - None => { - let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?; - self.map.insert(hash, key); - self.keys.push(Some(key)); - Ok(true) - } + /// Creates an empty [`MutableDictionaryArray`] from a given empty values array. + /// # Errors + /// Errors if the array is non-empty. + pub fn try_empty(values: M) -> Result { + Ok(Self::from_value_map(ValueMap::::try_empty(values)?)) + } + + /// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values. + /// Indices associated with those values are automatically assigned based on the order of + /// the values. + /// # Errors + /// Errors if there's more values than the maximum value of `K` or if values are not unique. + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + Ok(Self::from_value_map(ValueMap::::from_values(values)?)) + } + + fn from_value_map(value_map: ValueMap) -> Self { + let keys = MutablePrimitiveArray::::new(); + let data_type = + DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false); + Self { + data_type, + map: value_map, + keys, } } + /// Creates an empty [`MutableDictionaryArray`] retaining the same dictionary as the current + /// mutable dictionary array, but with no data. This may come useful when serializing the + /// array into multiple chunks, where there's a requirement that the dictionary is the same. + /// No copying is performed, the value map is moved over to the new array. + pub fn into_empty(self) -> Self { + Self::from_value_map(self.map) + } + + /// Same as `into_empty` but clones the inner value map instead of taking full ownership. + pub fn to_empty(&self) -> Self + where + M: Clone, + { + Self::from_value_map(self.map.clone()) + } + /// pushes a null value pub fn push_null(&mut self) { self.keys.push(None) } - /// returns a mutable reference to the inner values. - fn mut_values(&mut self) -> &mut M { - &mut self.values - } - /// returns a reference to the inner values. pub fn values(&self) -> &M { - &self.values + self.map.values() } /// converts itself into [`Arc`] @@ -142,15 +139,10 @@ impl MutableDictionaryArray { /// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { - self.values.shrink_to_fit(); + self.map.shrink_to_fit(); self.keys.shrink_to_fit(); } - /// Returns the dictionary map - pub fn map(&self) -> &HashedMap { - &self.map - } - /// Returns the dictionary keys pub fn keys(&self) -> &MutablePrimitiveArray { &self.keys @@ -160,7 +152,7 @@ impl MutableDictionaryArray { DictionaryArray::::try_new( self.data_type.clone(), std::mem::take(&mut self.keys).into(), - self.values.as_box(), + self.map.take_into(), ) .unwrap() } @@ -208,17 +200,20 @@ impl MutableArray for MutableDictio } } -impl TryExtend> for MutableDictionaryArray +impl TryExtend> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryExtend>, + M: MutableArray + Indexable + TryExtend>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_extend>>(&mut self, iter: II) -> Result<()> { for value in iter { if let Some(value) = value { - if self.try_push_valid(&value)? { - self.mut_values().try_extend(std::iter::once(Some(value)))?; - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); } @@ -230,19 +225,19 @@ where impl TryPush> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryPush>, - T: Hash, + M: MutableArray + Indexable + TryPush>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_push(&mut self, item: Option) -> Result<()> { if let Some(value) = item { - if self.try_push_valid(&value)? { - self.values.try_push(Some(value)) - } else { - Ok(()) - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_push(Some(v)))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); - Ok(()) } + Ok(()) } } diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs new file mode 100644 index 00000000000..35b59aaa2ac --- /dev/null +++ b/src/array/dictionary/value_map.rs @@ -0,0 +1,127 @@ +use std::borrow::Borrow; +use std::fmt::{self, Debug}; +use std::hash::{BuildHasher, Hash, Hasher}; + +use hashbrown::hash_map::RawEntryMut; +use hashbrown::HashMap; + +use crate::array::Array; +use crate::{ + array::indexable::{AsIndexed, Indexable}, + array::MutableArray, + datatypes::DataType, + error::{Error, Result}, +}; + +use super::DictionaryKey; + +#[derive(Clone)] +pub struct ValueMap { + values: M, + map: HashMap, // NB: *only* use insert_hashed_nocheck() and no other hashmap API +} + +impl ValueMap { + pub fn try_empty(values: M) -> Result { + if !values.is_empty() { + return Err(Error::InvalidArgumentError( + "initializing value map with non-empty values array".into(), + )); + } + Ok(Self { + values, + map: HashMap::default(), + }) + } + + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + let mut map = HashMap::with_capacity(values.len()); + for index in 0..values.len() { + let key = K::try_from(index).map_err(|_| Error::Overflow)?; + // safety: we only iterate within bounds + let value = unsafe { values.value_unchecked_at(index) }; + let mut hasher = map.hasher().build_hasher(); + value.borrow().hash(&mut hasher); + let hash = hasher.finish(); + match map.raw_entry_mut().from_hash(hash, |_| true) { + RawEntryMut::Occupied(_) => { + return Err(Error::InvalidArgumentError( + "duplicate value in dictionary values array".into(), + )) + } + RawEntryMut::Vacant(entry) => { + entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here! + } + } + } + Ok(Self { values, map }) + } + + pub fn data_type(&self) -> &DataType { + self.values.data_type() + } + + pub fn into_values(self) -> M { + self.values + } + + pub fn take_into(&mut self) -> Box { + let arr = self.values.as_box(); + self.map.clear(); + arr + } + + #[inline] + pub fn values(&self) -> &M { + &self.values + } + + /// Try to insert a value and return its index (it may or may not get inserted). + pub fn try_push_valid( + &mut self, + value: V, + mut push: impl FnMut(&mut M, V) -> Result<()>, + ) -> Result + where + M: Indexable, + V: AsIndexed, + M::Type: Eq + Hash, + { + let mut hasher = self.map.hasher().build_hasher(); + value.as_indexed().hash(&mut hasher); + let hash = hasher.finish(); + + Ok( + match self.map.raw_entry_mut().from_hash(hash, |key| { + // safety: we've already checked (the inverse) when we pushed it, so it should be ok? + let index = unsafe { key.as_usize() }; + // safety: invariant of the struct, it's always in bounds since we maintain it + let stored_value = unsafe { self.values.value_unchecked_at(index) }; + stored_value.borrow() == value.as_indexed() + }) { + RawEntryMut::Occupied(entry) => *entry.key(), + RawEntryMut::Vacant(entry) => { + let index = self.values.len(); + let key = K::try_from(index).map_err(|_| Error::Overflow)?; + entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here! + push(&mut self.values, value)?; + key + } + }, + ) + } + + pub fn shrink_to_fit(&mut self) { + self.values.shrink_to_fit(); + } +} + +impl Debug for ValueMap { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.values.fmt(f) + } +} diff --git a/src/array/indexable.rs b/src/array/indexable.rs new file mode 100644 index 00000000000..76001bfcf5c --- /dev/null +++ b/src/array/indexable.rs @@ -0,0 +1,197 @@ +use std::borrow::Borrow; + +use crate::{ + array::{ + MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray, + MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array, + MutableUtf8ValuesArray, + }, + offset::Offset, + types::NativeType, +}; + +/// Trait for arrays that can be indexed directly to extract a value. +pub trait Indexable { + /// The type of the element at index `i`; may be a reference type or a value type. + type Value<'a>: Borrow + where + Self: 'a; + + type Type: ?Sized; + + /// Returns the element at index `i`. + /// # Panic + /// May panic if `i >= self.len()`. + fn value_at(&self, index: usize) -> Self::Value<'_>; + + /// Returns the element at index `i`. + /// # Safety + /// Assumes that the `i < self.len`. + #[inline] + unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> { + self.value_at(index) + } +} + +pub trait AsIndexed { + fn as_indexed(&self) -> &M::Type; +} + +impl Indexable for MutableBooleanArray { + type Value<'a> = bool; + type Type = bool; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.values().get(i) + } +} + +impl AsIndexed for bool { + #[inline] + fn as_indexed(&self) -> &bool { + self + } +} + +impl Indexable for MutableBinaryArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + // TODO: add .value() / .value_unchecked() to MutableBinaryArray? + assert!(i < self.len()); + unsafe { self.value_unchecked_at(i) } + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + // TODO: add .value() / .value_unchecked() to MutableBinaryArray? + // soundness: the invariant of the function + let (start, end) = self.offsets().start_end_unchecked(i); + // soundness: the invariant of the struct + self.values().get_unchecked(start..end) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableBinaryValuesArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableFixedSizeBinaryArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + // soundness: the invariant of the struct + self.value_unchecked(i) + } +} + +impl AsIndexed for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +// TODO: should NativeType derive from Hash? +impl Indexable for MutablePrimitiveArray { + type Value<'a> = T; + type Type = T; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + assert!(i < self.len()); + // TODO: add Length trait? (for both Array and MutableArray) + unsafe { self.value_unchecked_at(i) } + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + *self.values().get_unchecked(i) + } +} + +impl AsIndexed> for T { + #[inline] + fn as_indexed(&self) -> &T { + self + } +} + +impl Indexable for MutableUtf8Array { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} + +impl Indexable for MutableUtf8ValuesArray { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} diff --git a/src/array/mod.rs b/src/array/mod.rs index 04b7b2c8e35..02735c3d0bb 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -720,8 +720,11 @@ mod utf8; mod equal; mod ffi; mod fmt; -pub mod growable; +#[doc(hidden)] +pub mod indexable; mod iterator; + +pub mod growable; pub mod ord; pub(crate) use iterator::ArrayAccessor; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 30b265e2d59..110288817a7 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -306,9 +306,9 @@ pub fn primitive_to_dictionary( from: &PrimitiveArray, ) -> Result> { let iter = from.iter().map(|x| x.copied()); - let mut array = MutableDictionaryArray::::from(MutablePrimitiveArray::::from( + let mut array = MutableDictionaryArray::::try_empty(MutablePrimitiveArray::::from( from.data_type().clone(), - )); + ))?; array.try_extend(iter)?; Ok(array.into()) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index b6103dcccfc..9570339893a 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -1,8 +1,11 @@ +use std::borrow::Borrow; +use std::collections::HashSet; +use std::fmt::Debug; +use std::hash::Hash; + +use arrow2::array::indexable::{AsIndexed, Indexable}; use arrow2::array::*; use arrow2::error::Result; -use hash_hasher::HashedMap; -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; #[test] fn primitive() -> Result<()> { @@ -61,16 +64,89 @@ fn push_utf8() { expected_keys.push(Some(0)); expected_keys.push(Some(1)); assert_eq!(*new.keys(), expected_keys); +} + +#[test] +fn into_empty() { + let mut new: MutableDictionaryArray> = MutableDictionaryArray::new(); + for value in [Some("A"), Some("B"), None, Some("C"), Some("A"), Some("B")] { + new.try_push(value).unwrap(); + } + let values = new.values().clone(); + let empty = new.into_empty(); + assert_eq!(empty.values(), &values); + assert!(empty.is_empty()); +} + +#[test] +fn from_values() { + let mut new: MutableDictionaryArray> = MutableDictionaryArray::new(); + for value in [Some("A"), Some("B"), None, Some("C"), Some("A"), Some("B")] { + new.try_push(value).unwrap(); + } + let mut values = new.values().clone(); + let empty = MutableDictionaryArray::::from_values(values.clone()).unwrap(); + assert_eq!(empty.values(), &values); + assert!(empty.is_empty()); + values.push(Some("A")); + assert!(MutableDictionaryArray::::from_values(values).is_err()); +} - let expected_map = ["A", "B", "C"] - .iter() - .enumerate() - .map(|(index, value)| { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - (hash, index as i32) - }) - .collect::>(); - assert_eq!(*new.map(), expected_map); +#[test] +fn try_empty() { + let mut values = MutableUtf8Array::::new(); + MutableDictionaryArray::::try_empty(values.clone()).unwrap(); + values.push(Some("A")); + assert!(MutableDictionaryArray::::try_empty(values.clone()).is_err()); +} + +fn test_push_ex(values: Vec, gen: impl Fn(usize) -> T) +where + M: MutableArray + Indexable + TryPush> + TryExtend> + Default + 'static, + M::Type: Eq + Hash + Debug, + T: AsIndexed + Default + Clone + Eq + Hash, +{ + for is_extend in [false, true] { + let mut set = HashSet::new(); + let mut arr = MutableDictionaryArray::::new(); + macro_rules! push { + ($v:expr) => { + if is_extend { + arr.try_extend(std::iter::once($v)) + } else { + arr.try_push($v) + } + }; + } + arr.push_null(); + push!(None).unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr.values().len(), 0); + for (i, v) in values.iter().cloned().enumerate() { + push!(Some(v.clone())).unwrap(); + let is_dup = !set.insert(v.clone()); + if !is_dup { + assert_eq!(arr.values().value_at(i).borrow(), v.as_indexed()); + assert_eq!(arr.keys().value_at(arr.keys().len() - 1), i as u8); + } + assert_eq!(arr.values().len(), set.len()); + assert_eq!(arr.len(), 3 + i); + } + for i in 0..256 - set.len() { + push!(Some(gen(i))).unwrap(); + } + assert!(push!(Some(gen(256))).is_err()); + } +} + +#[test] +fn test_push_utf8_ex() { + test_push_ex::, _>(vec!["a".into(), "b".into(), "a".into()], |i| { + i.to_string() + }) +} + +#[test] +fn test_push_i64_ex() { + test_push_ex::, _>(vec![10, 20, 30, 20], |i| 1000 + i as i64); }