Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

2nd (safe) rewrite of MutableDictionaryArray #1561

Merged
merged 5 commits into from
Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3"
simdutf8 = "0.1.4"

# A Rust port of SwissTable
hashbrown = { version = "0.14", default-features = false, optional = true }
hashbrown = { version = "0.14", default-features = false, features = ["ahash"] }

# for timezone support
chrono-tz = { version = "0.8", optional = true }
Expand Down Expand Up @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"]
compute_nullif = ["compute_comparison"]
compute_partition = ["compute_sort"]
compute_regex_match = ["regex"]
compute_sort = ["compute_take", "hashbrown"]
compute_sort = ["compute_take"]
compute_substring = []
compute_take = []
compute_temporal = []
Expand Down
4 changes: 3 additions & 1 deletion src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::hash::Hash;
use std::hint::unreachable_unchecked;

use crate::{
Expand All @@ -20,6 +21,7 @@ mod iterator;
mod mutable;
use crate::array::specification::check_indexes_unchecked;
mod typed_iterator;
mod value_map;

use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped};
pub use iterator::*;
Expand All @@ -33,7 +35,7 @@ use super::{new_null_array, specification::check_indexes};
///
/// Any implementation of this trait must ensure that `always_fits_usize` only
/// returns `true` if all values succeeds on `value::try_into::<usize>().unwrap()`.
pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> {
pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> + Hash {
/// The corresponding [`IntegerType`] of this key
const KEY_TYPE: IntegerType;

Expand Down
151 changes: 73 additions & 78 deletions src/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
use std::hash::{Hash, Hasher};
use std::{collections::hash_map::DefaultHasher, sync::Arc};

use hash_hasher::HashedMap;
use std::hash::Hash;
use std::sync::Arc;

use crate::array::indexable::{AsIndexed, Indexable};
use crate::{
array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush},
bitmap::MutableBitmap,
datatypes::DataType,
error::{Error, Result},
error::Result,
};

use super::value_map::ValueMap;
use super::{DictionaryArray, DictionaryKey};

/// A mutable, strong-typed version of [`DictionaryArray`].
Expand All @@ -30,55 +30,29 @@ use super::{DictionaryArray, DictionaryKey};
#[derive(Debug)]
pub struct MutableDictionaryArray<K: DictionaryKey, M: MutableArray> {
data_type: DataType,
map: ValueMap<K, M>,
// invariant: `max(keys) < map.values().len()`
keys: MutablePrimitiveArray<K>,
map: HashedMap<u64, K>,
// invariant: `keys.len() <= values.len()`
values: M,
}

impl<K: DictionaryKey, M: MutableArray> From<MutableDictionaryArray<K, M>> for DictionaryArray<K> {
fn from(mut other: MutableDictionaryArray<K, M>) -> Self {
fn from(other: MutableDictionaryArray<K, M>) -> Self {
// Safety - the invariant of this struct ensures that this is up-held
unsafe {
DictionaryArray::<K>::try_new_unchecked(
other.data_type,
other.keys.into(),
other.values.as_box(),
other.map.into_values().as_box(),
)
.unwrap()
}
}
}

impl<K: DictionaryKey, M: MutableArray> From<M> for MutableDictionaryArray<K, M> {
fn from(values: M) -> Self {
Self {
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
}
}
}

impl<K: DictionaryKey, M: MutableArray + Default> MutableDictionaryArray<K, M> {
/// Creates an empty [`MutableDictionaryArray`].
pub fn new() -> Self {
let values = M::default();
Self {
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
}
Self::try_empty(M::default()).unwrap()
}
}

Expand All @@ -89,38 +63,61 @@ impl<K: DictionaryKey, M: MutableArray + Default> Default for MutableDictionaryA
}

impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {
/// Returns whether the value should be pushed to the values or not
fn try_push_valid<T: Hash>(&mut self, value: &T) -> Result<bool> {
let mut hasher = DefaultHasher::new();
value.hash(&mut hasher);
let hash = hasher.finish();
match self.map.get(&hash) {
Some(key) => {
self.keys.push(Some(*key));
Ok(false)
}
None => {
let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?;
self.map.insert(hash, key);
self.keys.push(Some(key));
Ok(true)
}
/// Creates an empty [`MutableDictionaryArray`] from a given empty values array.
/// # Errors
/// Errors if the array is non-empty.
pub fn try_empty(values: M) -> Result<Self> {
Ok(Self::from_value_map(ValueMap::<K, M>::try_empty(values)?))
}

/// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values.
/// Indices associated with those values are automatically assigned based on the order of
/// the values.
/// # Errors
/// Errors if there's more values than the maximum value of `K` or if values are not unique.
pub fn from_values(values: M) -> Result<Self>
where
M: Indexable,
M::Type: Eq + Hash,
{
Ok(Self::from_value_map(ValueMap::<K, M>::from_values(values)?))
}

fn from_value_map(value_map: ValueMap<K, M>) -> Self {
let keys = MutablePrimitiveArray::<K>::new();
let data_type =
DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false);
Self {
data_type,
map: value_map,
keys,
}
}

/// Creates an empty [`MutableDictionaryArray`] retaining the same dictionary as the current
/// mutable dictionary array, but with no data. This may come useful when serializing the
/// array into multiple chunks, where there's a requirement that the dictionary is the same.
/// No copying is performed, the value map is moved over to the new array.
pub fn into_empty(self) -> Self {
Self::from_value_map(self.map)
}

/// Same as `into_empty` but clones the inner value map instead of taking full ownership.
pub fn to_empty(&self) -> Self
where
M: Clone,
{
Self::from_value_map(self.map.clone())
}

/// pushes a null value
pub fn push_null(&mut self) {
self.keys.push(None)
}

/// returns a mutable reference to the inner values.
fn mut_values(&mut self) -> &mut M {
&mut self.values
}

/// returns a reference to the inner values.
pub fn values(&self) -> &M {
&self.values
self.map.values()
}

/// converts itself into [`Arc<dyn Array>`]
Expand All @@ -142,15 +139,10 @@ impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {

/// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
self.map.shrink_to_fit();
self.keys.shrink_to_fit();
}

/// Returns the dictionary map
pub fn map(&self) -> &HashedMap<u64, K> {
&self.map
}

/// Returns the dictionary keys
pub fn keys(&self) -> &MutablePrimitiveArray<K> {
&self.keys
Expand All @@ -160,7 +152,7 @@ impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {
DictionaryArray::<K>::try_new(
self.data_type.clone(),
std::mem::take(&mut self.keys).into(),
self.values.as_box(),
self.map.take_into(),
)
.unwrap()
}
Expand Down Expand Up @@ -208,17 +200,20 @@ impl<K: DictionaryKey, M: 'static + MutableArray> MutableArray for MutableDictio
}
}

impl<K, M, T: Hash> TryExtend<Option<T>> for MutableDictionaryArray<K, M>
impl<K, M, T> TryExtend<Option<T>> for MutableDictionaryArray<K, M>
where
K: DictionaryKey,
M: MutableArray + TryExtend<Option<T>>,
M: MutableArray + Indexable + TryExtend<Option<T>>,
T: AsIndexed<M>,
M::Type: Eq + Hash,
{
fn try_extend<II: IntoIterator<Item = Option<T>>>(&mut self, iter: II) -> Result<()> {
for value in iter {
if let Some(value) = value {
if self.try_push_valid(&value)? {
self.mut_values().try_extend(std::iter::once(Some(value)))?;
}
let key = self
.map
.try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?;
self.keys.try_push(Some(key))?;
} else {
self.push_null();
}
Expand All @@ -230,19 +225,19 @@ where
impl<K, M, T> TryPush<Option<T>> for MutableDictionaryArray<K, M>
where
K: DictionaryKey,
M: MutableArray + TryPush<Option<T>>,
T: Hash,
M: MutableArray + Indexable + TryPush<Option<T>>,
T: AsIndexed<M>,
M::Type: Eq + Hash,
{
fn try_push(&mut self, item: Option<T>) -> Result<()> {
if let Some(value) = item {
if self.try_push_valid(&value)? {
self.values.try_push(Some(value))
} else {
Ok(())
}
let key = self
.map
.try_push_valid(value, |arr, v| arr.try_push(Some(v)))?;
self.keys.try_push(Some(key))?;
} else {
self.push_null();
Ok(())
}
Ok(())
}
}
Loading