From cdbc9580a7176ea1bef97089e192469feb4d6c3c Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Wed, 22 Sep 2021 16:12:51 +0100 Subject: [PATCH] Improved growable. (#434) --- src/array/growable/binary.rs | 3 +-- src/array/growable/boolean.rs | 5 ++-- src/array/growable/dictionary.rs | 39 +++++++++++++++--------------- src/array/growable/fixed_binary.rs | 5 ++-- src/array/growable/list.rs | 5 ++-- src/array/growable/mod.rs | 16 ++++++------ src/array/growable/null.rs | 1 + src/array/growable/primitive.rs | 27 ++++++++++++--------- src/array/growable/structure.rs | 5 ++-- src/array/growable/utf8.rs | 5 ++-- src/array/growable/utils.rs | 21 +++------------- 11 files changed, 63 insertions(+), 69 deletions(-) diff --git a/src/array/growable/binary.rs b/src/array/growable/binary.rs index caca72a272b..3a2b36fb11f 100644 --- a/src/array/growable/binary.rs +++ b/src/array/growable/binary.rs @@ -20,12 +20,11 @@ pub struct GrowableBinary<'a, O: Offset> { values: MutableBuffer, offsets: MutableBuffer, length: O, // always equal to the last offset at `offsets`. - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableBinary<'a, O> { + /// Creates a new [`GrowableBinary`] bound to `arrays` with a pre-allocated `capacity`. /// # Panics /// If `arrays` is empty. pub fn new(arrays: Vec<&'a BinaryArray>, mut use_validity: bool, capacity: usize) -> Self { diff --git a/src/array/growable/boolean.rs b/src/array/growable/boolean.rs index 1dca6c1b1eb..1ab512ceacc 100644 --- a/src/array/growable/boolean.rs +++ b/src/array/growable/boolean.rs @@ -17,12 +17,13 @@ pub struct GrowableBoolean<'a> { data_type: DataType, validity: MutableBitmap, values: MutableBitmap, - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. 
extend_null_bits: Vec>, } impl<'a> GrowableBoolean<'a> { + /// Creates a new [`GrowableBoolean`] bound to `arrays` with a pre-allocated `capacity`. + /// # Panics + /// If `arrays` is empty. pub fn new(arrays: Vec<&'a BooleanArray>, mut use_validity: bool, capacity: usize) -> Self { let data_type = arrays[0].data_type().clone(); diff --git a/src/array/growable/dictionary.rs b/src/array/growable/dictionary.rs index ac9e98f5a7a..4be93a53a2b 100644 --- a/src/array/growable/dictionary.rs +++ b/src/array/growable/dictionary.rs @@ -2,22 +2,27 @@ use std::sync::Arc; use crate::{ array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}, - bitmap::{Bitmap, MutableBitmap}, + bitmap::MutableBitmap, buffer::MutableBuffer, }; -use super::{make_growable, utils::extend_validity, Growable}; +use super::{ + make_growable, + utils::{build_extend_null_bits, ExtendNullBits}, + Growable, +}; /// Concrete [`Growable`] for the [`DictionaryArray`]. -#[derive(Debug)] +/// # Implementation +/// This growable does not perform collision checks and instead concatenates +/// the values of each [`DictionaryArray`] one after the other. pub struct GrowableDictionary<'a, K: DictionaryKey> { keys_values: Vec<&'a [K]>, - keys_validities: Vec<&'a Option>, key_values: MutableBuffer, key_validity: MutableBitmap, - use_validity: bool, offsets: Vec, values: Arc, + extend_null_bits: Vec>, } fn concatenate_values( @@ -36,6 +41,9 @@ fn concatenate_values( } impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { + /// Creates a new [`GrowableDictionary`] bound to `arrays` with a pre-allocated `capacity`. + /// # Panics + /// If `arrays` is empty. pub fn new(arrays: &[&'a DictionaryArray], mut use_validity: bool, capacity: usize) -> Self { // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. 
@@ -48,10 +56,11 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { .iter() .map(|array| array.values().as_slice()) .collect::>(); - let keys_validities = arrays_keys + + let extend_null_bits = arrays .iter() - .map(|array| array.validity()) - .collect::>(); + .map(|array| build_extend_null_bits(array.keys(), use_validity)) + .collect(); let arrays_values = arrays .iter() @@ -63,11 +72,10 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { Self { offsets, values, - use_validity, keys_values, - keys_validities, key_values: MutableBuffer::with_capacity(capacity), key_validity: MutableBitmap::with_capacity(capacity), + extend_null_bits, } } @@ -85,13 +93,7 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { impl<'a, T: DictionaryKey> Growable<'a> for GrowableDictionary<'a, T> { #[inline] fn extend(&mut self, index: usize, start: usize, len: usize) { - extend_validity( - &mut self.key_validity, - self.keys_validities[index], - start, - len, - self.use_validity, - ); + (self.extend_null_bits[index])(&mut self.key_validity, start, len); let values = &self.keys_values[index][start..start + len]; let offset = self.offsets[index]; @@ -104,8 +106,7 @@ impl<'a, T: DictionaryKey> Growable<'a> for GrowableDictionary<'a, T> { #[inline] fn extend_validity(&mut self, additional: usize) { - self.key_values - .resize(self.key_values.len() + additional, T::default()); + self.key_values.extend_constant(additional, T::default()); self.key_validity.extend_constant(additional, false); } diff --git a/src/array/growable/fixed_binary.rs b/src/array/growable/fixed_binary.rs index 945786968b8..b9472a972b3 100644 --- a/src/array/growable/fixed_binary.rs +++ b/src/array/growable/fixed_binary.rs @@ -16,13 +16,14 @@ pub struct GrowableFixedSizeBinary<'a> { arrays: Vec<&'a FixedSizeBinaryArray>, validity: MutableBitmap, values: MutableBuffer, - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. 
extend_null_bits: Vec>, size: usize, // just a cache } impl<'a> GrowableFixedSizeBinary<'a> { + /// Creates a new [`GrowableFixedSizeBinary`] bound to `arrays` with a pre-allocated `capacity`. + /// # Panics + /// If `arrays` is empty. pub fn new( arrays: Vec<&'a FixedSizeBinaryArray>, mut use_validity: bool, diff --git a/src/array/growable/list.rs b/src/array/growable/list.rs index 94a623c6cf9..bd2f805cf77 100644 --- a/src/array/growable/list.rs +++ b/src/array/growable/list.rs @@ -61,12 +61,13 @@ pub struct GrowableList<'a, O: Offset> { values: Box + 'a>, offsets: MutableBuffer, last_offset: O, // always equal to the last offset at `offsets`. - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableList<'a, O> { + /// Creates a new [`GrowableList`] bound to `arrays` with a pre-allocated `capacity`. + /// # Panics + /// If `arrays` is empty. pub fn new(arrays: Vec<&'a ListArray>, mut use_validity: bool, capacity: usize) -> Self { // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index 7f57b87a6f7..f9c3bf6ce25 100644 --- a/src/array/growable/mod.rs +++ b/src/array/growable/mod.rs @@ -25,9 +25,9 @@ pub use dictionary::GrowableDictionary; mod utils; -/// A trait describing a struct that can be extended from slices of pre-existing [`Array`]s. -/// This is used in operations where a new array is built out of other arrays such, -/// as filtering and concatenation. 
pub trait Growable<'a> { /// Extends this [`Growable`] with elements from the bounded [`Array`] at index `index` from /// a slice starting at `start` and length `len`. @@ -38,13 +38,13 @@ pub trait Growable<'a> { /// Extends this [`Growable`] with null elements, disregarding the bound arrays fn extend_validity(&mut self, additional: usize); - /// Converts itself to an `Arc`, thereby finishing the mutation. - /// Self will be empty after such operation + /// Converts this [`Growable`] to an [`Arc`], thereby finishing the mutation. + /// Self will be empty after such operation. fn as_arc(&mut self) -> std::sync::Arc { self.as_box().into() } - /// Converts itself to an `Box`, thereby finishing the mutation. + /// Converts this [`Growable`] to a [`Box`], thereby finishing the mutation. /// Self will be empty after such operation fn as_box(&mut self) -> Box; } @@ -82,11 +82,11 @@ macro_rules! dyn_dict_growable { }}; } -/// Creates a new [`Growable`] from an arbitrary number of dynamic [`Array`]s. +/// Creates a new [`Growable`] from an arbitrary number of [`Array`]s. /// # Panics /// This function panics iff /// * the arrays do not have the same [`DataType`]. -/// * `arrays.is_empty`. +/// * `arrays.is_empty()`. pub fn make_growable<'a>( arrays: &[&'a dyn Array], use_validity: bool, diff --git a/src/array/growable/null.rs b/src/array/growable/null.rs index 725968521fe..b96707870a3 100644 --- a/src/array/growable/null.rs +++ b/src/array/growable/null.rs @@ -20,6 +20,7 @@ impl Default for GrowableNull { } impl GrowableNull { + /// Creates a new [`GrowableNull`]. 
pub fn new(data_type: DataType) -> Self { Self { data_type, diff --git a/src/array/growable/primitive.rs b/src/array/growable/primitive.rs index 118bd79e6b3..1485937a31d 100644 --- a/src/array/growable/primitive.rs +++ b/src/array/growable/primitive.rs @@ -2,25 +2,30 @@ use std::sync::Arc; use crate::{ array::{Array, PrimitiveArray}, - bitmap::{Bitmap, MutableBitmap}, + bitmap::MutableBitmap, buffer::MutableBuffer, datatypes::DataType, types::NativeType, }; -use super::{utils::extend_validity, Growable}; +use super::{ + utils::{build_extend_null_bits, ExtendNullBits}, + Growable, +}; /// Concrete [`Growable`] for the [`PrimitiveArray`]. pub struct GrowablePrimitive<'a, T: NativeType> { data_type: DataType, arrays: Vec<&'a [T]>, - validities: Vec<&'a Option>, - use_validity: bool, validity: MutableBitmap, values: MutableBuffer, + extend_null_bits: Vec>, } impl<'a, T: NativeType> GrowablePrimitive<'a, T> { + /// Creates a new [`GrowablePrimitive`] bound to `arrays` with a pre-allocated `capacity`. + /// # Panics + /// If `arrays` is empty. 
pub fn new( arrays: Vec<&'a PrimitiveArray>, mut use_validity: bool, @@ -33,10 +38,12 @@ impl<'a, T: NativeType> GrowablePrimitive<'a, T> { }; let data_type = arrays[0].data_type().clone(); - let validities = arrays + + let extend_null_bits = arrays .iter() - .map(|array| array.validity()) - .collect::>(); + .map(|array| build_extend_null_bits(*array, use_validity)) + .collect(); + let arrays = arrays .iter() .map(|array| array.values().as_slice()) @@ -45,10 +52,9 @@ impl<'a, T: NativeType> GrowablePrimitive<'a, T> { Self { data_type, arrays, - validities, - use_validity, values: MutableBuffer::with_capacity(capacity), validity: MutableBitmap::with_capacity(capacity), + extend_null_bits, } } @@ -64,8 +70,7 @@ impl<'a, T: NativeType> GrowablePrimitive<'a, T> { impl<'a, T: NativeType> Growable<'a> for GrowablePrimitive<'a, T> { #[inline] fn extend(&mut self, index: usize, start: usize, len: usize) { - let validity = self.validities[index]; - extend_validity(&mut self.validity, validity, start, len, self.use_validity); + (self.extend_null_bits[index])(&mut self.validity, start, len); let values = self.arrays[index]; self.values.extend_from_slice(&values[start..start + len]); diff --git a/src/array/growable/structure.rs b/src/array/growable/structure.rs index 620fe191509..bf487dd0fd8 100644 --- a/src/array/growable/structure.rs +++ b/src/array/growable/structure.rs @@ -17,14 +17,13 @@ pub struct GrowableStruct<'a> { arrays: Vec<&'a StructArray>, validity: MutableBitmap, values: Vec + 'a>>, - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. extend_null_bits: Vec>, } impl<'a> GrowableStruct<'a> { + /// Creates a new [`GrowableStruct`] bound to `arrays` with a pre-allocated `capacity`. /// # Panics - /// This function panics if any of the `arrays` is not downcastable to `PrimitiveArray`. + /// If `arrays` is empty. 
pub fn new(arrays: Vec<&'a StructArray>, mut use_validity: bool, capacity: usize) -> Self { // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index 355d1e20121..f17095dab5c 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -18,12 +18,13 @@ pub struct GrowableUtf8<'a, O: Offset> { values: MutableBuffer, offsets: MutableBuffer, length: O, // always equal to the last offset at `offsets`. - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableUtf8<'a, O> { + /// Creates a new [`GrowableUtf8`] bound to `arrays` with a pre-allocated `capacity`. + /// # Panics + /// If `arrays` is empty. pub fn new(arrays: Vec<&'a Utf8Array>, mut use_validity: bool, capacity: usize) -> Self { // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. diff --git a/src/array/growable/utils.rs b/src/array/growable/utils.rs index 6a67988e2d8..597585244d8 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -1,6 +1,6 @@ use crate::{ array::{Array, Offset}, - bitmap::{Bitmap, MutableBitmap}, + bitmap::MutableBitmap, buffer::MutableBuffer, }; @@ -18,6 +18,8 @@ pub(super) fn extend_offsets( }); } +// function used to extend nulls from arrays. This function's lifetime is bound to the array +// because it reads nulls from it. 
pub(super) type ExtendNullBits<'a> = Box; pub(super) fn build_extend_null_bits(array: &dyn Array, use_validity: bool) -> ExtendNullBits { @@ -36,23 +38,6 @@ pub(super) fn build_extend_null_bits(array: &dyn Array, use_validity: bool) -> E } } -#[inline] -pub(super) fn extend_validity( - mutable_validity: &mut MutableBitmap, - validity: &Option, - start: usize, - len: usize, - use_validity: bool, -) { - if let Some(bitmap) = validity { - assert!(start + len <= bitmap.len()); - let (slice, offset, _) = bitmap.as_slice(); - mutable_validity.extend_from_slice(slice, start + offset, len); - } else if use_validity { - mutable_validity.extend_constant(len, true); - }; -} - #[inline] pub(super) fn extend_offset_values( buffer: &mut MutableBuffer,