diff --git a/src/array/binary/iterator.rs b/src/array/binary/iterator.rs index 4c80a780423..43f85176197 100644 --- a/src/array/binary/iterator.rs +++ b/src/array/binary/iterator.rs @@ -1,68 +1,26 @@ -use crate::{array::Offset, bitmap::utils::ZipValidity, trusted_len::TrustedLen}; +use crate::{ + array::{ArrayAccessor, ArrayValuesIter, Offset}, + bitmap::utils::ZipValidity, +}; use super::BinaryArray; -/// Iterator over slices of `&[u8]`. -#[derive(Debug, Clone)] -pub struct BinaryValueIter<'a, O: Offset> { - array: &'a BinaryArray, - index: usize, - end: usize, -} - -impl<'a, O: Offset> BinaryValueIter<'a, O> { - /// Creates a new [`BinaryValueIter`] - pub fn new(array: &'a BinaryArray) -> Self { - Self { - array, - index: 0, - end: array.len(), - } - } -} - -impl<'a, O: Offset> Iterator for BinaryValueIter<'a, O> { +unsafe impl<'a, O: Offset> ArrayAccessor<'a> for BinaryArray { type Item = &'a [u8]; #[inline] - fn next(&mut self) -> Option { - if self.index == self.end { - return None; - } - let old = self.index; - self.index += 1; - Some(unsafe { self.array.value_unchecked(old) }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - (self.end - self.index, Some(self.end - self.index)) + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) } #[inline] - fn nth(&mut self, n: usize) -> Option { - let new_index = self.index + n; - if new_index > self.end { - self.index = self.end; - None - } else { - self.index = new_index; - self.next() - } + fn len(&self) -> usize { + self.len() } } -impl<'a, O: Offset> DoubleEndedIterator for BinaryValueIter<'a, O> { - #[inline] - fn next_back(&mut self) -> Option { - if self.index == self.end { - None - } else { - self.end -= 1; - Some(unsafe { self.array.value_unchecked(self.end) }) - } - } -} +/// Iterator of values of an [`BinaryArray`]. 
+pub type BinaryValueIter<'a, O> = ArrayValuesIter<'a, BinaryArray>; impl<'a, O: Offset> IntoIterator for &'a BinaryArray { type Item = Option<&'a [u8]>; @@ -72,5 +30,3 @@ impl<'a, O: Offset> IntoIterator for &'a BinaryArray { self.iter() } } - -unsafe impl TrustedLen for BinaryValueIter<'_, O> {} diff --git a/src/array/iterator.rs b/src/array/iterator.rs new file mode 100644 index 00000000000..a6cc51281e2 --- /dev/null +++ b/src/array/iterator.rs @@ -0,0 +1,83 @@ +use crate::trusted_len::TrustedLen; + +mod private { + pub trait Sealed {} + + impl<'a, T: super::ArrayAccessor<'a>> Sealed for T {} +} + +/// +/// # Safety +/// Implementers of this trait guarantee that +/// `value_unchecked` is safe when called with any `index` +/// strictly smaller than the value returned by `len`. +pub unsafe trait ArrayAccessor<'a>: private::Sealed { + type Item: 'a; + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item; + fn len(&self) -> usize; +} + +/// Iterator of values of an `ArrayAccessor`. +#[derive(Debug, Clone)] +pub struct ArrayValuesIter<'a, A: ArrayAccessor<'a>> { + array: &'a A, + index: usize, + end: usize, +} + +impl<'a, A: ArrayAccessor<'a>> ArrayValuesIter<'a, A> { + /// Creates a new [`ArrayValuesIter`] + #[inline] + pub fn new(array: &'a A) -> Self { + Self { + array, + index: 0, + end: array.len(), + } + } +} + +impl<'a, A: ArrayAccessor<'a>> Iterator for ArrayValuesIter<'a, A> { + type Item = A::Item; + + #[inline] + fn next(&mut self) -> Option { + if self.index == self.end { + return None; + } + let old = self.index; + self.index += 1; + Some(unsafe { self.array.value_unchecked(old) }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (self.end - self.index, Some(self.end - self.index)) + } + + #[inline] + fn nth(&mut self, n: usize) -> Option { + let new_index = self.index + n; + if new_index > self.end { + self.index = self.end; + None + } else { + self.index = new_index; + self.next() + } + } +} + +impl<'a, A: ArrayAccessor<'a>> DoubleEndedIterator 
for ArrayValuesIter<'a, A> { + #[inline] + fn next_back(&mut self) -> Option { + if self.index == self.end { + None + } else { + self.end -= 1; + Some(unsafe { self.array.value_unchecked(self.end) }) + } + } +} + +unsafe impl<'a, A: ArrayAccessor<'a>> TrustedLen for ArrayValuesIter<'a, A> {} diff --git a/src/array/mod.rs b/src/array/mod.rs index 4a5e7fceba2..b78bb3f3b81 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -377,8 +377,12 @@ mod equal; mod ffi; mod fmt; pub mod growable; +mod iterator; pub mod ord; +pub(crate) use iterator::ArrayAccessor; +pub use iterator::ArrayValuesIter; + pub use equal::equal; pub use fmt::{get_display, get_value_display}; @@ -394,7 +398,7 @@ pub use null::NullArray; pub use primitive::*; pub use struct_::{MutableStructArray, StructArray}; pub use union::UnionArray; -pub use utf8::{MutableUtf8Array, Utf8Array, Utf8ValuesIter}; +pub use utf8::{MutableUtf8Array, MutableUtf8ValuesArray, Utf8Array, Utf8ValuesIter}; pub(crate) use self::ffi::offset_buffers_children_dictionary; pub(crate) use self::ffi::FromFfi; diff --git a/src/array/utf8/iterator.rs b/src/array/utf8/iterator.rs index 6a725c6254b..1953d4352fc 100644 --- a/src/array/utf8/iterator.rs +++ b/src/array/utf8/iterator.rs @@ -1,77 +1,79 @@ +use crate::array::{ArrayAccessor, ArrayValuesIter, Offset}; use crate::bitmap::utils::ZipValidity; -use crate::{array::Offset, trusted_len::TrustedLen}; -use super::Utf8Array; +use super::{MutableUtf8Array, MutableUtf8ValuesArray, Utf8Array}; -/// Iterator of values of an `Utf8Array`. 
-#[derive(Debug, Clone)] -pub struct Utf8ValuesIter<'a, O: Offset> { - array: &'a Utf8Array, - index: usize, - end: usize, +unsafe impl<'a, O: Offset> ArrayAccessor<'a> for Utf8Array { + type Item = &'a str; + + #[inline] + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) + } + + #[inline] + fn len(&self) -> usize { + self.len() + } } -impl<'a, O: Offset> Utf8ValuesIter<'a, O> { - /// Creates a new [`Utf8ValuesIter`] - pub fn new(array: &'a Utf8Array) -> Self { - Self { - array, - index: 0, - end: array.len(), - } +/// Iterator of values of an [`Utf8Array`]. +pub type Utf8ValuesIter<'a, O> = ArrayValuesIter<'a, Utf8Array>; + +impl<'a, O: Offset> IntoIterator for &'a Utf8Array { + type Item = Option<&'a str>; + type IntoIter = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, O>>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() } } -impl<'a, O: Offset> Iterator for Utf8ValuesIter<'a, O> { +unsafe impl<'a, O: Offset> ArrayAccessor<'a> for MutableUtf8Array { type Item = &'a str; #[inline] - fn next(&mut self) -> Option { - if self.index == self.end { - return None; - } - let old = self.index; - self.index += 1; - Some(unsafe { self.array.value_unchecked(old) }) + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) } #[inline] - fn size_hint(&self) -> (usize, Option) { - (self.end - self.index, Some(self.end - self.index)) + fn len(&self) -> usize { + self.len() } +} - #[inline] - fn nth(&mut self, n: usize) -> Option { - let new_index = self.index + n; - if new_index > self.end { - self.index = self.end; - None - } else { - self.index = new_index; - self.next() - } +/// Iterator of values of an [`MutableUtf8ValuesArray`]. 
+pub type MutableUtf8ValuesIter<'a, O> = ArrayValuesIter<'a, MutableUtf8ValuesArray>; + +impl<'a, O: Offset> IntoIterator for &'a MutableUtf8Array { + type Item = Option<&'a str>; + type IntoIter = ZipValidity<'a, &'a str, MutableUtf8ValuesIter<'a, O>>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() } } -impl<'a, O: Offset> DoubleEndedIterator for Utf8ValuesIter<'a, O> { +unsafe impl<'a, O: Offset> ArrayAccessor<'a> for MutableUtf8ValuesArray { + type Item = &'a str; + #[inline] - fn next_back(&mut self) -> Option { - if self.index == self.end { - None - } else { - self.end -= 1; - Some(unsafe { self.array.value_unchecked(self.end) }) - } + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) + } + + #[inline] + fn len(&self) -> usize { + self.len() } } -impl<'a, O: Offset> IntoIterator for &'a Utf8Array { - type Item = Option<&'a str>; - type IntoIter = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, O>>; +impl<'a, O: Offset> IntoIterator for &'a MutableUtf8ValuesArray { + type Item = &'a str; + type IntoIter = ArrayValuesIter<'a, MutableUtf8ValuesArray>; fn into_iter(self) -> Self::IntoIter { self.iter() } } - -unsafe impl TrustedLen for Utf8ValuesIter<'_, O> {} diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 67c6c9c3a39..21b7d50ed32 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -21,8 +21,19 @@ pub(super) mod fmt; mod from; mod iterator; mod mutable; +mod mutable_values; pub use iterator::*; pub use mutable::*; +pub use mutable_values::MutableUtf8ValuesArray; + +// Auxiliary struct to allow presenting &str as [u8] to a generic function +pub(super) struct StrAsBytes

(P); +impl> AsRef<[u8]> for StrAsBytes { + #[inline(always)] + fn as_ref(&self) -> &[u8] { + self.0.as_ref().as_bytes() + } +} /// A [`Utf8Array`] is arrow's semantic equivalent of an immutable `Vec>`. /// Cloning and slicing this struct is `O(1)`. diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 0b87b97c25c..0cc1f8d1c5f 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -1,59 +1,35 @@ use std::{iter::FromIterator, sync::Arc}; +use crate::array::physical_binary::*; use crate::{ - array::{ - specification::{check_offsets_minimal, try_check_offsets_and_utf8}, - Array, MutableArray, Offset, TryExtend, TryPush, + array::{Array, MutableArray, Offset, TryExtend, TryPush}, + bitmap::{ + utils::{zip_validity, ZipValidity}, + Bitmap, MutableBitmap, }, - bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, trusted_len::TrustedLen, }; -use super::Utf8Array; -use crate::array::physical_binary::*; -use crate::bitmap::Bitmap; - -struct StrAsBytes

(P); -impl> AsRef<[u8]> for StrAsBytes { - #[inline] - fn as_ref(&self) -> &[u8] { - self.0.as_ref().as_bytes() - } -} +use super::{MutableUtf8ValuesArray, MutableUtf8ValuesIter, StrAsBytes, Utf8Array}; -/// The mutable version of [`Utf8Array`]. See [`MutableArray`] for more details. +/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs +/// from [`MutableUtf8ValuesArray`] in that it can build nullable [`Utf8Array`]s. #[derive(Debug)] pub struct MutableUtf8Array { - data_type: DataType, - offsets: Vec, - values: Vec, + values: MutableUtf8ValuesArray, validity: Option, } impl From> for Utf8Array { fn from(other: MutableUtf8Array) -> Self { - // Safety: - // `MutableUtf8Array` has the same invariants as `Utf8Array` and thus - // `Utf8Array` can be safely created from `MutableUtf8Array` without checks. let validity = other.validity.and_then(|x| { - let bitmap: Bitmap = x.into(); - if bitmap.unset_bits() == 0 { - None - } else { - Some(bitmap) - } + let validity: Option = x.into(); + validity }); - - unsafe { - Utf8Array::::from_data_unchecked( - other.data_type, - other.offsets.into(), - other.values.into(), - validity, - ) - } + let array: Utf8Array = other.values.into(); + array.with_validity(validity) } } @@ -67,9 +43,7 @@ impl MutableUtf8Array { /// Initializes a new empty [`MutableUtf8Array`]. 
pub fn new() -> Self { Self { - data_type: Self::default_data_type(), - offsets: vec![O::default()], - values: Vec::::new(), + values: Default::default(), validity: None, } } @@ -91,71 +65,65 @@ impl MutableUtf8Array { values: Vec, validity: Option, ) -> Result { - try_check_offsets_and_utf8(&offsets, &values)?; + let values = MutableUtf8ValuesArray::try_new(data_type, offsets, values)?; + if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != values.len()) { return Err(Error::oos( "validity's length must be equal to the number of values", )); } - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - return Err(Error::oos( - "MutableUtf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8", - )); - } - - Ok(Self { - data_type, - offsets, - values, - validity, - }) + Ok(Self { values, validity }) } - /// The canonical method to create a [`MutableUtf8Array`] out of low-end APIs. + /// Create a [`MutableUtf8Array`] out of low-end APIs. + /// # Safety + /// The caller must ensure that every value between offsets is a valid utf8. /// # Panics /// This function panics iff: /// * The `offsets` and `values` are inconsistent - /// * The `values` between `offsets` are not utf8 encoded /// * The validity is not `None` and its length is different from `offsets`'s length minus one. - pub fn from_data( + pub unsafe fn new_unchecked( data_type: DataType, offsets: Vec, values: Vec, validity: Option, ) -> Self { - Self::try_new(data_type, offsets, values, validity).unwrap() + Self::from_data_unchecked(data_type, offsets, values, validity) } - /// Create a [`MutableUtf8Array`] out of low-end APIs. + /// Alias of `new_unchecked` /// # Safety /// The caller must ensure that every value between offsets is a valid utf8. 
+ pub unsafe fn from_data_unchecked( + data_type: DataType, + offsets: Vec, + values: Vec, + validity: Option, + ) -> Self { + let values = MutableUtf8ValuesArray::new_unchecked(data_type, offsets, values); + if let Some(ref validity) = validity { + assert_eq!(values.len(), validity.len()); + } + Self { values, validity } + } + + /// The canonical method to create a [`MutableUtf8Array`] out of low-end APIs. /// # Panics /// This function panics iff: /// * The `offsets` and `values` are inconsistent + /// * The `values` between `offsets` are not utf8 encoded /// * The validity is not `None` and its length is different from `offsets`'s length minus one. - pub unsafe fn from_data_unchecked( + pub fn from_data( data_type: DataType, offsets: Vec, values: Vec, validity: Option, ) -> Self { - check_offsets_minimal(&offsets, values.len()); - if let Some(ref validity) = validity { - assert_eq!(offsets.len() - 1, validity.len()); - } - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - panic!("MutableUtf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") - } - Self { - data_type, - offsets, - values, - validity, - } + Self::try_new(data_type, offsets, values, validity).unwrap() } fn default_data_type() -> DataType { @@ -169,29 +137,29 @@ impl MutableUtf8Array { /// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots and values. pub fn with_capacities(capacity: usize, values: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { - data_type: Self::default_data_type(), - offsets, - values: Vec::::with_capacity(values), + values: MutableUtf8ValuesArray::with_capacities(capacity, values), validity: None, } } /// Reserves `additional` elements and `additional_values` on the values buffer. 
pub fn reserve(&mut self, additional: usize, additional_values: usize) { - self.offsets.reserve(additional); + self.values.reserve(additional, additional_values); if let Some(x) = self.validity.as_mut() { x.reserve(additional) } - self.values.reserve(additional_values); } + /// Returns the capacity in number of items. + pub fn capacity(&self) -> usize { + self.values.capacity() + } + + /// Returns the length of this array #[inline] - fn last_offset(&self) -> O { - *self.offsets.last().unwrap() + pub fn len(&self) -> usize { + self.values.len() } /// Pushes a new element to the array. @@ -202,31 +170,45 @@ impl MutableUtf8Array { self.try_push(value).unwrap() } + /// Returns the value of the element at index `i`, ignoring the array's validity. + /// # Panic + /// This function panics iff `i >= self.len`. + #[inline] + pub fn value(&self, i: usize) -> &str { + self.values.value(i) + } + + /// Returns the value of the element at index `i`, ignoring the array's validity. + /// # Safety + /// This function is safe iff `i < self.len`. + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &str { + self.values.value_unchecked(i) + } + /// Pop the last entry from [`MutableUtf8Array`]. /// This function returns `None` iff this array is empty. pub fn pop(&mut self) -> Option { - if self.offsets.len() < 2 { - return None; - } - self.offsets.pop()?; - let value_start = self.offsets.iter().last().cloned()?.to_usize(); - let value = self.values.split_off(value_start); + let value = self.values.pop()?; self.validity .as_mut() .map(|x| x.pop()?.then(|| ())) .unwrap_or_else(|| Some(())) - .map(|_| - // soundness: we always check for utf8 soundness on constructors. 
- unsafe { String::from_utf8_unchecked(value) }) + .map(|_| value) } fn init_validity(&mut self) { - let mut validity = MutableBitmap::with_capacity(self.offsets.capacity()); + let mut validity = MutableBitmap::with_capacity(self.values.capacity()); validity.extend_constant(self.len(), true); validity.set(self.len() - 1, false); self.validity = Some(validity); } + /// Returns an iterator of `Option<&str>` + pub fn iter(&self) -> ZipValidity<&str, MutableUtf8ValuesIter> { + zip_validity(self.values_iter(), self.validity.as_ref().map(|x| x.iter())) + } + /// Converts itself into an [`Array`]. pub fn into_arc(self) -> Arc { let a: Utf8Array = self.into(); @@ -236,7 +218,6 @@ impl MutableUtf8Array { /// Shrinks the capacity of the [`MutableUtf8Array`] to fit its current length. pub fn shrink_to_fit(&mut self) { self.values.shrink_to_fit(); - self.offsets.shrink_to_fit(); if let Some(validity) = &mut self.validity { validity.shrink_to_fit() } @@ -244,25 +225,31 @@ impl MutableUtf8Array { /// Extract the low-end APIs from the [`MutableUtf8Array`]. pub fn into_data(self) -> (DataType, Vec, Vec, Option) { - (self.data_type, self.offsets, self.values, self.validity) + let (data_type, offsets, values) = self.values.into_inner(); + (data_type, offsets, values, self.validity) + } + + /// Returns an iterator of `&str` + pub fn values_iter(&self) -> MutableUtf8ValuesIter { + self.values.iter() } } impl MutableUtf8Array { /// returns its values. pub fn values(&self) -> &Vec { - &self.values + self.values.values() } /// returns its offsets. 
pub fn offsets(&self) -> &Vec { - &self.offsets + self.values.offsets() } } impl MutableArray for MutableUtf8Array { fn len(&self) -> usize { - self.offsets.len() - 1 + self.len() } fn validity(&self) -> Option<&MutableBitmap> { @@ -273,28 +260,32 @@ impl MutableArray for MutableUtf8Array { // Safety: // `MutableUtf8Array` has the same invariants as `Utf8Array` and thus // `Utf8Array` can be safely created from `MutableUtf8Array` without checks. - Box::new(unsafe { + let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); + unsafe { Utf8Array::from_data_unchecked( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - std::mem::take(&mut self.values).into(), + data_type, + offsets.into(), + values.into(), std::mem::take(&mut self.validity).map(|x| x.into()), ) - }) + } + .boxed() } fn as_arc(&mut self) -> Arc { // Safety: // `MutableUtf8Array` has the same invariants as `Utf8Array` and thus // `Utf8Array` can be safely created from `MutableUtf8Array` without checks. 
- Arc::new(unsafe { + let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); + unsafe { Utf8Array::from_data_unchecked( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - std::mem::take(&mut self.values).into(), + data_type, + offsets.into(), + values.into(), std::mem::take(&mut self.validity).map(|x| x.into()), ) - }) + } + .arced() } fn data_type(&self) -> &DataType { @@ -353,8 +344,9 @@ impl MutableUtf8Array { P: AsRef, I: Iterator, { - let iterator = iterator.map(StrAsBytes); - let additional = extend_from_values_iter(&mut self.offsets, &mut self.values, iterator); + let length = self.values.len(); + self.values.extend(iterator); + let additional = self.values.len() - length; if let Some(validity) = self.validity.as_mut() { validity.extend_constant(additional, true); @@ -372,11 +364,9 @@ impl MutableUtf8Array { P: AsRef, I: Iterator, { - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_trusted_len_values requires an upper limit"); - - let iterator = iterator.map(StrAsBytes); - extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator); + let length = self.values.len(); + self.values.extend_trusted_len_unchecked(iterator); + let additional = self.values.len() - length; if let Some(validity) = self.validity.as_mut() { validity.extend_constant(additional, true); @@ -408,13 +398,8 @@ impl MutableUtf8Array { self.validity = Some(validity); } - let iterator = iterator.map(|x| x.map(StrAsBytes)); - extend_from_trusted_len_iter( - &mut self.offsets, - &mut self.values, - self.validity.as_mut().unwrap(), - iterator, - ); + self.values + .extend_from_trusted_len_iter(self.validity.as_mut().unwrap(), iterator); } /// Creates a [`MutableUtf8Array`] from an iterator of trusted length. 
@@ -453,10 +438,7 @@ impl MutableUtf8Array { pub unsafe fn from_trusted_len_values_iter_unchecked, I: Iterator>( iterator: I, ) -> Self { - let iterator = iterator.map(StrAsBytes); - let (offsets, values) = unsafe { trusted_len_values_iter(iterator) }; - // soundness: T is AsRef - Self::from_data_unchecked(Self::default_data_type(), offsets, values, None) + MutableUtf8ValuesArray::from_trusted_len_iter_unchecked(iterator).into() } /// Creates a new [`MutableUtf8Array`] from a [`TrustedLen`] of `&str`. @@ -521,10 +503,7 @@ impl MutableUtf8Array { /// Creates a new [`MutableUtf8Array`] from a [`Iterator`] of `&str`. pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { - let iterator = iterator.map(StrAsBytes); - let (offsets, values) = values_iter(iterator); - // soundness: T: AsRef - unsafe { Self::from_data_unchecked(Self::default_data_type(), offsets, values, None) } + MutableUtf8ValuesArray::from_iter(iterator).into() } } @@ -547,12 +526,7 @@ impl> TryPush> for MutableUtf8Array { fn try_push(&mut self, value: Option) -> Result<()> { match value { Some(value) => { - let bytes = value.as_ref().as_bytes(); - self.values.extend_from_slice(bytes); - - let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; - - self.offsets.push(size); + self.values.try_push(value.as_ref())?; match &mut self.validity { Some(validity) => validity.push(true), @@ -560,7 +534,7 @@ impl> TryPush> for MutableUtf8Array { } } None => { - self.offsets.push(self.last_offset()); + self.values.push(""); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs new file mode 100644 index 00000000000..1f393a931d1 --- /dev/null +++ b/src/array/utf8/mutable_values.rs @@ -0,0 +1,409 @@ +use std::{iter::FromIterator, sync::Arc}; + +use crate::{ + array::{ + specification::{check_offsets_minimal, try_check_offsets_and_utf8}, + Array, ArrayValuesIter, MutableArray, 
Offset, TryExtend, TryPush, + }, + bitmap::MutableBitmap, + datatypes::DataType, + error::{Error, Result}, + trusted_len::TrustedLen, +}; + +use super::{MutableUtf8Array, StrAsBytes, Utf8Array}; +use crate::array::physical_binary::*; + +/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs +/// from [`MutableUtf8Array`] in that it builds non-null [`Utf8Array`]. +#[derive(Debug, Clone)] +pub struct MutableUtf8ValuesArray { + data_type: DataType, + offsets: Vec, + values: Vec, +} + +impl From> for Utf8Array { + fn from(other: MutableUtf8ValuesArray) -> Self { + // Safety: + // `MutableUtf8ValuesArray` has the same invariants as `Utf8Array` and thus + // `Utf8Array` can be safely created from `MutableUtf8ValuesArray` without checks. + unsafe { + Utf8Array::::from_data_unchecked( + other.data_type, + other.offsets.into(), + other.values.into(), + None, + ) + } + } +} + +impl From> for MutableUtf8Array { + fn from(other: MutableUtf8ValuesArray) -> Self { + // Safety: + // `MutableUtf8ValuesArray` has the same invariants as `MutableUtf8Array` + unsafe { + MutableUtf8Array::::from_data_unchecked( + other.data_type, + other.offsets, + other.values, + None, + ) + } + } +} + +impl Default for MutableUtf8ValuesArray { + fn default() -> Self { + Self::new() + } +} + +impl MutableUtf8ValuesArray { + /// Returns an empty [`MutableUtf8ValuesArray`]. + pub fn new() -> Self { + Self { + data_type: Self::default_data_type(), + offsets: vec![O::default()], + values: Vec::::new(), + } + } + + /// Returns a [`MutableUtf8ValuesArray`] created from its internal representation. + /// + /// # Errors + /// This function returns an error iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. 
+ /// * The `values` between two consecutive `offsets` are not valid utf8 + /// # Implementation + /// This function is `O(N)` - checking monotonicity and utf8 is `O(N)` + pub fn try_new(data_type: DataType, offsets: Vec, values: Vec) -> Result { + try_check_offsets_and_utf8(&offsets, &values)?; + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + return Err(Error::oos( + "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8", + )); + } + + Ok(Self { + data_type, + offsets, + values, + }) + } + + /// Returns a [`MutableUtf8ValuesArray`] created from its internal representation. + /// + /// # Panic + /// This function does not panic iff: + /// * The last offset is equal to the values' length. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is equal to either `Utf8` or `LargeUtf8`. + /// # Safety + /// This function is safe iff: + /// * the offsets are monotonically increasing + /// * The `values` between two consecutive `offsets` are valid utf8 + /// # Implementation + /// This function is `O(1)` + pub unsafe fn new_unchecked(data_type: DataType, offsets: Vec, values: Vec) -> Self { + check_offsets_minimal(&offsets, values.len()); + + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + panic!("MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8") + } + + Self { + data_type, + offsets, + values, + } + } + + /// Returns the default [`DataType`] of this container: [`DataType::Utf8`] or [`DataType::LargeUtf8`] + /// depending on the generic [`Offset`]. + pub fn default_data_type() -> DataType { + Utf8Array::::default_data_type() + } + + /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items. + pub fn with_capacity(capacity: usize) -> Self { + Self::with_capacities(capacity, 0) + } + + /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items and values. 
+ pub fn with_capacities(capacity: usize, values: usize) -> Self { + let mut offsets = Vec::::with_capacity(capacity + 1); + offsets.push(O::default()); + + Self { + data_type: Self::default_data_type(), + offsets, + values: Vec::::with_capacity(values), + } + } + + /// returns its values. + #[inline] + pub fn values(&self) -> &Vec { + &self.values + } + + /// returns its offsets. + #[inline] + pub fn offsets(&self) -> &Vec { + &self.offsets + } + + /// Reserves `additional` elements and `additional_values` on the values. + #[inline] + pub fn reserve(&mut self, additional: usize, additional_values: usize) { + self.offsets.reserve(additional + 1); + self.values.reserve(additional_values); + } + + /// Returns the capacity in number of items + pub fn capacity(&self) -> usize { + self.offsets.capacity() - 1 + } + + /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.offsets.len() - 1 + } + + /// Pushes a new item to the array. + /// # Panic + /// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value. + #[inline] + pub fn push>(&mut self, value: T) { + self.try_push(value).unwrap() + } + + /// Pop the last entry from [`MutableUtf8ValuesArray`]. + /// This function returns `None` iff this array is empty. + pub fn pop(&mut self) -> Option { + if self.len() == 0 { + return None; + } + self.offsets.pop()?; + let start = self.offsets.last()?.to_usize(); + let value = self.values.split_off(start); + // Safety: utf8 is validated on initialization + Some(unsafe { String::from_utf8_unchecked(value) }) + } + + /// Returns the value of the element at index `i`. + /// # Panic + /// This function panics iff `i >= self.len`. + #[inline] + pub fn value(&self, i: usize) -> &str { + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } + } + + /// Returns the value of the element at index `i`. + /// # Safety + /// This function is safe iff `i < self.len`. 
+ #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &str { + // soundness: the invariant of the function + let start = self.offsets.get_unchecked(i).to_usize(); + let end = self.offsets.get_unchecked(i + 1).to_usize(); + + // soundness: the invariant of the struct + let slice = self.values.get_unchecked(start..end); + + // soundness: the invariant of the struct + std::str::from_utf8_unchecked(slice) + } + + /// Returns an iterator of `&str` + pub fn iter(&self) -> ArrayValuesIter { + ArrayValuesIter::new(self) + } + + /// Shrinks the capacity of the [`MutableUtf8ValuesArray`] to fit its current length. + pub fn shrink_to_fit(&mut self) { + self.values.shrink_to_fit(); + self.offsets.shrink_to_fit(); + } + + /// Extract the low-end APIs from the [`MutableUtf8ValuesArray`]. + pub fn into_inner(self) -> (DataType, Vec, Vec) { + (self.data_type, self.offsets, self.values) + } +} + +impl MutableArray for MutableUtf8ValuesArray { + fn len(&self) -> usize { + self.len() + } + + fn validity(&self) -> Option<&MutableBitmap> { + None + } + + fn as_box(&mut self) -> Box { + // Safety: + // `MutableUtf8ValuesArray` has the same invariants as `Utf8Array` and thus + // `Utf8Array` can be safely created from `MutableUtf8ValuesArray` without checks. + let (data_type, offsets, values) = std::mem::take(self).into_inner(); + unsafe { Utf8Array::from_data_unchecked(data_type, offsets.into(), values.into(), None) } + .boxed() + } + + fn as_arc(&mut self) -> Arc { + // Safety: + // `MutableUtf8ValuesArray` has the same invariants as `Utf8Array` and thus + // `Utf8Array` can be safely created from `MutableUtf8ValuesArray` without checks. 
+ let (data_type, offsets, values) = std::mem::take(self).into_inner(); + unsafe { Utf8Array::from_data_unchecked(data_type, offsets.into(), values.into(), None) } + .arced() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn std::any::Any { + self + } + + #[inline] + fn push_null(&mut self) { + self.push::<&str>("") + } + + fn reserve(&mut self, additional: usize) { + self.reserve(additional, 0) + } + + fn shrink_to_fit(&mut self) { + self.shrink_to_fit() + } +} + +impl> FromIterator

for MutableUtf8ValuesArray { + fn from_iter>(iter: I) -> Self { + let (offsets, values) = values_iter(iter.into_iter().map(StrAsBytes)); + // soundness: T: AsRef and offsets are monotonically increasing + unsafe { Self::new_unchecked(Self::default_data_type(), offsets, values) } + } +} + +impl MutableUtf8ValuesArray { + pub(crate) unsafe fn extend_from_trusted_len_iter( + &mut self, + validity: &mut MutableBitmap, + iterator: I, + ) where + P: AsRef, + I: Iterator>, + { + let iterator = iterator.map(|x| x.map(StrAsBytes)); + extend_from_trusted_len_iter(&mut self.offsets, &mut self.values, validity, iterator); + } + + /// Extends the [`MutableUtf8ValuesArray`] from a [`TrustedLen`] + #[inline] + pub fn extend_trusted_len(&mut self, iterator: I) + where + P: AsRef, + I: TrustedLen, + { + unsafe { self.extend_trusted_len_unchecked(iterator) } + } + + /// Extends [`MutableUtf8ValuesArray`] from an iterator of trusted len. + /// # Safety + /// The iterator must be trusted len. + #[inline] + pub unsafe fn extend_trusted_len_unchecked(&mut self, iterator: I) + where + P: AsRef, + I: Iterator, + { + let iterator = iterator.map(StrAsBytes); + extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator); + } + + /// Creates a [`MutableUtf8ValuesArray`] from a [`TrustedLen`] + #[inline] + pub fn from_trusted_len_iter(iterator: I) -> Self + where + P: AsRef, + I: TrustedLen, + { + // soundness: I is `TrustedLen` + unsafe { Self::from_trusted_len_iter_unchecked(iterator) } + } + + /// Returns a new [`MutableUtf8ValuesArray`] from an iterator of trusted length. + /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. 
+ #[inline] + pub unsafe fn from_trusted_len_iter_unchecked(iterator: I) -> Self + where + P: AsRef, + I: Iterator, + { + let iterator = iterator.map(StrAsBytes); + let (offsets, values) = trusted_len_values_iter(iterator); + + // soundness: P is `str` and offsets are monotonically increasing + Self::new_unchecked(Self::default_data_type(), offsets, values) + } + + /// Returns a new [`MutableUtf8ValuesArray`] from an iterator. + /// # Error + /// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value. + /// (`i32::MAX` or `i64::MAX` respectively). + pub fn try_from_iter, I: IntoIterator>(iter: I) -> Result { + let iterator = iter.into_iter(); + let (lower, _) = iterator.size_hint(); + let mut array = Self::with_capacity(lower); + for item in iterator { + array.try_push(item)?; + } + Ok(array) + } +} + +impl> Extend for MutableUtf8ValuesArray { + fn extend>(&mut self, iter: I) { + extend_from_values_iter( + &mut self.offsets, + &mut self.values, + iter.into_iter().map(StrAsBytes), + ); + } +} + +impl> TryExtend for MutableUtf8ValuesArray { + fn try_extend>(&mut self, iter: I) -> Result<()> { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| self.try_push(x)) + } +} + +impl> TryPush for MutableUtf8ValuesArray { + #[inline] + fn try_push(&mut self, value: T) -> Result<()> { + let bytes = value.as_ref().as_bytes(); + self.values.extend_from_slice(bytes); + + let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; + + self.offsets.push(size); + Ok(()) + } +} diff --git a/tests/it/array/utf8/mod.rs b/tests/it/array/utf8/mod.rs index c6a0acec03f..daa2734faa8 100644 --- a/tests/it/array/utf8/mod.rs +++ b/tests/it/array/utf8/mod.rs @@ -1,6 +1,7 @@ use arrow2::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; mod mutable; +mod mutable_values; mod to_mutable; #[test] diff --git a/tests/it/array/utf8/mutable.rs b/tests/it/array/utf8/mutable.rs 
index 80cc24ca3c8..377d8b26de2 100644 --- a/tests/it/array/utf8/mutable.rs +++ b/tests/it/array/utf8/mutable.rs @@ -167,3 +167,24 @@ fn as_arc() { array.as_arc().as_ref() ); } + +#[test] +fn test_iter() { + let mut array = MutableUtf8Array::::new(); + + array.extend_trusted_len(vec![Some("hi"), Some("there")].into_iter()); + array.extend_trusted_len(vec![None, Some("hello")].into_iter()); + array.extend_trusted_len_values(["again"].iter()); + + let result = array.iter().collect::>(); + assert_eq!( + result, + vec![ + Some("hi"), + Some("there"), + None, + Some("hello"), + Some("again"), + ] + ); +} diff --git a/tests/it/array/utf8/mutable_values.rs b/tests/it/array/utf8/mutable_values.rs new file mode 100644 index 00000000000..fbb2abf0af6 --- /dev/null +++ b/tests/it/array/utf8/mutable_values.rs @@ -0,0 +1,101 @@ +use arrow2::array::MutableArray; +use arrow2::array::MutableUtf8ValuesArray; +use arrow2::datatypes::DataType; + +#[test] +fn capacity() { + let mut b = MutableUtf8ValuesArray::::with_capacity(100); + + assert_eq!(b.values().capacity(), 0); + assert!(b.offsets().capacity() >= 101); + b.shrink_to_fit(); + assert!(b.offsets().capacity() < 101); +} + +#[test] +fn offsets_must_be_monotonic_increasing() { + let offsets = vec![0, 5, 4]; + let values = b"abbbbb".to_vec(); + assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); +} + +#[test] +fn data_type_must_be_consistent() { + let offsets = vec![0, 4]; + let values = b"abbb".to_vec(); + assert!(MutableUtf8ValuesArray::::try_new(DataType::Int32, offsets, values).is_err()); +} + +#[test] +fn must_be_utf8() { + let offsets = vec![0, 2]; + let values = vec![207, 128]; + assert!(MutableUtf8ValuesArray::::try_new(DataType::Int32, offsets, values).is_err()); +} + +#[test] +fn as_box() { + let offsets = vec![0, 2]; + let values = b"ab".to_vec(); + let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); + let _ = b.as_box(); +} + +#[test] +fn as_arc() { + 
let offsets = vec![0, 2]; + let values = b"ab".to_vec(); + let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); + let _ = b.as_arc(); +} + +#[test] +fn extend_trusted_len() { + let offsets = vec![0, 2]; + let values = b"ab".to_vec(); + let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); + b.extend_trusted_len(vec!["a", "b"].into_iter()); + + let offsets = vec![0, 2, 3, 4]; + let values = b"abab".to_vec(); + assert_eq!( + b.as_box(), + MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values) + .unwrap() + .as_box() + ) +} + +#[test] +fn from_trusted_len() { + let mut b = MutableUtf8ValuesArray::::from_trusted_len_iter(vec!["a", "b"].into_iter()); + + let offsets = vec![0, 1, 2]; + let values = b"ab".to_vec(); + assert_eq!( + b.as_box(), + MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values) + .unwrap() + .as_box() + ) +} + +#[test] +fn extend_from_iter() { + let offsets = vec![0, 2]; + let values = b"ab".to_vec(); + let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); + b.extend_trusted_len(vec!["a", "b"].into_iter()); + + let a = b.clone(); + b.extend_trusted_len(a.iter()); + + let offsets = vec![0, 2, 3, 4, 6, 7, 8]; + let values = b"abababab".to_vec(); + assert_eq!( + b.as_box(), + MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values) + .unwrap() + .as_box() + ) +}