Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added MutableUtf8ValuesArray
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Sep 25, 2022
1 parent e972df0 commit 7ed66f7
Show file tree
Hide file tree
Showing 3 changed files with 467 additions and 110 deletions.
2 changes: 2 additions & 0 deletions src/array/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ pub(super) mod fmt;
mod from;
mod iterator;
mod mutable;
mod mutable_values;
pub use iterator::*;
pub use mutable::*;
pub use mutable_values::MutableUtf8ValuesArray;

/// A [`Utf8Array`] is arrow's semantic equivalent of an immutable `Vec<Option<String>>`.
/// Cloning and slicing this struct is `O(1)`.
Expand Down
180 changes: 70 additions & 110 deletions src/array/utf8/mutable.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
use std::{iter::FromIterator, sync::Arc};

use crate::{
array::{
specification::{check_offsets_minimal, try_check_offsets_and_utf8},
Array, MutableArray, Offset, TryExtend, TryPush,
},
array::{Array, MutableArray, Offset, TryExtend, TryPush},
bitmap::MutableBitmap,
datatypes::DataType,
error::{Error, Result},
trusted_len::TrustedLen,
};

use super::Utf8Array;
use super::{mutable_values::MutableUtf8ValuesArray, Utf8Array};
use crate::array::physical_binary::*;
use crate::bitmap::Bitmap;

Expand All @@ -26,9 +23,7 @@ impl<T: AsRef<str>> AsRef<[u8]> for StrAsBytes<T> {
/// The mutable version of [`Utf8Array`]. See [`MutableArray`] for more details.
#[derive(Debug)]
pub struct MutableUtf8Array<O: Offset> {
data_type: DataType,
offsets: Vec<O>,
values: Vec<u8>,
values: MutableUtf8ValuesArray<O>,
validity: Option<MutableBitmap>,
}

Expand All @@ -45,15 +40,8 @@ impl<O: Offset> From<MutableUtf8Array<O>> for Utf8Array<O> {
Some(bitmap)
}
});

unsafe {
Utf8Array::<O>::from_data_unchecked(
other.data_type,
other.offsets.into(),
other.values.into(),
validity,
)
}
let array: Utf8Array<O> = other.values.into();
array.with_validity(validity)
}
}

Expand All @@ -67,9 +55,7 @@ impl<O: Offset> MutableUtf8Array<O> {
/// Initializes a new empty [`MutableUtf8Array`].
pub fn new() -> Self {
Self {
data_type: Self::default_data_type(),
offsets: vec![O::default()],
values: Vec::<u8>::new(),
values: Default::default(),
validity: None,
}
}
Expand All @@ -91,71 +77,65 @@ impl<O: Offset> MutableUtf8Array<O> {
values: Vec<u8>,
validity: Option<MutableBitmap>,
) -> Result<Self> {
try_check_offsets_and_utf8(&offsets, &values)?;
let values = MutableUtf8ValuesArray::try_new(data_type, offsets, values)?;

if validity
.as_ref()
.map_or(false, |validity| validity.len() != offsets.len() - 1)
.map_or(false, |validity| validity.len() != values.len())
{
return Err(Error::oos(
"validity's length must be equal to the number of values",
));
}

if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
return Err(Error::oos(
"MutableUtf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8",
));
}

Ok(Self {
data_type,
offsets,
values,
validity,
})
Ok(Self { values, validity })
}

/// The canonical method to create a [`MutableUtf8Array`] out of low-end APIs.
/// Create a [`MutableUtf8Array`] out of low-end APIs.
/// # Safety
/// The caller must ensure that every value between offsets is a valid utf8.
/// # Panics
/// This function panics iff:
/// * The `offsets` and `values` are inconsistent
/// * The `values` between `offsets` are not utf8 encoded
/// * The validity is not `None` and its length is different from `offsets`'s length minus one.
pub fn from_data(
pub unsafe fn new_unchecked(
data_type: DataType,
offsets: Vec<O>,
values: Vec<u8>,
validity: Option<MutableBitmap>,
) -> Self {
Self::try_new(data_type, offsets, values, validity).unwrap()
Self::from_data_unchecked(data_type, offsets, values, validity)
}

/// Create a [`MutableUtf8Array`] out of low-end APIs.
/// Alias of `new_unchecked`
/// # Safety
/// The caller must ensure that every value between offsets is a valid utf8.
pub unsafe fn from_data_unchecked(
data_type: DataType,
offsets: Vec<O>,
values: Vec<u8>,
validity: Option<MutableBitmap>,
) -> Self {
let values = MutableUtf8ValuesArray::new_unchecked(data_type, offsets, values);
if let Some(ref validity) = validity {
assert_eq!(values.len(), validity.len());
}
Self { values, validity }
}

/// The canonical method to create a [`MutableUtf8Array`] out of low-end APIs.
/// # Panics
/// This function panics iff:
/// * The `offsets` and `values` are inconsistent
/// * The `values` between `offsets` are not utf8 encoded
/// * The validity is not `None` and its length is different from `offsets`'s length minus one.
pub unsafe fn from_data_unchecked(
pub fn from_data(
data_type: DataType,
offsets: Vec<O>,
values: Vec<u8>,
validity: Option<MutableBitmap>,
) -> Self {
check_offsets_minimal(&offsets, values.len());
if let Some(ref validity) = validity {
assert_eq!(offsets.len() - 1, validity.len());
}
if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
panic!("MutableUtf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
}
Self {
data_type,
offsets,
values,
validity,
}
Self::try_new(data_type, offsets, values, validity).unwrap()
}

fn default_data_type() -> DataType {
Expand All @@ -169,29 +149,23 @@ impl<O: Offset> MutableUtf8Array<O> {

/// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots and values.
pub fn with_capacities(capacity: usize, values: usize) -> Self {
let mut offsets = Vec::<O>::with_capacity(capacity + 1);
offsets.push(O::default());

Self {
data_type: Self::default_data_type(),
offsets,
values: Vec::<u8>::with_capacity(values),
values: MutableUtf8ValuesArray::with_capacities(capacity, values),
validity: None,
}
}

/// Reserves `additional` elements and `additional_values` on the values buffer.
pub fn reserve(&mut self, additional: usize, additional_values: usize) {
self.offsets.reserve(additional);
self.values.reserve(additional, additional_values);
if let Some(x) = self.validity.as_mut() {
x.reserve(additional)
}
self.values.reserve(additional_values);
}

#[inline]
fn last_offset(&self) -> O {
*self.offsets.last().unwrap()
/// Returns the capacity reported by the underlying values array.
pub fn capacity(&self) -> usize {
self.values.capacity()
}

/// Pushes a new element to the array.
Expand All @@ -205,23 +179,16 @@ impl<O: Offset> MutableUtf8Array<O> {
/// Pop the last entry from [`MutableUtf8Array`].
/// This function returns `None` iff this array is empty.
pub fn pop(&mut self) -> Option<String> {
if self.offsets.len() < 2 {
return None;
}
self.offsets.pop()?;
let value_start = self.offsets.iter().last().cloned()?.to_usize();
let value = self.values.split_off(value_start);
let value = self.values.pop()?;
self.validity
.as_mut()
.map(|x| x.pop()?.then(|| ()))
.unwrap_or_else(|| Some(()))
.map(|_|
// soundness: we always check for utf8 soundness on constructors.
unsafe { String::from_utf8_unchecked(value) })
.map(|_| value)
}

fn init_validity(&mut self) {
let mut validity = MutableBitmap::with_capacity(self.offsets.capacity());
let mut validity = MutableBitmap::with_capacity(self.values.capacity());
validity.extend_constant(self.len(), true);
validity.set(self.len() - 1, false);
self.validity = Some(validity);
Expand All @@ -236,33 +203,33 @@ impl<O: Offset> MutableUtf8Array<O> {
/// Shrinks the capacity of the [`MutableUtf8Array`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
self.offsets.shrink_to_fit();
if let Some(validity) = &mut self.validity {
validity.shrink_to_fit()
}
}

/// Consumes the [`MutableUtf8Array`] and returns its parts: data type, offsets, values and optional validity.
pub fn into_data(self) -> (DataType, Vec<O>, Vec<u8>, Option<MutableBitmap>) {
(self.data_type, self.offsets, self.values, self.validity)
let (data_type, offsets, values) = self.values.into_inner();
(data_type, offsets, values, self.validity)
}
}

impl<O: Offset> MutableUtf8Array<O> {
/// Returns a reference to the values buffer.
pub fn values(&self) -> &Vec<u8> {
&self.values
self.values.values()
}

/// Returns a reference to the offsets buffer.
pub fn offsets(&self) -> &Vec<O> {
&self.offsets
self.values.offsets()
}
}

impl<O: Offset> MutableArray for MutableUtf8Array<O> {
fn len(&self) -> usize {
self.offsets.len() - 1
self.values.len()
}

fn validity(&self) -> Option<&MutableBitmap> {
Expand All @@ -273,28 +240,32 @@ impl<O: Offset> MutableArray for MutableUtf8Array<O> {
// Safety:
// `MutableUtf8Array` has the same invariants as `Utf8Array` and thus
// `Utf8Array` can be safely created from `MutableUtf8Array` without checks.
Box::new(unsafe {
let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner();
unsafe {
Utf8Array::from_data_unchecked(
self.data_type.clone(),
std::mem::take(&mut self.offsets).into(),
std::mem::take(&mut self.values).into(),
data_type,
offsets.into(),
values.into(),
std::mem::take(&mut self.validity).map(|x| x.into()),
)
})
}
.boxed()
}

fn as_arc(&mut self) -> Arc<dyn Array> {
// Safety:
// `MutableUtf8Array` has the same invariants as `Utf8Array` and thus
// `Utf8Array` can be safely created from `MutableUtf8Array` without checks.
Arc::new(unsafe {
let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner();
unsafe {
Utf8Array::from_data_unchecked(
self.data_type.clone(),
std::mem::take(&mut self.offsets).into(),
std::mem::take(&mut self.values).into(),
data_type,
offsets.into(),
values.into(),
std::mem::take(&mut self.validity).map(|x| x.into()),
)
})
}
.arced()
}

fn data_type(&self) -> &DataType {
Expand Down Expand Up @@ -353,8 +324,9 @@ impl<O: Offset> MutableUtf8Array<O> {
P: AsRef<str>,
I: Iterator<Item = P>,
{
let iterator = iterator.map(StrAsBytes);
let additional = extend_from_values_iter(&mut self.offsets, &mut self.values, iterator);
let length = self.values.len();
self.values.extend(iterator);
let additional = self.values.len() - length;

if let Some(validity) = self.validity.as_mut() {
validity.extend_constant(additional, true);
Expand All @@ -372,11 +344,9 @@ impl<O: Offset> MutableUtf8Array<O> {
P: AsRef<str>,
I: Iterator<Item = P>,
{
let (_, upper) = iterator.size_hint();
let additional = upper.expect("extend_trusted_len_values requires an upper limit");

let iterator = iterator.map(StrAsBytes);
extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator);
let length = self.values.len();
self.values.extend_trusted_len_unchecked(iterator);
let additional = self.values.len() - length;

if let Some(validity) = self.validity.as_mut() {
validity.extend_constant(additional, true);
Expand Down Expand Up @@ -408,13 +378,8 @@ impl<O: Offset> MutableUtf8Array<O> {
self.validity = Some(validity);
}

let iterator = iterator.map(|x| x.map(StrAsBytes));
extend_from_trusted_len_iter(
&mut self.offsets,
&mut self.values,
self.validity.as_mut().unwrap(),
iterator,
);
self.values
.extend_from_trusted_len_iter(self.validity.as_mut().unwrap(), iterator);
}

/// Creates a [`MutableUtf8Array`] from an iterator of trusted length.
Expand Down Expand Up @@ -547,20 +512,15 @@ impl<O: Offset, T: AsRef<str>> TryPush<Option<T>> for MutableUtf8Array<O> {
fn try_push(&mut self, value: Option<T>) -> Result<()> {
match value {
Some(value) => {
let bytes = value.as_ref().as_bytes();
self.values.extend_from_slice(bytes);

let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?;

self.offsets.push(size);
self.values.try_push(value.as_ref())?;

match &mut self.validity {
Some(validity) => validity.push(true),
None => {}
}
}
None => {
self.offsets.push(self.last_offset());
self.values.push("");
match &mut self.validity {
Some(validity) => validity.push(false),
None => self.init_validity(),
Expand Down
Loading

0 comments on commit 7ed66f7

Please sign in to comment.