Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Improved MutablePrimitiveArray and MutableUtf8Array (#299)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Aug 19, 2021
1 parent 2e8ba17 commit 8ff8f7d
Show file tree
Hide file tree
Showing 12 changed files with 438 additions and 224 deletions.
20 changes: 2 additions & 18 deletions src/array/boolean/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,15 +169,7 @@ impl MutableBooleanArray {
P: std::borrow::Borrow<bool>,
I: TrustedLen<Item = Option<P>>,
{
let (validity, values) = unsafe { trusted_len_unzip(iterator) };

let validity = if validity.null_count() > 0 {
Some(validity)
} else {
None
};

Self::from_data(values, validity)
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
}

/// Creates a [`BooleanArray`] from a fallible iterator of trusted length.
Expand Down Expand Up @@ -210,15 +202,7 @@ impl MutableBooleanArray {
P: std::borrow::Borrow<bool>,
I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
{
let (validity, values) = unsafe { try_trusted_len_unzip(iterator)? };

let validity = if validity.null_count() > 0 {
Some(validity)
} else {
None
};

Ok(Self::from_data(values, validity))
unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) }
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,8 @@ pub fn new_empty_array(data_type: DataType) -> Box<dyn Array> {
}

/// Creates a new [`Array`] of [`DataType`] `data_type` and `length`.
/// The array is guaranteed to have [`Array::null_count`] equal to [`Array::len`].
/// The array is guaranteed to have [`Array::null_count`] equal to [`Array::len`]
/// for all types except Union, which does not have a validity.
pub fn new_null_array(data_type: DataType, length: usize) -> Box<dyn Array> {
match data_type {
DataType::Null => Box::new(NullArray::new_null(length)),
Expand Down
20 changes: 19 additions & 1 deletion src/array/primitive/iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ use crate::{
types::NativeType,
};

use super::PrimitiveArray;
use super::super::MutableArray;
use super::{MutablePrimitiveArray, PrimitiveArray};

impl<'a, T: NativeType> IntoIterator for &'a PrimitiveArray<T> {
type Item = Option<&'a T>;
Expand All @@ -25,3 +26,20 @@ impl<'a, T: NativeType> PrimitiveArray<T> {
)
}
}

impl<'a, T: NativeType> MutablePrimitiveArray<T> {
    /// Returns an iterator over the slots of this array.
    ///
    /// The iterator is built by `zip_validity` from the slice iterator over
    /// the values and, when a validity is present, the iterator over it.
    #[inline]
    pub fn iter(&'a self) -> ZipValidity<'a, &'a T, std::slice::Iter<'a, T>> {
        let values = self.values().iter();
        let validity = self.validity().as_ref().map(|bitmap| bitmap.iter());
        zip_validity(values, validity)
    }

    /// Returns an iterator over the values as `&T`.
    ///
    /// The validity is not consulted here; use `iter` for that.
    #[inline]
    pub fn values_iter(&'a self) -> std::slice::Iter<'a, T> {
        let values = self.values();
        values.iter()
    }
}
22 changes: 8 additions & 14 deletions src/array/primitive/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,13 +332,7 @@ impl<T: NativeType + NaturalDataType> MutablePrimitiveArray<T> {
P: std::borrow::Borrow<T>,
I: TrustedLen<Item = Option<P>>,
{
let (validity, values) = unsafe { trusted_len_unzip(iterator) };

Self {
data_type: T::DATA_TYPE,
values,
validity,
}
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
}

/// Creates a [`MutablePrimitiveArray`] from a fallible iterator of trusted length.
Expand Down Expand Up @@ -371,13 +365,7 @@ impl<T: NativeType + NaturalDataType> MutablePrimitiveArray<T> {
P: std::borrow::Borrow<T>,
I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
{
let (validity, values) = unsafe { try_trusted_len_unzip(iterator) }?;

Ok(Self {
data_type: T::DATA_TYPE,
values,
validity,
})
unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) }
}

/// Creates a new [`MutablePrimitiveArray`] out of an iterator over values
Expand Down Expand Up @@ -543,3 +531,9 @@ where

Ok((validity, buffer))
}

impl<T: NativeType> PartialEq for MutablePrimitiveArray<T> {
    /// Two arrays compare equal when the sequences yielded by their `iter()`
    /// are equal: same number of items, each pair of items equal.
    fn eq(&self, other: &Self) -> bool {
        let lhs = self.iter();
        let rhs = other.iter();
        lhs.eq(rhs)
    }
}
10 changes: 4 additions & 6 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::convert::TryFrom;

use num::Num;

use crate::{buffer::Buffer, types::Index};
use crate::types::Index;

/// Trait describing types that can be used as offsets as per Arrow specification.
/// This trait is only implemented for `i32` and `i64`, the two sizes part of the specification.
Expand Down Expand Up @@ -51,15 +51,13 @@ unsafe impl Offset for i64 {
}

#[inline]
pub fn check_offsets<O: Offset>(offsets: &Buffer<O>, values_len: usize) -> usize {
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> usize {
assert!(
!offsets.is_empty(),
"The length of the offset buffer must be larger than 1"
);
let len = offsets.len() - 1;

let offsets = offsets.as_slice();

let last_offset = offsets[len];
let last_offset = last_offset.to_usize();

Expand All @@ -71,9 +69,9 @@ pub fn check_offsets<O: Offset>(offsets: &Buffer<O>, values_len: usize) -> usize
}

#[inline]
pub fn check_offsets_and_utf8<O: Offset>(offsets: &Buffer<O>, values: &Buffer<u8>) -> usize {
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> usize {
let len = check_offsets(offsets, values.len());
offsets.as_slice().windows(2).for_each(|window| {
offsets.windows(2).for_each(|window| {
let start = window[0].to_usize();
let end = window[1].to_usize();
assert!(end <= values.len());
Expand Down
198 changes: 17 additions & 181 deletions src/array/utf8/from.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
use std::iter::FromIterator;

use crate::array::Offset;
use crate::trusted_len::TrustedLen;
use crate::{
array::Offset,
bitmap::{Bitmap, MutableBitmap},
buffer::{Buffer, MutableBuffer},
};

use super::{MutableUtf8Array, Utf8Array};

Expand All @@ -28,15 +24,12 @@ impl<O: Offset> Utf8Array<O> {
pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
iterator: I,
) -> Self {
let (offsets, values) = unsafe { trusted_len_values_iter(iterator) };
Self::from_data(offsets, values, None)
MutableUtf8Array::<O>::from_trusted_len_values_iter(iterator).into()
}

/// Creates a new [`Utf8Array`] from a [`Iterator`] of `&str`.
pub fn from_iter_values<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
let iterator = iter.into_iter();
let (offsets, values) = values_iter(iterator);
Self::from_data(offsets, values, None)
pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
MutableUtf8Array::<O>::from_iter_values(iterator).into()
}
}

Expand All @@ -51,10 +44,7 @@ impl<O: Offset> Utf8Array<O> {
P: AsRef<str>,
I: Iterator<Item = Option<P>>,
{
let (validity, offsets, values) = trusted_len_unzip(iterator);

// soundness: P is `str`
Self::from_data_unchecked(offsets, values, validity)
MutableUtf8Array::<O>::from_trusted_len_iter_unchecked(iterator).into()
}

/// Creates a [`Utf8Array`] from an iterator of trusted length.
Expand All @@ -68,183 +58,29 @@ impl<O: Offset> Utf8Array<O> {
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
}

/// Creates a [`PrimitiveArray`] from an falible iterator of trusted length.
/// Creates a [`Utf8Array`] from a fallible iterator of trusted length.
/// # Safety
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
/// I.e. that `size_hint().1` correctly reports its length.
#[inline]
pub unsafe fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
where
P: AsRef<str>,
I: IntoIterator<Item = Result<Option<P>, E>>,
{
let iterator = iter.into_iter();

let (validity, offsets, values) = try_trusted_len_unzip(iterator)?;

// soundness: P is `str`
Ok(Self::from_data_unchecked(offsets, values, validity))
}
}

/// Creates [`Bitmap`] and two [`Buffer`]s from an iterator of `Option`.
/// The first buffer corresponds to a offset buffer, the second one
/// corresponds to a values buffer.
/// # Safety
/// The caller must ensure that `iterator` is `TrustedLen`.
#[inline]
pub(crate) unsafe fn trusted_len_unzip<O, I, P>(
iterator: I,
) -> (Option<Bitmap>, Buffer<O>, Buffer<u8>)
where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = Option<P>>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut null = MutableBitmap::with_capacity(len);
let mut offsets = MutableBuffer::<O>::with_capacity(len + 1);
let mut values = MutableBuffer::<u8>::new();

let mut length = O::default();
let mut dst = offsets.as_mut_ptr();
std::ptr::write(dst, length);
dst = dst.add(1);
for item in iterator {
if let Some(item) = item {
null.push(true);
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());
} else {
null.push(false);
values.extend_from_slice(b"");
};

std::ptr::write(dst, length);
dst = dst.add(1);
}
assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
len + 1,
"Trusted iterator length was not accurately reported"
);
offsets.set_len(len + 1);

(null.into(), offsets.into(), values.into())
}

/// Creates two [`Buffer`]s from an iterator of `&str`.
/// The first buffer corresponds to a offset buffer, the second to a values buffer.
/// # Safety
/// The caller must ensure that `iterator` is [`TrustedLen`].
#[inline]
pub(crate) unsafe fn trusted_len_values_iter<O, I, P>(iterator: I) -> (Buffer<O>, Buffer<u8>)
where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = P>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut offsets = MutableBuffer::<O>::with_capacity(len + 1);
let mut values = MutableBuffer::<u8>::new();

let mut length = O::default();
let mut dst = offsets.as_mut_ptr();
std::ptr::write(dst, length);
dst = dst.add(1);
for item in iterator {
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());

std::ptr::write(dst, length);
dst = dst.add(1);
MutableUtf8Array::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
}
assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
len + 1,
"Trusted iterator length was not accurately reported"
);
offsets.set_len(len + 1);

(offsets.into(), values.into())
}

/// Creates two [`Buffer`]s from an iterator of `&str`.
/// The first buffer corresponds to a offset buffer, the second to a values buffer.
#[inline]
fn values_iter<O, I, P>(iterator: I) -> (Buffer<O>, Buffer<u8>)
where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = P>,
{
let (lower, _) = iterator.size_hint();

let mut offsets = MutableBuffer::<O>::with_capacity(lower + 1);
let mut values = MutableBuffer::<u8>::new();

let mut length = O::default();
offsets.push(length);

for item in iterator {
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());

offsets.push(length)
}
(offsets.into(), values.into())
}

/// # Safety
/// The caller must ensure that `iterator` is `TrustedLen`.
#[inline]
#[allow(clippy::type_complexity)]
pub(crate) unsafe fn try_trusted_len_unzip<E, I, P, O>(
iterator: I,
) -> Result<(Option<Bitmap>, Buffer<O>, Buffer<u8>), E>
where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = Result<Option<P>, E>>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut null = MutableBitmap::with_capacity(len);
let mut offsets = MutableBuffer::<O>::with_capacity(len + 1);
let mut values = MutableBuffer::<u8>::new();

let mut length = O::default();
let mut dst = offsets.as_mut_ptr();
std::ptr::write(dst, length);
dst = dst.add(1);
for item in iterator {
if let Some(item) = item? {
null.push(true);
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());
} else {
null.push(false);
};
std::ptr::write(dst, length);
dst = dst.add(1);
/// Creates a [`Utf8Array`] from a fallible iterator of trusted length.
#[inline]
pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
where
P: AsRef<str>,
I: TrustedLen<Item = Result<Option<P>, E>>,
{
// soundness: I: TrustedLen
unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
}
assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
len + 1,
"Trusted iterator length was not accurately reported"
);
offsets.set_len(len + 1);

Ok((null.into(), offsets.into(), values.into()))
}

impl<O: Offset, P: AsRef<str>> FromIterator<Option<P>> for Utf8Array<O> {
Expand Down
Loading

0 comments on commit 8ff8f7d

Please sign in to comment.