Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Improved MutablePrimitiveArray and MutableUtf8Array #299

Merged
merged 2 commits into from
Aug 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 2 additions & 18 deletions src/array/boolean/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,15 +169,7 @@ impl MutableBooleanArray {
P: std::borrow::Borrow<bool>,
I: TrustedLen<Item = Option<P>>,
{
let (validity, values) = unsafe { trusted_len_unzip(iterator) };

let validity = if validity.null_count() > 0 {
Some(validity)
} else {
None
};

Self::from_data(values, validity)
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
}

/// Creates a [`BooleanArray`] from a fallible iterator of trusted length.
Expand Down Expand Up @@ -210,15 +202,7 @@ impl MutableBooleanArray {
P: std::borrow::Borrow<bool>,
I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
{
let (validity, values) = unsafe { try_trusted_len_unzip(iterator)? };

let validity = if validity.null_count() > 0 {
Some(validity)
} else {
None
};

Ok(Self::from_data(values, validity))
unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) }
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,8 @@ pub fn new_empty_array(data_type: DataType) -> Box<dyn Array> {
}

/// Creates a new [`Array`] of [`DataType`] `data_type` and `length`.
/// The array is guaranteed to have [`Array::null_count`] equal to [`Array::len`].
/// The array is guaranteed to have [`Array::null_count`] equal to [`Array::len`]
/// for all types except Union, which does not have a validity.
pub fn new_null_array(data_type: DataType, length: usize) -> Box<dyn Array> {
match data_type {
DataType::Null => Box::new(NullArray::new_null(length)),
Expand Down
20 changes: 19 additions & 1 deletion src/array/primitive/iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ use crate::{
types::NativeType,
};

use super::PrimitiveArray;
use super::super::MutableArray;
use super::{MutablePrimitiveArray, PrimitiveArray};

impl<'a, T: NativeType> IntoIterator for &'a PrimitiveArray<T> {
type Item = Option<&'a T>;
Expand All @@ -25,3 +26,20 @@ impl<'a, T: NativeType> PrimitiveArray<T> {
)
}
}

impl<'a, T: NativeType> MutablePrimitiveArray<T> {
    /// Returns an iterator over the slots of this array as `Option<&T>`.
    ///
    /// The iterator is built by zipping the values with the validity, when
    /// this array has one.
    #[inline]
    pub fn iter(&'a self) -> ZipValidity<'a, &'a T, std::slice::Iter<'a, T>> {
        zip_validity(
            self.values().iter(),
            self.validity().as_ref().map(|x| x.iter()),
        )
    }

    /// Returns an iterator over the values as `&T`; the validity is not consulted.
    #[inline]
    pub fn values_iter(&'a self) -> std::slice::Iter<'a, T> {
        self.values().iter()
    }
}
22 changes: 8 additions & 14 deletions src/array/primitive/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,13 +332,7 @@ impl<T: NativeType + NaturalDataType> MutablePrimitiveArray<T> {
P: std::borrow::Borrow<T>,
I: TrustedLen<Item = Option<P>>,
{
let (validity, values) = unsafe { trusted_len_unzip(iterator) };

Self {
data_type: T::DATA_TYPE,
values,
validity,
}
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
}

/// Creates a [`MutablePrimitiveArray`] from a fallible iterator of trusted length.
Expand Down Expand Up @@ -371,13 +365,7 @@ impl<T: NativeType + NaturalDataType> MutablePrimitiveArray<T> {
P: std::borrow::Borrow<T>,
I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
{
let (validity, values) = unsafe { try_trusted_len_unzip(iterator) }?;

Ok(Self {
data_type: T::DATA_TYPE,
values,
validity,
})
unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) }
}

/// Creates a new [`MutablePrimitiveArray`] out of an iterator over values
Expand Down Expand Up @@ -543,3 +531,9 @@ where

Ok((validity, buffer))
}

/// Equality is defined item-wise: two arrays are equal when the sequences
/// produced by their `iter()` are equal (same length, same items in order).
/// Nothing other than those items is compared.
impl<T: NativeType> PartialEq for MutablePrimitiveArray<T> {
    fn eq(&self, other: &Self) -> bool {
        let lhs = self.iter();
        let rhs = other.iter();
        lhs.eq(rhs)
    }
}
10 changes: 4 additions & 6 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::convert::TryFrom;

use num::Num;

use crate::{buffer::Buffer, types::Index};
use crate::types::Index;

/// Trait describing types that can be used as offsets as per Arrow specification.
/// This trait is only implemented for `i32` and `i64`, the two sizes part of the specification.
Expand Down Expand Up @@ -51,15 +51,13 @@ unsafe impl Offset for i64 {
}

#[inline]
pub fn check_offsets<O: Offset>(offsets: &Buffer<O>, values_len: usize) -> usize {
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> usize {
assert!(
!offsets.is_empty(),
"The length of the offset buffer must be larger than 1"
);
let len = offsets.len() - 1;

let offsets = offsets.as_slice();

let last_offset = offsets[len];
let last_offset = last_offset.to_usize();

Expand All @@ -71,9 +69,9 @@ pub fn check_offsets<O: Offset>(offsets: &Buffer<O>, values_len: usize) -> usize
}

#[inline]
pub fn check_offsets_and_utf8<O: Offset>(offsets: &Buffer<O>, values: &Buffer<u8>) -> usize {
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> usize {
let len = check_offsets(offsets, values.len());
offsets.as_slice().windows(2).for_each(|window| {
offsets.windows(2).for_each(|window| {
let start = window[0].to_usize();
let end = window[1].to_usize();
assert!(end <= values.len());
Expand Down
198 changes: 17 additions & 181 deletions src/array/utf8/from.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
use std::iter::FromIterator;

use crate::array::Offset;
use crate::trusted_len::TrustedLen;
use crate::{
array::Offset,
bitmap::{Bitmap, MutableBitmap},
buffer::{Buffer, MutableBuffer},
};

use super::{MutableUtf8Array, Utf8Array};

Expand All @@ -28,15 +24,12 @@ impl<O: Offset> Utf8Array<O> {
pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
iterator: I,
) -> Self {
let (offsets, values) = unsafe { trusted_len_values_iter(iterator) };
Self::from_data(offsets, values, None)
MutableUtf8Array::<O>::from_trusted_len_values_iter(iterator).into()
}

/// Creates a new [`Utf8Array`] from a [`Iterator`] of `&str`.
pub fn from_iter_values<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
let iterator = iter.into_iter();
let (offsets, values) = values_iter(iterator);
Self::from_data(offsets, values, None)
pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
MutableUtf8Array::<O>::from_iter_values(iterator).into()
}
}

Expand All @@ -51,10 +44,7 @@ impl<O: Offset> Utf8Array<O> {
P: AsRef<str>,
I: Iterator<Item = Option<P>>,
{
let (validity, offsets, values) = trusted_len_unzip(iterator);

// soundness: P is `str`
Self::from_data_unchecked(offsets, values, validity)
MutableUtf8Array::<O>::from_trusted_len_iter_unchecked(iterator).into()
}

/// Creates a [`Utf8Array`] from an iterator of trusted length.
Expand All @@ -68,183 +58,29 @@ impl<O: Offset> Utf8Array<O> {
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
}

/// Creates a [`PrimitiveArray`] from an falible iterator of trusted length.
/// Creates a [`Utf8Array`] from a fallible iterator of trusted length.
/// # Safety
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
/// I.e. that `size_hint().1` correctly reports its length.
#[inline]
pub unsafe fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
where
P: AsRef<str>,
I: IntoIterator<Item = Result<Option<P>, E>>,
{
let iterator = iter.into_iter();

let (validity, offsets, values) = try_trusted_len_unzip(iterator)?;

// soundness: P is `str`
Ok(Self::from_data_unchecked(offsets, values, validity))
}
}

/// Creates [`Bitmap`] and two [`Buffer`]s from an iterator of `Option`.
/// The first buffer corresponds to a offset buffer, the second one
/// corresponds to a values buffer.
/// # Safety
/// The caller must ensure that `iterator` is `TrustedLen`.
#[inline]
pub(crate) unsafe fn trusted_len_unzip<O, I, P>(
iterator: I,
) -> (Option<Bitmap>, Buffer<O>, Buffer<u8>)
where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = Option<P>>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut null = MutableBitmap::with_capacity(len);
let mut offsets = MutableBuffer::<O>::with_capacity(len + 1);
let mut values = MutableBuffer::<u8>::new();

let mut length = O::default();
let mut dst = offsets.as_mut_ptr();
std::ptr::write(dst, length);
dst = dst.add(1);
for item in iterator {
if let Some(item) = item {
null.push(true);
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());
} else {
null.push(false);
values.extend_from_slice(b"");
};

std::ptr::write(dst, length);
dst = dst.add(1);
}
assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
len + 1,
"Trusted iterator length was not accurately reported"
);
offsets.set_len(len + 1);

(null.into(), offsets.into(), values.into())
}

/// Creates two [`Buffer`]s from an iterator of `&str`.
/// The first buffer corresponds to a offset buffer, the second to a values buffer.
/// # Safety
/// The caller must ensure that `iterator` is [`TrustedLen`].
#[inline]
pub(crate) unsafe fn trusted_len_values_iter<O, I, P>(iterator: I) -> (Buffer<O>, Buffer<u8>)
where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = P>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut offsets = MutableBuffer::<O>::with_capacity(len + 1);
let mut values = MutableBuffer::<u8>::new();

let mut length = O::default();
let mut dst = offsets.as_mut_ptr();
std::ptr::write(dst, length);
dst = dst.add(1);
for item in iterator {
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());

std::ptr::write(dst, length);
dst = dst.add(1);
MutableUtf8Array::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
}
assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
len + 1,
"Trusted iterator length was not accurately reported"
);
offsets.set_len(len + 1);

(offsets.into(), values.into())
}

/// Creates two [`Buffer`]s from an iterator of `&str`.
/// The first buffer corresponds to a offset buffer, the second to a values buffer.
#[inline]
fn values_iter<O, I, P>(iterator: I) -> (Buffer<O>, Buffer<u8>)
where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = P>,
{
let (lower, _) = iterator.size_hint();

let mut offsets = MutableBuffer::<O>::with_capacity(lower + 1);
let mut values = MutableBuffer::<u8>::new();

let mut length = O::default();
offsets.push(length);

for item in iterator {
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());

offsets.push(length)
}
(offsets.into(), values.into())
}

/// # Safety
/// The caller must ensure that `iterator` is `TrustedLen`.
#[inline]
#[allow(clippy::type_complexity)]
pub(crate) unsafe fn try_trusted_len_unzip<E, I, P, O>(
iterator: I,
) -> Result<(Option<Bitmap>, Buffer<O>, Buffer<u8>), E>
where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = Result<Option<P>, E>>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut null = MutableBitmap::with_capacity(len);
let mut offsets = MutableBuffer::<O>::with_capacity(len + 1);
let mut values = MutableBuffer::<u8>::new();

let mut length = O::default();
let mut dst = offsets.as_mut_ptr();
std::ptr::write(dst, length);
dst = dst.add(1);
for item in iterator {
if let Some(item) = item? {
null.push(true);
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());
} else {
null.push(false);
};
std::ptr::write(dst, length);
dst = dst.add(1);
/// Creates a [`Utf8Array`] from a fallible iterator of trusted length.
#[inline]
pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
where
P: AsRef<str>,
I: TrustedLen<Item = Result<Option<P>, E>>,
{
// soundness: I: TrustedLen
unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
}
assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
len + 1,
"Trusted iterator length was not accurately reported"
);
offsets.set_len(len + 1);

Ok((null.into(), offsets.into(), values.into()))
}

impl<O: Offset, P: AsRef<str>> FromIterator<Option<P>> for Utf8Array<O> {
Expand Down
Loading