Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Simplified code #760

Merged
merged 1 commit into from
Jan 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 1 addition & 244 deletions src/array/binary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use crate::{
};

use super::BinaryArray;
use crate::array::physical_binary::*;

/// The Arrow's equivalent to `Vec<Option<Vec<u8>>>`.
/// Converting a [`MutableBinaryArray`] into a [`BinaryArray`] is `O(1)`.
Expand Down Expand Up @@ -432,247 +433,3 @@ impl<O: Offset, T: AsRef<[u8]>> TryPush<Option<T>> for MutableBinaryArray<O> {
Ok(())
}
}

/// Creates [`MutableBitmap`] and two [`Vec`]s from an iterator of `Option`.
/// The first buffer corresponds to a offset buffer, the second one
/// corresponds to a values buffer.
/// # Safety
/// The caller must ensure that `iterator` is `TrustedLen`.
#[inline]
unsafe fn trusted_len_unzip<O, I, P>(iterator: I) -> (Option<MutableBitmap>, Vec<O>, Vec<u8>)
where
O: Offset,
P: AsRef<[u8]>,
I: Iterator<Item = Option<P>>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut offsets = Vec::<O>::with_capacity(len + 1);
let mut values = Vec::<u8>::new();
let mut validity = MutableBitmap::new();

offsets.push(O::default());

extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator);

let validity = if validity.null_count() > 0 {
Some(validity)
} else {
None
};

(validity, offsets, values)
}

/// # Safety
/// The caller must ensure that `iterator` is `TrustedLen`.
#[inline]
#[allow(clippy::type_complexity)]
pub(crate) unsafe fn try_trusted_len_unzip<E, I, P, O>(
iterator: I,
) -> std::result::Result<(Option<MutableBitmap>, Vec<O>, Vec<u8>), E>
where
O: Offset,
P: AsRef<[u8]>,
I: Iterator<Item = std::result::Result<Option<P>, E>>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut null = MutableBitmap::with_capacity(len);
let mut offsets = Vec::<O>::with_capacity(len + 1);
let mut values = Vec::<u8>::new();

let mut length = O::default();
let mut dst = offsets.as_mut_ptr();
std::ptr::write(dst, length);
dst = dst.add(1);
for item in iterator {
if let Some(item) = item? {
null.push(true);
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s);
} else {
null.push(false);
};

std::ptr::write(dst, length);
dst = dst.add(1);
}
assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
len + 1,
"Trusted iterator length was not accurately reported"
);
offsets.set_len(len + 1);

Ok((null.into(), offsets, values))
}

/// Creates two [`Buffer`]s from an iterator of `&[u8]`.
/// The first buffer corresponds to a offset buffer, the second to a values buffer.
/// # Safety
/// The caller must ensure that `iterator` is [`TrustedLen`].
#[inline]
pub(crate) unsafe fn trusted_len_values_iter<O, I, P>(iterator: I) -> (Vec<O>, Vec<u8>)
where
O: Offset,
P: AsRef<[u8]>,
I: Iterator<Item = P>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut offsets = Vec::<O>::with_capacity(len + 1);
let mut values = Vec::<u8>::new();

offsets.push(O::default());

extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator);

(offsets, values)
}

// Populates `offsets` and `values` [`Vec`]s with information extracted
// from the incoming `iterator`.
// # Safety
// The caller must ensure the `iterator` is [`TrustedLen`]
#[inline]
unsafe fn extend_from_trusted_len_values_iter<I, P, O>(
offsets: &mut Vec<O>,
values: &mut Vec<u8>,
iterator: I,
) where
O: Offset,
P: AsRef<[u8]>,
I: Iterator<Item = P>,
{
let (_, upper) = iterator.size_hint();
let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit");

offsets.reserve(additional);

// Read in the last offset, will be used to increment and store
// new values later on
let mut length = *offsets.last().unwrap();

// Get a mutable pointer to the `offsets`, and move the pointer
// to the position, where a new value will be written
let mut dst = offsets.as_mut_ptr();
dst = dst.add(offsets.len());

for item in iterator {
let s = item.as_ref();

// Calculate the new offset value
length += O::from_usize(s.len()).unwrap();

// Push new entries for both `values` and `offsets` buffer
values.extend_from_slice(s);
std::ptr::write(dst, length);

// Move to the next position in offset buffer
dst = dst.add(1);
}

debug_assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
offsets.len() + additional,
"TrustedLen iterator's length was not accurately reported"
);

// We make sure to set the new length for the `offsets` buffer
offsets.set_len(offsets.len() + additional);
}

// Populates `offsets`, `values`, and `validity` [`Vec`]s with
// information extracted from the incoming `iterator`.
//
// # Safety
// The caller must ensure that `iterator` is [`TrustedLen`]
#[inline]
unsafe fn extend_from_trusted_len_iter<O, I, P>(
offsets: &mut Vec<O>,
values: &mut Vec<u8>,
validity: &mut MutableBitmap,
iterator: I,
) where
O: Offset,
P: AsRef<[u8]>,
I: Iterator<Item = Option<P>>,
{
let (_, upper) = iterator.size_hint();
let additional = upper.expect("extend_from_trusted_len_iter requires an upper limit");

offsets.reserve(additional);
validity.reserve(additional);

// Read in the last offset, will be used to increment and store
// new values later on
let mut length = *offsets.last().unwrap();

// Get a mutable pointer to the `offsets`, and move the pointer
// to the position, where a new value will be written
let mut dst = offsets.as_mut_ptr();
dst = dst.add(offsets.len());

for item in iterator {
if let Some(item) = item {
let bytes = item.as_ref();

// Calculate new offset value
length += O::from_usize(bytes.len()).unwrap();

// Push new values for `values` and `validity` buffer
values.extend_from_slice(bytes);
validity.push_unchecked(true);
} else {
// If `None`, update only `validity`
validity.push_unchecked(false);
}

// Push new offset or old offset depending on the `item`
std::ptr::write(dst, length);

// Move to the next position in offset buffer
dst = dst.add(1);
}

debug_assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
offsets.len() + additional,
"TrustedLen iterator's length was not accurately reported"
);

// We make sure to set the new length for the `offsets` buffer
offsets.set_len(offsets.len() + additional);
}

/// Creates two [`Vec`]s from an iterator of `&[u8]`.
/// The first buffer corresponds to a offset buffer, the second to a values buffer.
#[inline]
fn values_iter<O, I, P>(iterator: I) -> (Vec<O>, Vec<u8>)
where
O: Offset,
P: AsRef<[u8]>,
I: Iterator<Item = P>,
{
let (lower, _) = iterator.size_hint();

let mut offsets = Vec::<O>::with_capacity(lower + 1);
let mut values = Vec::<u8>::new();

let mut length = O::default();
offsets.push(length);

for item in iterator {
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s);

offsets.push(length)
}
(offsets, values)
}
2 changes: 2 additions & 0 deletions src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ use crate::{
datatypes::DataType,
};

pub(self) mod physical_binary;

/// A trait representing an immutable Arrow array. Arrow arrays are trait objects
/// that are infalibly downcasted to concrete types according to the [`Array::data_type`].
pub trait Array: Send + Sync {
Expand Down
Loading