Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Improved iteration over Utf8Array and BinaryArray.
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Sep 20, 2021
1 parent 0e9c65c commit c76e4c2
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 47 deletions.
30 changes: 17 additions & 13 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ mod mutable;
pub use mutable::*;

/// A [`BinaryArray`] is a nullable array of bytes - the Arrow equivalent of `Vec<Option<Vec<u8>>>`.
/// # Safety
/// The following invariants hold:
/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Debug, Clone)]
pub struct BinaryArray<O: Offset> {
data_type: DataType,
Expand Down Expand Up @@ -42,8 +46,10 @@ impl<O: Offset> BinaryArray<O> {

/// Creates a new [`BinaryArray`] from lower-level parts
/// # Panics
/// * The length of the offset buffer must be larger than 1
/// * The length of the values must be equal to the last offset value
/// * the offsets are not monotonically increasing
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`.
pub fn from_data(
data_type: DataType,
offsets: Buffer<O>,
Expand Down Expand Up @@ -114,25 +120,23 @@ impl<O: Offset> BinaryArray<O> {
/// # Panics
/// iff `i >= self.len()`
pub fn value(&self, i: usize) -> &[u8] {
let offsets = self.offsets.as_slice();
let offset = offsets[i];
let offset_1 = offsets[i + 1];
let length = (offset_1 - offset).to_usize();
let offset = offset.to_usize();
let start = self.offsets[i].to_usize();
let end = self.offsets[i + 1].to_usize();

&self.values[offset..offset + length]
// soundness: the invariant of the struct
unsafe { self.values.get_unchecked(start..end) }
}

/// Returns the element at index `i`
/// # Safety
/// Assumes that the `i < self.len`.
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
let offset = *self.offsets.get_unchecked(i);
let offset_1 = *self.offsets.get_unchecked(i + 1);
let length = (offset_1 - offset).to_usize();
let offset = offset.to_usize();
// soundness: the invariant of the function
let start = self.offsets.get_unchecked(i).to_usize();
let end = self.offsets.get_unchecked(i + 1).to_usize();

&self.values[offset..offset + length]
// soundness: the invariant of the struct
self.values.get_unchecked(start..end)
}

/// Returns the offsets that slice `.values()` to return valid values.
Expand Down
23 changes: 17 additions & 6 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ unsafe impl Offset for i64 {
}
}

#[inline]
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> usize {
pub fn check_offsets_minimal<O: Offset>(offsets: &[O], values_len: usize) -> usize {
assert!(
!offsets.is_empty(),
"The length of the offset buffer must be larger than 1"
Expand All @@ -72,15 +71,27 @@ pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> usize {
}

#[inline]
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> usize {
let len = check_offsets(offsets, values.len());
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) {
offsets.windows(2).for_each(|window| {
let start = window[0].to_usize();
let end = window[1].to_usize();
assert!(end <= values.len());
let slice = unsafe { std::slice::from_raw_parts(values.as_ptr().add(start), end - start) };
// checks bounds
let slice = &values[start..end];
// checks utf8 checks
#[cfg(not(feature = "simdutf8"))]
std::str::from_utf8(slice).expect("A non-utf8 string was passed.");
#[cfg(feature = "simdutf8")]
simdutf8::basic::from_utf8(slice).expect("A non-utf8 string was passed.");
});
len
}

/// Checks that the offsets
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) {
offsets.windows(2).for_each(|window| {
let start = window[0].to_usize();
let end = window[1].to_usize();
assert!(start <= end);
assert!(end <= values_len);
});
}
63 changes: 38 additions & 25 deletions src/array/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType};

use super::{
display_fmt,
specification::{check_offsets, check_offsets_and_utf8},
specification::{check_offsets_and_utf8, check_offsets_minimal},
Array, GenericBinaryArray, Offset,
};

Expand All @@ -25,6 +25,11 @@ pub use mutable::*;
/// assert_eq!(array.offsets().as_slice(), &[0, 2, 2, 2 + 5]);
/// # }
/// ```
/// # Safety
/// The following invariants hold:
/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`.
/// * A slice of `values` taken from two consecutives `offsets` is valid `utf8`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Debug, Clone)]
pub struct Utf8Array<O: Offset> {
data_type: DataType,
Expand Down Expand Up @@ -58,9 +63,9 @@ impl<O: Offset> Utf8Array<O> {
/// # Panics
/// This function panics iff:
/// * The `data_type`'s physical type is not consistent with the offset `O`.
/// * The `offsets` and `values` are consistent
/// * The `offsets` and `values` are inconsistent
/// * The `values` between `offsets` are utf8 encoded
/// * The validity is not `None` and its length is different from `offsets`'s length minus one.
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
pub fn from_data(
data_type: DataType,
offsets: Buffer<O>,
Expand Down Expand Up @@ -94,16 +99,25 @@ impl<O: Offset> Utf8Array<O> {
}
}

/// The same as [`Utf8Array::from_data`] but does not check for valid utf8.
/// The same as [`Utf8Array::from_data`] but does not check for offsets nor utf8 validity.
/// # Safety
/// `values` buffer must contain valid utf8 between every `offset`
/// * `offsets` MUST be monotonically increasing; and
/// * every slice of `values` constructed from `offsets` MUST be valid utf8
/// # Panics
/// This function panics iff:
/// * The `data_type`'s physical type is not consistent with the offset `O`.
/// * The last element of `offsets` is different from `values.len()`.
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
pub unsafe fn from_data_unchecked(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
check_offsets(&offsets, values.len());
check_offsets_minimal(&offsets, values.len());
if let Some(ref validity) = validity {
assert_eq!(offsets.len() - 1, validity.len());
}

if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
Expand All @@ -120,19 +134,31 @@ impl<O: Offset> Utf8Array<O> {

/// Returns the element at index `i` as &str
/// # Safety
/// This function is safe `iff` `i < self.len`.
/// This function is safe iff `i < self.len`.
pub unsafe fn value_unchecked(&self, i: usize) -> &str {
// soundness: the invariant of the function
let offset = *self.offsets.get_unchecked(i);
let offset_1 = *self.offsets.get_unchecked(i + 1);
let length = (offset_1 - offset).to_usize();
let offset = offset.to_usize();
let start = self.offsets.get_unchecked(i).to_usize();
let end = self.offsets.get_unchecked(i + 1).to_usize();

// soundness: the invariant of the struct
let slice = self.values.get_unchecked(start..end);

let slice = &self.values.as_slice()[offset..offset + length];
// soundness: the invariant of the struct
std::str::from_utf8_unchecked(slice)
}

/// Returns the element at index `i`
pub fn value(&self, i: usize) -> &str {
let start = self.offsets[i].to_usize();
let end = self.offsets[i + 1].to_usize();

// soundness: the invariant of the struct
let slice = unsafe { self.values.get_unchecked(start..end) };

// soundness: we always check for utf8 soundness on constructors.
unsafe { std::str::from_utf8_unchecked(slice) }
}

/// Returns a slice of this [`Utf8Array`].
/// # Implementation
/// This operation is `O(1)` as it amounts to essentially increase two ref counts.
Expand Down Expand Up @@ -163,19 +189,6 @@ impl<O: Offset> Utf8Array<O> {
arr
}

/// Returns the element at index `i` as &str
pub fn value(&self, i: usize) -> &str {
let offsets = self.offsets.as_slice();
let offset = offsets[i];
let offset_1 = offsets[i + 1];
let length = (offset_1 - offset).to_usize();
let offset = offset.to_usize();

let slice = &self.values.as_slice()[offset..offset + length];
// soundness: we always check for utf8 soundness on constructors.
unsafe { std::str::from_utf8_unchecked(slice) }
}

/// Returns the offsets of this [`Utf8Array`].
#[inline]
pub fn offsets(&self) -> &Buffer<O> {
Expand Down
6 changes: 3 additions & 3 deletions src/array/utf8/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::{iter::FromIterator, sync::Arc};

use crate::{
array::{
specification::{check_offsets, check_offsets_and_utf8},
specification::{check_offsets_and_utf8, check_offsets_minimal},
Array, MutableArray, Offset, TryExtend, TryPush,
},
bitmap::MutableBitmap,
Expand Down Expand Up @@ -93,7 +93,7 @@ impl<O: Offset> MutableUtf8Array<O> {
values: MutableBuffer<u8>,
validity: Option<MutableBitmap>,
) -> Self {
check_offsets(&offsets, values.len());
check_offsets_minimal(&offsets, values.len());
if let Some(ref validity) = validity {
assert_eq!(offsets.len() - 1, validity.len());
}
Expand Down Expand Up @@ -267,7 +267,7 @@ impl<O: Offset> MutableUtf8Array<O> {
}

/// Extends [`MutableUtf8Array`] from an iterator of trusted len.
/// #Safety
/// # Safety
/// The iterator must be trusted len.
#[inline]
pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
Expand Down

0 comments on commit c76e4c2

Please sign in to comment.