Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Improved performance of iterator of Utf8Array and BinaryArray (3-4x) #427

Merged
merged 2 commits into from
Sep 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,7 @@ harness = false
[[bench]]
name = "hash_kernel"
harness = false

[[bench]]
name = "iter_utf8"
harness = false
34 changes: 34 additions & 0 deletions benches/iter_utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use arrow2::array::Utf8Array;

use criterion::{criterion_group, criterion_main, Criterion};

fn add_benchmark(c: &mut Criterion) {
(10..=20).step_by(2).for_each(|log2_size| {
let size = 2usize.pow(log2_size);

let array = Utf8Array::<i32>::from_trusted_len_values_iter(
std::iter::repeat("aaa")
.take(size / 2)
.chain(std::iter::repeat("bbb").take(size / 2)),
);

c.bench_function(&format!("iter_values 2^{}", log2_size), |b| {
b.iter(|| {
for x in array.values_iter() {
assert_eq!(x.len(), 3);
}
})
});

c.bench_function(&format!("iter 2^{}", log2_size), |b| {
b.iter(|| {
for x in array.iter() {
assert_eq!(x.unwrap().len(), 3);
}
})
});
})
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);
30 changes: 17 additions & 13 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ mod mutable;
pub use mutable::*;

/// A [`BinaryArray`] is a nullable array of bytes - the Arrow equivalent of `Vec<Option<Vec<u8>>>`.
/// # Safety
/// The following invariants hold:
/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Debug, Clone)]
pub struct BinaryArray<O: Offset> {
data_type: DataType,
Expand Down Expand Up @@ -42,8 +46,10 @@ impl<O: Offset> BinaryArray<O> {

/// Creates a new [`BinaryArray`] from lower-level parts
/// # Panics
/// * The length of the offset buffer must be larger than 1
/// * The length of the values must be equal to the last offset value
/// * the offsets are not monotonically increasing
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`.
pub fn from_data(
data_type: DataType,
offsets: Buffer<O>,
Expand Down Expand Up @@ -114,25 +120,23 @@ impl<O: Offset> BinaryArray<O> {
/// # Panics
/// iff `i >= self.len()`
pub fn value(&self, i: usize) -> &[u8] {
let offsets = self.offsets.as_slice();
let offset = offsets[i];
let offset_1 = offsets[i + 1];
let length = (offset_1 - offset).to_usize();
let offset = offset.to_usize();
let start = self.offsets[i].to_usize();
let end = self.offsets[i + 1].to_usize();

&self.values[offset..offset + length]
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line @ritchie46

// soundness: the invariant of the struct
unsafe { self.values.get_unchecked(start..end) }
}

/// Returns the element at index `i`
/// # Safety
/// Assumes that the `i < self.len`.
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
let offset = *self.offsets.get_unchecked(i);
let offset_1 = *self.offsets.get_unchecked(i + 1);
let length = (offset_1 - offset).to_usize();
let offset = offset.to_usize();
// soundness: the invariant of the function
let start = self.offsets.get_unchecked(i).to_usize();
let end = self.offsets.get_unchecked(i + 1).to_usize();

&self.values[offset..offset + length]
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line

// soundness: the invariant of the struct
self.values.get_unchecked(start..end)
}

/// Returns the offsets that slice `.values()` to return valid values.
Expand Down
33 changes: 25 additions & 8 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ unsafe impl Offset for i64 {
}
}

#[inline]
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> usize {
pub fn check_offsets_minimal<O: Offset>(offsets: &[O], values_len: usize) -> usize {
assert!(
!offsets.is_empty(),
"The length of the offset buffer must be larger than 1"
Expand All @@ -71,15 +70,33 @@ pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> usize {
len
}

#[inline]
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> usize {
let len = check_offsets(offsets, values.len());
/// # Panics iff:
/// * the `offsets` is not monotonically increasing, or
/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or
/// * any offset is larger or equal to `values_len`.
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) {
offsets.windows(2).for_each(|window| {
let start = window[0].to_usize();
let end = window[1].to_usize();
assert!(end <= values.len());
let slice = unsafe { std::slice::from_raw_parts(values.as_ptr().add(start), end - start) };
// assert monotonicity
assert!(start <= end);
// assert bounds
let slice = &values[start..end];
// assert utf8
simdutf8::basic::from_utf8(slice).expect("A non-utf8 string was passed.");
});
len
}

/// # Panics iff:
/// * the `offsets` is not monotonically increasing, or
/// * any offset is larger or equal to `values_len`.
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) {
offsets.windows(2).for_each(|window| {
let start = window[0].to_usize();
let end = window[1].to_usize();
// assert monotonicity
assert!(start <= end);
// assert bound
assert!(end <= values_len);
});
}
63 changes: 38 additions & 25 deletions src/array/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType};

use super::{
display_fmt,
specification::{check_offsets, check_offsets_and_utf8},
specification::{check_offsets_and_utf8, check_offsets_minimal},
Array, GenericBinaryArray, Offset,
};

Expand All @@ -25,6 +25,11 @@ pub use mutable::*;
/// assert_eq!(array.offsets().as_slice(), &[0, 2, 2, 2 + 5]);
/// # }
/// ```
/// # Safety
/// The following invariants hold:
/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`.
/// * A slice of `values` taken from two consecutives `offsets` is valid `utf8`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Debug, Clone)]
pub struct Utf8Array<O: Offset> {
data_type: DataType,
Expand Down Expand Up @@ -58,9 +63,9 @@ impl<O: Offset> Utf8Array<O> {
/// # Panics
/// This function panics iff:
/// * The `data_type`'s physical type is not consistent with the offset `O`.
/// * The `offsets` and `values` are consistent
/// * The `offsets` and `values` are inconsistent
/// * The `values` between `offsets` are utf8 encoded
/// * The validity is not `None` and its length is different from `offsets`'s length minus one.
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
pub fn from_data(
data_type: DataType,
offsets: Buffer<O>,
Expand Down Expand Up @@ -94,16 +99,25 @@ impl<O: Offset> Utf8Array<O> {
}
}

/// The same as [`Utf8Array::from_data`] but does not check for valid utf8.
/// The same as [`Utf8Array::from_data`] but does not check for offsets nor utf8 validity.
/// # Safety
/// `values` buffer must contain valid utf8 between every `offset`
/// * `offsets` MUST be monotonically increasing; and
/// * every slice of `values` constructed from `offsets` MUST be valid utf8
/// # Panics
/// This function panics iff:
/// * The `data_type`'s physical type is not consistent with the offset `O`.
/// * The last element of `offsets` is different from `values.len()`.
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
pub unsafe fn from_data_unchecked(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
check_offsets(&offsets, values.len());
check_offsets_minimal(&offsets, values.len());
if let Some(ref validity) = validity {
assert_eq!(offsets.len() - 1, validity.len());
}

if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
Expand All @@ -120,19 +134,31 @@ impl<O: Offset> Utf8Array<O> {

/// Returns the element at index `i` as &str
/// # Safety
/// This function is safe `iff` `i < self.len`.
/// This function is safe iff `i < self.len`.
pub unsafe fn value_unchecked(&self, i: usize) -> &str {
// soundness: the invariant of the function
let offset = *self.offsets.get_unchecked(i);
let offset_1 = *self.offsets.get_unchecked(i + 1);
let length = (offset_1 - offset).to_usize();
let offset = offset.to_usize();
let start = self.offsets.get_unchecked(i).to_usize();
let end = self.offsets.get_unchecked(i + 1).to_usize();

// soundness: the invariant of the struct
let slice = self.values.get_unchecked(start..end);

let slice = &self.values.as_slice()[offset..offset + length];
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line @ritchie46

// soundness: the invariant of the struct
std::str::from_utf8_unchecked(slice)
}

/// Returns the element at index `i`
pub fn value(&self, i: usize) -> &str {
let start = self.offsets[i].to_usize();
let end = self.offsets[i + 1].to_usize();

// soundness: the invariant of the struct
let slice = unsafe { self.values.get_unchecked(start..end) };

// soundness: we always check for utf8 soundness on constructors.
unsafe { std::str::from_utf8_unchecked(slice) }
}

/// Returns a slice of this [`Utf8Array`].
/// # Implementation
/// This operation is `O(1)` as it amounts to essentially increase two ref counts.
Expand Down Expand Up @@ -163,19 +189,6 @@ impl<O: Offset> Utf8Array<O> {
arr
}

/// Returns the element at index `i` as &str
pub fn value(&self, i: usize) -> &str {
let offsets = self.offsets.as_slice();
let offset = offsets[i];
let offset_1 = offsets[i + 1];
let length = (offset_1 - offset).to_usize();
let offset = offset.to_usize();

let slice = &self.values.as_slice()[offset..offset + length];
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this

// soundness: we always check for utf8 soundness on constructors.
unsafe { std::str::from_utf8_unchecked(slice) }
}

/// Returns the offsets of this [`Utf8Array`].
#[inline]
pub fn offsets(&self) -> &Buffer<O> {
Expand Down
6 changes: 3 additions & 3 deletions src/array/utf8/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::{iter::FromIterator, sync::Arc};

use crate::{
array::{
specification::{check_offsets, check_offsets_and_utf8},
specification::{check_offsets_and_utf8, check_offsets_minimal},
Array, MutableArray, Offset, TryExtend, TryPush,
},
bitmap::MutableBitmap,
Expand Down Expand Up @@ -93,7 +93,7 @@ impl<O: Offset> MutableUtf8Array<O> {
values: MutableBuffer<u8>,
validity: Option<MutableBitmap>,
) -> Self {
check_offsets(&offsets, values.len());
check_offsets_minimal(&offsets, values.len());
if let Some(ref validity) = validity {
assert_eq!(offsets.len() - 1, validity.len());
}
Expand Down Expand Up @@ -267,7 +267,7 @@ impl<O: Offset> MutableUtf8Array<O> {
}

/// Extends [`MutableUtf8Array`] from an iterator of trusted len.
/// #Safety
/// # Safety
/// The iterator must be trusted len.
#[inline]
pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
Expand Down