Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Improved performance of list iterator (- 10-20%) #441

Merged
merged 2 commits into from
Sep 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,7 @@ harness = false
[[bench]]
name = "iter_utf8"
harness = false

[[bench]]
name = "iter_list"
harness = false
54 changes: 54 additions & 0 deletions benches/iter_list.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
use arrow2::{
array::{ListArray, PrimitiveArray},
buffer::Buffer,
datatypes::DataType,
};

use arrow2::bitmap::Bitmap;
use arrow2::buffer::MutableBuffer;
use criterion::{criterion_group, criterion_main, Criterion};
use std::iter::FromIterator;
use std::sync::Arc;

fn add_benchmark(c: &mut Criterion) {
(10..=20).step_by(2).for_each(|log2_size| {
let size = 2usize.pow(log2_size);

let values = Buffer::from_iter(0..size as i32);
let values = PrimitiveArray::<i32>::from_data(DataType::Int32, values, None);

let mut offsets = MutableBuffer::from_iter((0..size as i32).step_by(2));
offsets.push(size as i32);

let validity = (0..(offsets.len() - 1))
.map(|i| i % 4 == 0)
.collect::<Bitmap>();

let data_type = ListArray::<i32>::default_datatype(DataType::Int32);
let array = ListArray::<i32>::from_data(
data_type,
offsets.into(),
Arc::new(values),
Some(validity),
);

c.bench_function(&format!("list: iter_values 2^{}", log2_size), |b| {
b.iter(|| {
for x in array.values_iter() {
assert_eq!(x.len(), 2);
}
})
});

c.bench_function(&format!("list: iter 2^{}", log2_size), |b| {
b.iter(|| {
for x in array.iter() {
assert_eq!(x.unwrap().len(), 2);
}
})
});
})
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);
23 changes: 21 additions & 2 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,24 @@ impl<O: Offset> BinaryArray<O> {
/// # Panics
/// iff `offset + length > self.len()`.
pub fn slice(&self, offset: usize, length: usize) -> Self {
let validity = self.validity.clone().map(|x| x.slice(offset, length));
let offsets = self.offsets.clone().slice(offset, length + 1);
assert!(
offset + length <= self.len(),
"the offset of the new Buffer cannot exceed the existing length"
);
unsafe { self.slice_unchecked(offset, length) }
}

/// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`].
/// # Implementation
/// This function is `O(1)`: all data will be shared between both arrays.
/// # Safety
/// The caller must ensure that `offset + length <= self.len()`.
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
let validity = self
.validity
.clone()
.map(|x| x.slice_unchecked(offset, length));
let offsets = self.offsets.clone().slice_unchecked(offset, length + 1);
Self {
data_type: self.data_type.clone(),
offsets,
Expand Down Expand Up @@ -175,6 +191,9 @@ impl<O: Offset> Array for BinaryArray<O> {
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice(offset, length))
}
unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice_unchecked(offset, length))
}
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Box::new(self.with_validity(validity))
}
Expand Down
25 changes: 23 additions & 2 deletions src/array/boolean/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,27 @@ impl BooleanArray {
/// This function panics iff `offset + length >= self.len()`.
#[inline]
pub fn slice(&self, offset: usize, length: usize) -> Self {
let validity = self.validity.clone().map(|x| x.slice(offset, length));
assert!(
offset + length <= self.len(),
"the offset of the new Buffer cannot exceed the existing length"
);
unsafe { self.slice_unchecked(offset, length) }
}

/// Returns a slice of this [`BooleanArray`].
/// # Implementation
/// This operation is `O(1)` as it amounts to increase two ref counts.
/// # Safety
/// The caller must ensure that `offset + length <= self.len()`.
#[inline]
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
let validity = self
.validity
.clone()
.map(|x| x.slice_unchecked(offset, length));
Self {
data_type: self.data_type.clone(),
values: self.values.clone().slice(offset, length),
values: self.values.clone().slice_unchecked(offset, length),
validity,
offset: self.offset + offset,
}
Expand Down Expand Up @@ -130,6 +147,10 @@ impl Array for BooleanArray {
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice(offset, length))
}
#[inline]
unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice_unchecked(offset, length))
}
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Box::new(self.with_validity(validity))
}
Expand Down
17 changes: 17 additions & 0 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ impl<K: DictionaryKey> DictionaryArray<K> {
}

/// Creates a new [`DictionaryArray`] by slicing the existing [`DictionaryArray`].
/// # Panics
/// iff `offset + length > self.len()`.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
data_type: self.data_type.clone(),
Expand All @@ -83,6 +85,18 @@ impl<K: DictionaryKey> DictionaryArray<K> {
}
}

/// Creates a new [`DictionaryArray`] by slicing the existing [`DictionaryArray`].
ritchie46 marked this conversation as resolved.
Show resolved Hide resolved
/// # Safety
/// Safe iff `offset + length <= self.len()`.
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
Self {
data_type: self.data_type.clone(),
keys: self.keys.clone().slice_unchecked(offset, length),
values: self.values.clone(),
offset: self.offset + offset,
}
}

/// Sets the validity bitmap on this [`Array`].
/// # Panic
/// This function panics iff `validity.len() != self.len()`.
Expand Down Expand Up @@ -149,6 +163,9 @@ impl<K: DictionaryKey> Array for DictionaryArray<K> {
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice(offset, length))
}
unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice_unchecked(offset, length))
}
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Box::new(self.with_validity(validity))
}
Expand Down
25 changes: 23 additions & 2 deletions src/array/fixed_size_binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,30 @@ impl FixedSizeBinaryArray {
/// Returns a slice of this [`FixedSizeBinaryArray`].
/// # Implementation
/// This operation is `O(1)` as it amounts to increase 3 ref counts.
/// # Panics
/// panics iff `offset + length > self.len()`
pub fn slice(&self, offset: usize, length: usize) -> Self {
let validity = self.validity.clone().map(|x| x.slice(offset, length));
assert!(
offset + length <= self.len(),
"the offset of the new Buffer cannot exceed the existing length"
);
unsafe { self.slice_unchecked(offset, length) }
}

/// Returns a slice of this [`FixedSizeBinaryArray`].
/// # Implementation
/// This operation is `O(1)` as it amounts to increase 3 ref counts.
/// # Safety
/// The caller must ensure that `offset + length <= self.len()`.
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
let validity = self
.validity
.clone()
.map(|x| x.slice_unchecked(offset, length));
let values = self
.values
.clone()
.slice(offset * self.size as usize, length * self.size as usize);
.slice_unchecked(offset * self.size as usize, length * self.size as usize);
Self {
data_type: self.data_type.clone(),
size: self.size,
Expand Down Expand Up @@ -139,6 +157,9 @@ impl Array for FixedSizeBinaryArray {
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice(offset, length))
}
unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice_unchecked(offset, length))
}
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Box::new(self.with_validity(validity))
}
Expand Down
4 changes: 2 additions & 2 deletions src/array/fixed_size_list/iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ use crate::{
use super::FixedSizeListArray;

impl IterableListArray for FixedSizeListArray {
fn value(&self, i: usize) -> Box<dyn Array> {
FixedSizeListArray::value(self, i)
unsafe fn value_unchecked(&self, i: usize) -> Box<dyn Array> {
FixedSizeListArray::value_unchecked(self, i)
}
}

Expand Down
36 changes: 34 additions & 2 deletions src/array/fixed_size_list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,30 @@ impl FixedSizeListArray {
/// Returns a slice of this [`FixedSizeListArray`].
/// # Implementation
/// This operation is `O(1)`.
/// # Panics
/// panics iff `offset + length > self.len()`
pub fn slice(&self, offset: usize, length: usize) -> Self {
let validity = self.validity.clone().map(|x| x.slice(offset, length));
assert!(
offset + length <= self.len(),
"the offset of the new Buffer cannot exceed the existing length"
);
unsafe { self.slice_unchecked(offset, length) }
}

/// Returns a slice of this [`FixedSizeListArray`].
/// # Implementation
/// This operation is `O(1)`.
/// # Safety
/// The caller must ensure that `offset + length <= self.len()`.
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
let validity = self
.validity
.clone()
.map(|x| x.slice_unchecked(offset, length));
let values = self
.values
.clone()
.slice(offset * self.size as usize, length * self.size as usize)
.slice_unchecked(offset * self.size as usize, length * self.size as usize)
.into();
Self {
data_type: self.data_type.clone(),
Expand All @@ -85,12 +103,23 @@ impl FixedSizeListArray {
}

/// Returns the `Vec<T>` at position `i`.
/// # Panic:
/// panics iff `i >= self.len()`
#[inline]
pub fn value(&self, i: usize) -> Box<dyn Array> {
self.values
.slice(i * self.size as usize, self.size as usize)
}

/// Returns the `Vec<T>` at position `i`.
/// # Safety:
/// Caller must ensure that `i < self.len()`
#[inline]
pub unsafe fn value_unchecked(&self, i: usize) -> Box<dyn Array> {
self.values
.slice_unchecked(i * self.size as usize, self.size as usize)
}

/// Sets the validity bitmap on this [`FixedSizeListArray`].
/// # Panic
/// This function panics iff `validity.len() != self.len()`.
Expand Down Expand Up @@ -143,6 +172,9 @@ impl Array for FixedSizeListArray {
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice(offset, length))
}
unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice_unchecked(offset, length))
}
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Box::new(self.with_validity(validity))
}
Expand Down
12 changes: 8 additions & 4 deletions src/array/list/iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ impl<'a, A: IterableListArray> Iterator for ListValuesIter<'a, A> {
}
let old = self.index;
self.index += 1;
Some(self.array.value(old))
// Safety:
// self.end is maximized by the length of the array
Some(unsafe { self.array.value_unchecked(old) })
}

#[inline]
Expand All @@ -50,14 +52,16 @@ impl<'a, A: IterableListArray> DoubleEndedIterator for ListValuesIter<'a, A> {
None
} else {
self.end -= 1;
Some(self.array.value(self.end))
// Safety:
// self.end is maximized by the length of the array
Some(unsafe { self.array.value_unchecked(self.end) })
}
}
}

impl<O: Offset> IterableListArray for ListArray<O> {
fn value(&self, i: usize) -> Box<dyn Array> {
ListArray::<O>::value(self, i)
unsafe fn value_unchecked(&self, i: usize) -> Box<dyn Array> {
ListArray::<O>::value_unchecked(self, i)
}
}

Expand Down
31 changes: 27 additions & 4 deletions src/array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ impl<O: Offset> ListArray<O> {
let offset_1 = offsets[i + 1];
let length = (offset_1 - offset).to_usize();

self.values.slice(offset.to_usize(), length)
// Safety:
// One of the invariants of the struct
// is that offsets are in bounds
unsafe { self.values.slice_unchecked(offset.to_usize(), length) }
}
}

Expand All @@ -93,12 +96,29 @@ impl<O: Offset> ListArray<O> {
let offset_1 = *self.offsets.as_ptr().add(i + 1);
let length = (offset_1 - offset).to_usize();

self.values.slice(offset.to_usize(), length)
self.values.slice_unchecked(offset.to_usize(), length)
}

/// Returns a slice of this [`ListArray`].
/// # Panics
/// panics iff `offset + length >= self.len()`
pub fn slice(&self, offset: usize, length: usize) -> Self {
let validity = self.validity.clone().map(|x| x.slice(offset, length));
let offsets = self.offsets.clone().slice(offset, length + 1);
assert!(
offset + length <= self.len(),
"the offset of the new Buffer cannot exceed the existing length"
);
unsafe { self.slice_unchecked(offset, length) }
}

/// Returns a slice of this [`ListArray`].
/// # Safety
/// The caller must ensure that `offset + length < self.len()`.
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
let validity = self
.validity
.clone()
.map(|x| x.slice_unchecked(offset, length));
let offsets = self.offsets.clone().slice_unchecked(offset, length + 1);
Self {
data_type: self.data_type.clone(),
offsets,
Expand Down Expand Up @@ -186,6 +206,9 @@ impl<O: Offset> Array for ListArray<O> {
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice(offset, length))
}
unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice_unchecked(offset, length))
}
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Box::new(self.with_validity(validity))
}
Expand Down
Loading