This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
Improved performance of iterator of Utf8Array
and BinaryArray
(3-4x)
#427
Merged
+121
−49
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -226,3 +226,7 @@ harness = false | |
[[bench]] | ||
name = "hash_kernel" | ||
harness = false | ||
|
||
[[bench]] | ||
name = "iter_utf8" | ||
harness = false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
use arrow2::array::Utf8Array; | ||
|
||
use criterion::{criterion_group, criterion_main, Criterion}; | ||
|
||
fn add_benchmark(c: &mut Criterion) { | ||
(10..=20).step_by(2).for_each(|log2_size| { | ||
let size = 2usize.pow(log2_size); | ||
|
||
let array = Utf8Array::<i32>::from_trusted_len_values_iter( | ||
std::iter::repeat("aaa") | ||
.take(size / 2) | ||
.chain(std::iter::repeat("bbb").take(size / 2)), | ||
); | ||
|
||
c.bench_function(&format!("iter_values 2^{}", log2_size), |b| { | ||
b.iter(|| { | ||
for x in array.values_iter() { | ||
assert_eq!(x.len(), 3); | ||
} | ||
}) | ||
}); | ||
|
||
c.bench_function(&format!("iter 2^{}", log2_size), |b| { | ||
b.iter(|| { | ||
for x in array.iter() { | ||
assert_eq!(x.unwrap().len(), 3); | ||
} | ||
}) | ||
}); | ||
}) | ||
} | ||
|
||
criterion_group!(benches, add_benchmark); | ||
criterion_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,10 @@ mod mutable; | |
pub use mutable::*; | ||
|
||
/// A [`BinaryArray`] is a nullable array of bytes - the Arrow equivalent of `Vec<Option<Vec<u8>>>`. | ||
/// # Safety | ||
/// The following invariants hold: | ||
/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`. | ||
/// * `len` is equal to `validity.len()`, when defined. | ||
#[derive(Debug, Clone)] | ||
pub struct BinaryArray<O: Offset> { | ||
data_type: DataType, | ||
|
@@ -42,8 +46,10 @@ impl<O: Offset> BinaryArray<O> { | |
|
||
/// Creates a new [`BinaryArray`] from lower-level parts | ||
/// # Panics | ||
/// * The length of the offset buffer must be larger than 1 | ||
/// * The length of the values must be equal to the last offset value | ||
/// * the offsets are not monotonically increasing | ||
/// * The last offset is not equal to the values' length. | ||
/// * the validity's length is not equal to `offsets.len() - 1`. | ||
/// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`. | ||
pub fn from_data( | ||
data_type: DataType, | ||
offsets: Buffer<O>, | ||
|
@@ -114,25 +120,23 @@ impl<O: Offset> BinaryArray<O> { | |
/// # Panics | ||
/// iff `i >= self.len()` | ||
pub fn value(&self, i: usize) -> &[u8] { | ||
let offsets = self.offsets.as_slice(); | ||
let offset = offsets[i]; | ||
let offset_1 = offsets[i + 1]; | ||
let length = (offset_1 - offset).to_usize(); | ||
let offset = offset.to_usize(); | ||
let start = self.offsets[i].to_usize(); | ||
let end = self.offsets[i + 1].to_usize(); | ||
|
||
&self.values[offset..offset + length] | ||
// soundness: the invariant of the struct | ||
unsafe { self.values.get_unchecked(start..end) } | ||
} | ||
|
||
/// Returns the element at index `i` | ||
/// # Safety | ||
/// Assumes that the `i < self.len`. | ||
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { | ||
let offset = *self.offsets.get_unchecked(i); | ||
let offset_1 = *self.offsets.get_unchecked(i + 1); | ||
let length = (offset_1 - offset).to_usize(); | ||
let offset = offset.to_usize(); | ||
// soundness: the invariant of the function | ||
let start = self.offsets.get_unchecked(i).to_usize(); | ||
let end = self.offsets.get_unchecked(i + 1).to_usize(); | ||
|
||
&self.values[offset..offset + length] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this line |
||
// soundness: the invariant of the struct | ||
self.values.get_unchecked(start..end) | ||
} | ||
|
||
/// Returns the offsets that slice `.values()` to return valid values. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType}; | |
|
||
use super::{ | ||
display_fmt, | ||
specification::{check_offsets, check_offsets_and_utf8}, | ||
specification::{check_offsets_and_utf8, check_offsets_minimal}, | ||
Array, GenericBinaryArray, Offset, | ||
}; | ||
|
||
|
@@ -25,6 +25,11 @@ pub use mutable::*; | |
/// assert_eq!(array.offsets().as_slice(), &[0, 2, 2, 2 + 5]); | ||
/// # } | ||
/// ``` | ||
/// # Safety | ||
/// The following invariants hold: | ||
/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`. | ||
/// * A slice of `values` taken from two consecutives `offsets` is valid `utf8`. | ||
/// * `len` is equal to `validity.len()`, when defined. | ||
#[derive(Debug, Clone)] | ||
pub struct Utf8Array<O: Offset> { | ||
data_type: DataType, | ||
|
@@ -58,9 +63,9 @@ impl<O: Offset> Utf8Array<O> { | |
/// # Panics | ||
/// This function panics iff: | ||
/// * The `data_type`'s physical type is not consistent with the offset `O`. | ||
/// * The `offsets` and `values` are consistent | ||
/// * The `offsets` and `values` are inconsistent | ||
/// * The `values` between `offsets` are utf8 encoded | ||
/// * The validity is not `None` and its length is different from `offsets`'s length minus one. | ||
/// * The validity is not `None` and its length is different from `offsets.len() - 1`. | ||
pub fn from_data( | ||
data_type: DataType, | ||
offsets: Buffer<O>, | ||
|
@@ -94,16 +99,25 @@ impl<O: Offset> Utf8Array<O> { | |
} | ||
} | ||
|
||
/// The same as [`Utf8Array::from_data`] but does not check for valid utf8. | ||
/// The same as [`Utf8Array::from_data`] but does not check for offsets nor utf8 validity. | ||
/// # Safety | ||
/// `values` buffer must contain valid utf8 between every `offset` | ||
/// * `offsets` MUST be monotonically increasing; and | ||
/// * every slice of `values` constructed from `offsets` MUST be valid utf8 | ||
/// # Panics | ||
/// This function panics iff: | ||
/// * The `data_type`'s physical type is not consistent with the offset `O`. | ||
/// * The last element of `offsets` is different from `values.len()`. | ||
/// * The validity is not `None` and its length is different from `offsets.len() - 1`. | ||
pub unsafe fn from_data_unchecked( | ||
data_type: DataType, | ||
offsets: Buffer<O>, | ||
values: Buffer<u8>, | ||
validity: Option<Bitmap>, | ||
) -> Self { | ||
check_offsets(&offsets, values.len()); | ||
check_offsets_minimal(&offsets, values.len()); | ||
if let Some(ref validity) = validity { | ||
assert_eq!(offsets.len() - 1, validity.len()); | ||
} | ||
|
||
if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { | ||
panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") | ||
|
@@ -120,19 +134,31 @@ impl<O: Offset> Utf8Array<O> { | |
|
||
/// Returns the element at index `i` as &str | ||
/// # Safety | ||
/// This function is safe `iff` `i < self.len`. | ||
/// This function is safe iff `i < self.len`. | ||
pub unsafe fn value_unchecked(&self, i: usize) -> &str { | ||
// soundness: the invariant of the function | ||
let offset = *self.offsets.get_unchecked(i); | ||
let offset_1 = *self.offsets.get_unchecked(i + 1); | ||
let length = (offset_1 - offset).to_usize(); | ||
let offset = offset.to_usize(); | ||
let start = self.offsets.get_unchecked(i).to_usize(); | ||
let end = self.offsets.get_unchecked(i + 1).to_usize(); | ||
|
||
// soundness: the invariant of the struct | ||
let slice = self.values.get_unchecked(start..end); | ||
|
||
let slice = &self.values.as_slice()[offset..offset + length]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this line @ritchie46 |
||
// soundness: the invariant of the struct | ||
std::str::from_utf8_unchecked(slice) | ||
} | ||
|
||
/// Returns the element at index `i` | ||
pub fn value(&self, i: usize) -> &str { | ||
let start = self.offsets[i].to_usize(); | ||
let end = self.offsets[i + 1].to_usize(); | ||
|
||
// soundness: the invariant of the struct | ||
let slice = unsafe { self.values.get_unchecked(start..end) }; | ||
|
||
// soundness: we always check for utf8 soundness on constructors. | ||
unsafe { std::str::from_utf8_unchecked(slice) } | ||
} | ||
|
||
/// Returns a slice of this [`Utf8Array`]. | ||
/// # Implementation | ||
/// This operation is `O(1)` as it amounts to essentially increase two ref counts. | ||
|
@@ -163,19 +189,6 @@ impl<O: Offset> Utf8Array<O> { | |
arr | ||
} | ||
|
||
/// Returns the element at index `i` as &str | ||
pub fn value(&self, i: usize) -> &str { | ||
let offsets = self.offsets.as_slice(); | ||
let offset = offsets[i]; | ||
let offset_1 = offsets[i + 1]; | ||
let length = (offset_1 - offset).to_usize(); | ||
let offset = offset.to_usize(); | ||
|
||
let slice = &self.values.as_slice()[offset..offset + length]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this |
||
// soundness: we always check for utf8 soundness on constructors. | ||
unsafe { std::str::from_utf8_unchecked(slice) } | ||
} | ||
|
||
/// Returns the offsets of this [`Utf8Array`]. | ||
#[inline] | ||
pub fn offsets(&self) -> &Buffer<O> { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this line @ritchie46