Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added fast path for validating ASCII text (~1.12-1.89x improvement on reading ASCII parquet data) #542

Merged
merged 7 commits into from
Oct 19, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 35 additions & 18 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,28 +75,45 @@ pub fn check_offsets_minimal<O: Offset>(offsets: &[O], values_len: usize) -> usi
/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or
/// * any offset is larger than the length of `values`.
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) {
    // Per-item slices shorter than this get a plain ASCII scan first; longer
    // ones go straight to the SIMD validator (presumably the size at which
    // the SIMD path starts to pay off -- confirm against benchmarks).
    const SIMD_CHUNK_SIZE: usize = 64;

    // Fast path: an all-ASCII buffer is valid UTF-8 wherever it is split,
    // so only the offsets themselves need to be validated.
    if values.is_ascii() {
        check_offsets(offsets, values.len());
        return;
    }

    // Slow path: validate every item delimited by consecutive offsets.
    // Each item is validated exactly once.
    for window in offsets.windows(2) {
        let start = window[0].to_usize();
        let end = window[1].to_usize();
        // assert monotonicity
        assert!(start <= end);
        // assert bounds (slicing panics when `end > values.len()`)
        let slice = &values[start..end];

        // Fast ASCII check per item: ASCII bytes are always valid UTF-8.
        if slice.len() < SIMD_CHUNK_SIZE && slice.is_ascii() {
            continue;
        }

        // assert utf8
        simdutf8::basic::from_utf8(slice).expect("A non-utf8 string was passed.");
    }
}

/// Validates `offsets` against a values buffer of length `values_len`.
///
/// An empty `offsets` slice is accepted without any further check.
///
/// # Panics
/// Panics iff:
/// * `offsets` is not monotonically non-decreasing (equal consecutive offsets
///   are allowed), or
/// * any offset is larger than `values_len`.
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) {
    // Compare the `usize` views of the offsets so that the ordering used for
    // the monotonicity check matches the one used for the bounds check.
    let mut positions = offsets.iter().map(|offset| offset.to_usize());
    let mut last = match positions.next() {
        Some(first) => first,
        // no offsets: nothing to validate
        None => return,
    };

    // assert monotonicity (single pass; `last` ends up holding the final offset)
    assert!(positions.all(|end| {
        let monotone = last <= end;
        last = end;
        monotone
    }));
    // assert bounds: once monotonicity holds, the last offset is the largest,
    // so checking it alone bounds every offset.
    assert!(last <= values_len);
}