Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Faster utf8 validation: ~1.14-2x improvement (#823)
Browse files Browse the repository at this point in the history
  • Loading branch information
Dandandan authored Feb 8, 2022
1 parent 1dd1b19 commit 7e25ef2
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 13 deletions.
3 changes: 3 additions & 0 deletions benches/read_parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ fn add_benchmark(c: &mut Criterion) {
let a = format!("read utf8 large 2^{}", i);
c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 6).unwrap()));

let a = format!("read utf8 emoji 2^{}", i);
c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 12).unwrap()));

let a = format!("read bool 2^{}", i);
c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 3).unwrap()));

Expand Down
3 changes: 3 additions & 0 deletions parquet_integration/write_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def case_basic_nullable(size=1):
string_large = [
"ABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCD😃🌚🕳👊"
] * 10
emoji = ["😃"] * 10
decimal = [Decimal(e) if e is not None else None for e in int64]

fields = [
Expand All @@ -30,6 +31,7 @@ def case_basic_nullable(size=1):
pa.field("decimal_26", pa.decimal128(26, 0)),
pa.field("timestamp_us", pa.timestamp("us")),
pa.field("timestamp_s", pa.timestamp("s")),
pa.field("emoji", pa.utf8()),
]
schema = pa.schema(fields)

Expand All @@ -47,6 +49,7 @@ def case_basic_nullable(size=1):
"decimal_26": decimal * size,
"timestamp_us": int64 * size,
"timestamp_s": int64 * size,
"emoji": emoji * size,
},
schema,
f"basic_nullable_{size*10}.parquet",
Expand Down
58 changes: 45 additions & 13 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@ pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) {
/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or
/// * any offset is larger or equal to `values_len`.
pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Result<()> {
const SIMD_CHUNK_SIZE: usize = 64;
if values.is_ascii() {
try_check_offsets(offsets, values.len())
} else {
simdutf8::basic::from_utf8(values)?;

for window in offsets.windows(2) {
let start = window[0].to_usize();
let end = window[1].to_usize();
Expand All @@ -44,21 +45,23 @@ pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Re
return Err(ArrowError::oos("offsets must be monotonically increasing"));
}

// check bounds
if end > values.len() {
return Err(ArrowError::oos("offsets must not exceed values length"));
};

let slice = &values[start..end];
let first = values.get(start);

// fast ASCII check per item
if slice.len() < SIMD_CHUNK_SIZE && slice.is_ascii() {
continue;
if let Some(&b) = first {
// A valid code-point iff it does not start with 0b10xxxxxx
// Bit-magic taken from `std::str::is_char_boundary`
if (b as i8) < -0x40 {
return Err(ArrowError::oos("Non-valid char boundary detected"));
}
}

// check utf8
simdutf8::basic::from_utf8(slice)?;
}
// check bounds
if offsets
.last()
.map_or(false, |last| last.to_usize() > values.len())
{
return Err(ArrowError::oos("offsets must not exceed values length"));
};

Ok(())
}
Expand All @@ -85,3 +88,32 @@ pub fn try_check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> Result<
Ok(())
}
}

#[cfg(test)]
mod tests {
use proptest::prelude::*;

use super::*;

pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> {
prop::collection::vec(any::<u8>(), 1..100)
}

proptest! {
// a bit expensive, feel free to run it when changing the code above
//#![proptest_config(ProptestConfig::with_cases(100000))]
#[test]
#[cfg_attr(miri, ignore)] // miri and proptest do not work well
fn check_utf8_validation(values in binary_strategy()) {

for offset in 0..values.len() - 1 {
let offsets = vec![0, offset as i32, values.len() as i32];

let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok();
is_valid &= std::str::from_utf8(&values[offset..]).is_ok();

assert_eq!(try_check_offsets_and_utf8::<i32>(&offsets, &values).is_ok(), is_valid)
}
}
}
}

0 comments on commit 7e25ef2

Please sign in to comment.