diff --git a/benches/read_parquet.rs b/benches/read_parquet.rs index bc1d8b569c2..3d1dc858588 100644 --- a/benches/read_parquet.rs +++ b/benches/read_parquet.rs @@ -57,6 +57,9 @@ fn add_benchmark(c: &mut Criterion) { let a = format!("read utf8 large 2^{}", i); c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 6).unwrap())); + let a = format!("read utf8 emoji 2^{}", i); + c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 12).unwrap())); + let a = format!("read bool 2^{}", i); c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 3).unwrap())); diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index b6f48abc6a5..bf2e39cb678 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -14,6 +14,7 @@ def case_basic_nullable(size=1): string_large = [ "ABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDšŸ˜ƒšŸŒššŸ•³šŸ‘Š" ] * 10 + emoji = ["šŸ˜ƒ"] * 10 decimal = [Decimal(e) if e is not None else None for e in int64] fields = [ @@ -30,6 +31,7 @@ def case_basic_nullable(size=1): pa.field("decimal_26", pa.decimal128(26, 0)), pa.field("timestamp_us", pa.timestamp("us")), pa.field("timestamp_s", pa.timestamp("s")), + pa.field("emoji", pa.utf8()), ] schema = pa.schema(fields) @@ -47,6 +49,7 @@ def case_basic_nullable(size=1): "decimal_26": decimal * size, "timestamp_us": int64 * size, "timestamp_s": int64 * size, + "emoji": emoji * size, }, schema, f"basic_nullable_{size*10}.parquet", diff --git a/src/array/specification.rs b/src/array/specification.rs index 5bdcd945594..530daed5300 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -31,10 +31,11 @@ pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) { /// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or /// * any offset is larger or equal to `values_len`.
pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Result<()> { - const SIMD_CHUNK_SIZE: usize = 64; if values.is_ascii() { try_check_offsets(offsets, values.len()) } else { + simdutf8::basic::from_utf8(values)?; + for window in offsets.windows(2) { let start = window[0].to_usize(); let end = window[1].to_usize(); @@ -44,21 +45,23 @@ pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Result<()> { return Err(ArrowError::oos("offsets must be monotonically increasing")); } - // check bounds - if end > values.len() { - return Err(ArrowError::oos("offsets must not exceed values length")); - }; - - let slice = &values[start..end]; + let first = values.get(start); - // fast ASCII check per item - if slice.len() < SIMD_CHUNK_SIZE && slice.is_ascii() { - continue; + if let Some(&b) = first { + // A valid code-point iff it does not start with 0b10xxxxxx + // Bit-magic taken from `std::str::is_char_boundary` + if (b as i8) < -0x40 { + return Err(ArrowError::oos("Non-valid char boundary detected")); + } } - - // check utf8 - simdutf8::basic::from_utf8(slice)?; } + // check bounds + if offsets + .last() + .map_or(false, |last| last.to_usize() > values.len()) + { + return Err(ArrowError::oos("offsets must not exceed values length")); + }; Ok(()) } @@ -85,3 +88,32 @@ pub fn try_check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> Result<()> { Ok(()) } } + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + + use super::*; + + pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> { + prop::collection::vec(any::<u8>(), 1..100) + } + + proptest!
{ + // a bit expensive, feel free to run it when changing the code above + //#![proptest_config(ProptestConfig::with_cases(100000))] + #[test] + #[cfg_attr(miri, ignore)] // miri and proptest do not work well + fn check_utf8_validation(values in binary_strategy()) { + + for offset in 0..values.len() - 1 { + let offsets = vec![0, offset as i32, values.len() as i32]; + + let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok(); + is_valid &= std::str::from_utf8(&values[offset..]).is_ok(); + + assert_eq!(try_check_offsets_and_utf8::<i32>(&offsets, &values).is_ok(), is_valid) + } + } + } +}