Faster utf8 validation: ~1.14-2x improvement (#823)

jorgecarleitao · Feb 8, 2022 · 7e25ef2 · 7e25ef2
1 parent 1dd1b19
commit 7e25ef2
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 13 deletions.
diff --git a/benches/read_parquet.rs b/benches/read_parquet.rs
@@ -57,6 +57,9 @@ fn add_benchmark(c: &mut Criterion) {
         let a = format!("read utf8 large 2^{}", i);
         c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 6).unwrap()));
 
+        let a = format!("read utf8 emoji 2^{}", i);
+        c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 12).unwrap()));
+
         let a = format!("read bool 2^{}", i);
         c.bench_function(&a, |b| b.iter(|| read_batch(&buffer, size, 3).unwrap()));
 

diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py
@@ -14,6 +14,7 @@ def case_basic_nullable(size=1):
     string_large = [
         "ABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCDABCD😃🌚🕳👊"
     ] * 10
+    emoji = ["😃"] * 10
     decimal = [Decimal(e) if e is not None else None for e in int64]
 
     fields = [
@@ -30,6 +31,7 @@ def case_basic_nullable(size=1):
         pa.field("decimal_26", pa.decimal128(26, 0)),
         pa.field("timestamp_us", pa.timestamp("us")),
         pa.field("timestamp_s", pa.timestamp("s")),
+        pa.field("emoji", pa.utf8()),
     ]
     schema = pa.schema(fields)
 
@@ -47,6 +49,7 @@ def case_basic_nullable(size=1):
             "decimal_26": decimal * size,
             "timestamp_us": int64 * size,
             "timestamp_s": int64 * size,
+            "emoji": emoji * size,
         },
         schema,
         f"basic_nullable_{size*10}.parquet",

diff --git a/src/array/specification.rs b/src/array/specification.rs
@@ -31,10 +31,11 @@ pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) {
 /// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or
 /// * any offset is larger or equal to `values_len`.
 pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Result<()> {
-    const SIMD_CHUNK_SIZE: usize = 64;
     if values.is_ascii() {
         try_check_offsets(offsets, values.len())
     } else {
+        simdutf8::basic::from_utf8(values)?;
+
         for window in offsets.windows(2) {
             let start = window[0].to_usize();
             let end = window[1].to_usize();
@@ -44,21 +45,23 @@ pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Re
                 return Err(ArrowError::oos("offsets must be monotonically increasing"));
             }
 
-            // check bounds
-            if end > values.len() {
-                return Err(ArrowError::oos("offsets must not exceed values length"));
-            };
-
-            let slice = &values[start..end];
+            let first = values.get(start);
 
-            // fast ASCII check per item
-            if slice.len() < SIMD_CHUNK_SIZE && slice.is_ascii() {
-                continue;
+            if let Some(&b) = first {
+                // A valid code-point iff it does not start with 0b10xxxxxx
+                // Bit-magic taken from `std::str::is_char_boundary`
+                if (b as i8) < -0x40 {
+                    return Err(ArrowError::oos("Non-valid char boundary detected"));
+                }
             }
-
-            // check utf8
-            simdutf8::basic::from_utf8(slice)?;
         }
+        // check bounds
+        if offsets
+            .last()
+            .map_or(false, |last| last.to_usize() > values.len())
+        {
+            return Err(ArrowError::oos("offsets must not exceed values length"));
+        };
 
         Ok(())
     }
@@ -85,3 +88,32 @@ pub fn try_check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> Result<
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use proptest::prelude::*;
+
+    use super::*;
+
+    pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> {
+        prop::collection::vec(any::<u8>(), 1..100)
+    }
+
+    proptest! {
+        // a bit expensive, feel free to run it when changing the code above
+        //#![proptest_config(ProptestConfig::with_cases(100000))]
+        #[test]
+        #[cfg_attr(miri, ignore)] // miri and proptest do not work well
+        fn check_utf8_validation(values in binary_strategy()) {
+
+            for offset in 0..values.len() - 1 {
+                let offsets = vec![0, offset as i32, values.len() as i32];
+
+                let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok();
+                is_valid &= std::str::from_utf8(&values[offset..]).is_ok();
+
+                assert_eq!(try_check_offsets_and_utf8::<i32>(&offsets, &values).is_ok(), is_valid)
+            }
+        }
+    }
+}