From facdc91030b5ff657fd84a6f017a6758998935a6 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Tue, 19 Oct 2021 18:06:29 +0200 Subject: [PATCH 1/7] Add fast path for checking ascii text --- src/array/specification.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/array/specification.rs b/src/array/specification.rs index ceedb46b51f..8503529d180 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -75,6 +75,8 @@ pub fn check_offsets_minimal(offsets: &[O], values_len: usize) -> usi /// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or /// * any offset is larger or equal to `values_len`. pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { + const SIMD_CHUNK_SIZE: usize = 64; + offsets.windows(2).for_each(|window| { let start = window[0].to_usize(); let end = window[1].to_usize(); @@ -82,6 +84,12 @@ pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { assert!(start <= end); // assert bounds let slice = &values[start..end]; + + // Fast ASCII check + if slice.len() < SIMD_CHUNK_SIZE && slice.is_ascii() { + return; + } + // assert utf8 simdutf8::basic::from_utf8(slice).expect("A non-utf8 string was passed."); }); From 29988ac97fbeadf592c2086db757d76db609375e Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Tue, 19 Oct 2021 18:25:13 +0200 Subject: [PATCH 2/7] Move fast path before loop for ascii --- src/array/specification.rs | 43 ++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/src/array/specification.rs b/src/array/specification.rs index 8503529d180..da5ae11cc0d 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -77,22 +77,33 @@ pub fn check_offsets_minimal(offsets: &[O], values_len: usize) -> usi pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { const SIMD_CHUNK_SIZE: usize = 64; - offsets.windows(2).for_each(|window| { - let start = window[0].to_usize(); - let end = window[1].to_usize(); - // assert monotonicity - assert!(start <= end); - // assert bounds - let slice = &values[start..end]; - - // Fast ASCII check - if slice.len() < SIMD_CHUNK_SIZE && slice.is_ascii() { - return; - } - - // assert utf8 - simdutf8::basic::from_utf8(slice).expect("A non-utf8 string was passed."); - }); + let all_ascii = values.is_ascii(); + + if all_ascii { + offsets.windows(2).for_each(|window| { + let start = window[0].to_usize(); + let end = window[1].to_usize(); + // assert monotonicity, bounds + assert!(start <= end && end <= values.len()); + }); + } else { + offsets.windows(2).for_each(|window| { + let start = window[0].to_usize(); + let end = window[1].to_usize(); + // assert monotonicity + assert!(start <= end); + // assert bounds + let slice = &values[start..end]; + + // Fast ASCII check per item + if slice.len() < SIMD_CHUNK_SIZE && slice.is_ascii() { + return; + } + + // assert utf8 + simdutf8::basic::from_utf8(slice).expect("A non-utf8 string was passed."); + }); + } } /// # Panics iff: From b70c734d3415a089cd669ab79ee5260643ba483a Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Tue, 19 Oct 2021 20:21:44 +0200 Subject: [PATCH 3/7] Small simplification --- src/array/specification.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/array/specification.rs b/src/array/specification.rs index da5ae11cc0d..0e695621222 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -80,12 +80,17 @@ pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { let all_ascii = values.is_ascii(); if all_ascii { - offsets.windows(2).for_each(|window| { - let start = window[0].to_usize(); - let end = window[1].to_usize(); - // assert monotonicity, bounds - assert!(start <= end && end <= values.len()); - }); + if offsets.len() == 0 { + return; + } + let mut last = offsets[0]; + assert!(offsets.iter().skip(1).all(|&end| { + let monotone = last <= end; + last = end; + monotone + })); + // assert bounds + assert!(last.to_usize() <= values.len()); } else { offsets.windows(2).for_each(|window| { let start = window[0].to_usize(); From 579c0d094f4afd1c05482ab6847e720ed5a20aba Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Tue, 19 Oct 2021 20:37:39 +0200 Subject: [PATCH 4/7] Clippy --- src/array/specification.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/specification.rs b/src/array/specification.rs index 0e695621222..05b2dc95ad5 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -80,7 +80,7 @@ pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { let all_ascii = values.is_ascii(); if all_ascii { - if offsets.len() == 0 { + if offsets.is_empty() { return; } let mut last = offsets[0]; From 05f76e720190e0caa393683b5a8b071990f7c518 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Tue, 19 Oct 2021 20:42:48 +0200 Subject: [PATCH 5/7] Move implementation --- src/array/specification.rs | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/array/specification.rs b/src/array/specification.rs index 05b2dc95ad5..047ecf7f9e9 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -80,17 +80,7 @@ pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { let all_ascii = values.is_ascii(); if all_ascii { - if offsets.is_empty() { - return; - } - let mut last = offsets[0]; - assert!(offsets.iter().skip(1).all(|&end| { - let monotone = last <= end; - last = end; - monotone - })); - // assert bounds - assert!(last.to_usize() <= values.len()); + return check_offsets(offsets, values.len()); } else { offsets.windows(2).for_each(|window| { let start = window[0].to_usize(); @@ -115,12 +105,17 @@ pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { /// * the `offsets` is not monotonically increasing, or /// * any offset is larger or equal to `values_len`. pub fn check_offsets(offsets: &[O], values_len: usize) { - offsets.windows(2).for_each(|window| { - let start = window[0].to_usize(); - let end = window[1].to_usize(); - // assert monotonicity - assert!(start <= end); - // assert bound - assert!(end <= values_len); - }); + if offsets.is_empty() { + return; + } + + let mut last = offsets[0]; + // assert monotonicity + assert!(offsets.iter().skip(1).all(|&end| { + let monotone = last <= end; + last = end; + monotone + })); + // assert bounds + assert!(last.to_usize() <= values_len); } From ec2fc314ddceb339d57548e3a4e78de6a932a83e Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Tue, 19 Oct 2021 20:44:44 +0200 Subject: [PATCH 6/7] Inline condition --- src/array/specification.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/array/specification.rs b/src/array/specification.rs index 047ecf7f9e9..c644d0a5dea 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -77,9 +77,7 @@ pub fn check_offsets_minimal(offsets: &[O], values_len: usize) -> usi pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { const SIMD_CHUNK_SIZE: usize = 64; - let all_ascii = values.is_ascii(); - - if all_ascii { + if values.is_ascii() { return check_offsets(offsets, values.len()); } else { offsets.windows(2).for_each(|window| { From 9ba4b0f6f6eba07a0272e3e08f1773ba82a7f345 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Tue, 19 Oct 2021 20:46:37 +0200 Subject: [PATCH 7/7] Clippy --- src/array/specification.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/specification.rs b/src/array/specification.rs index c644d0a5dea..1edcb70653f 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -78,7 +78,7 @@ pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { const SIMD_CHUNK_SIZE: usize = 64; if values.is_ascii() { - return check_offsets(offsets, values.len()); + check_offsets(offsets, values.len()); } else { offsets.windows(2).for_each(|window| { let start = window[0].to_usize();