From bd78970a15e4a164a1e1af1a0d38bc59c162f660 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 19 Sep 2021 18:14:01 +0000 Subject: [PATCH] Improved iteration over Utf8Array and BinaryArray. --- src/array/binary/mod.rs | 30 ++++++++++-------- src/array/specification.rs | 28 ++++++++++++----- src/array/utf8/mod.rs | 63 +++++++++++++++++++++++--------------- src/array/utf8/mutable.rs | 6 ++-- 4 files changed, 79 insertions(+), 48 deletions(-) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index eea1fdfcad4..c7b0f65992f 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -13,6 +13,10 @@ mod mutable; pub use mutable::*; /// A [`BinaryArray`] is a nullable array of bytes - the Arrow equivalent of `Vec>>`. +/// # Safety +/// The following invariants hold: +/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`. +/// * `len` is equal to `validity.len()`, when defined. #[derive(Debug, Clone)] pub struct BinaryArray { data_type: DataType, @@ -42,8 +46,10 @@ impl BinaryArray { /// Creates a new [`BinaryArray`] from lower-level parts /// # Panics - /// * The length of the offset buffer must be larger than 1 - /// * The length of the values must be equal to the last offset value + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`. pub fn from_data( data_type: DataType, offsets: Buffer, @@ -114,25 +120,23 @@ impl BinaryArray { /// # Panics /// iff `i >= self.len()` pub fn value(&self, i: usize) -> &[u8] { - let offsets = self.offsets.as_slice(); - let offset = offsets[i]; - let offset_1 = offsets[i + 1]; - let length = (offset_1 - offset).to_usize(); - let offset = offset.to_usize(); + let start = self.offsets[i].to_usize(); + let end = self.offsets[i + 1].to_usize(); - &self.values[offset..offset + length] + // soundness: the invariant of the struct + unsafe { self.values.get_unchecked(start..end) } } /// Returns the element at index `i` /// # Safety /// Assumes that the `i < self.len`. pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - let offset = *self.offsets.get_unchecked(i); - let offset_1 = *self.offsets.get_unchecked(i + 1); - let length = (offset_1 - offset).to_usize(); - let offset = offset.to_usize(); + // soundness: the invariant of the function + let start = self.offsets.get_unchecked(i).to_usize(); + let end = self.offsets.get_unchecked(i + 1).to_usize(); - &self.values[offset..offset + length] + // soundness: the invariant of the struct + self.values.get_unchecked(start..end) } /// Returns the offsets that slice `.values()` to return valid values. diff --git a/src/array/specification.rs b/src/array/specification.rs index 276ba9a57c2..be479e50188 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -53,8 +53,7 @@ unsafe impl Offset for i64 { } } -#[inline] -pub fn check_offsets(offsets: &[O], values_len: usize) -> usize { +pub fn check_offsets_minimal(offsets: &[O], values_len: usize) -> usize { assert!( !offsets.is_empty(), "The length of the offset buffer must be larger than 1" @@ -71,15 +70,30 @@ pub fn check_offsets(offsets: &[O], values_len: usize) -> usize { len } -#[inline] -pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) -> usize { - let len = check_offsets(offsets, values.len()); +/// # Panics iff: +/// * the `offsets` is not monotonically increasing, or +/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or +/// * any offset is larger or equal to `values_len`. +pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { offsets.windows(2).for_each(|window| { let start = window[0].to_usize(); let end = window[1].to_usize(); assert!(end <= values.len()); - let slice = unsafe { std::slice::from_raw_parts(values.as_ptr().add(start), end - start) }; + // checks bounds + let slice = &values[start..end]; + // checks utf8 checks simdutf8::basic::from_utf8(slice).expect("A non-utf8 string was passed."); }); - len +} + +/// # Panics iff: +/// * the `offsets` is not monotonically increasing, or +/// * any offset is larger or equal to `values_len`. +pub fn check_offsets(offsets: &[O], values_len: usize) { + offsets.windows(2).for_each(|window| { + let start = window[0].to_usize(); + let end = window[1].to_usize(); + assert!(start <= end); + assert!(end <= values_len); + }); } diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 6033a8224b0..0e5ce24d9c2 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -2,7 +2,7 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType}; use super::{ display_fmt, - specification::{check_offsets, check_offsets_and_utf8}, + specification::{check_offsets_and_utf8, check_offsets_minimal}, Array, GenericBinaryArray, Offset, }; @@ -25,6 +25,11 @@ pub use mutable::*; /// assert_eq!(array.offsets().as_slice(), &[0, 2, 2, 2 + 5]); /// # } /// ``` +/// # Safety +/// The following invariants hold: +/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`. +/// * A slice of `values` taken from two consecutives `offsets` is valid `utf8`. +/// * `len` is equal to `validity.len()`, when defined. #[derive(Debug, Clone)] pub struct Utf8Array { data_type: DataType, @@ -58,9 +63,9 @@ impl Utf8Array { /// # Panics /// This function panics iff: /// * The `data_type`'s physical type is not consistent with the offset `O`. - /// * The `offsets` and `values` are consistent + /// * The `offsets` and `values` are inconsistent /// * The `values` between `offsets` are utf8 encoded - /// * The validity is not `None` and its length is different from `offsets`'s length minus one. + /// * The validity is not `None` and its length is different from `offsets.len() - 1`. pub fn from_data( data_type: DataType, offsets: Buffer, @@ -94,16 +99,25 @@ impl Utf8Array { } } - /// The same as [`Utf8Array::from_data`] but does not check for valid utf8. + /// The same as [`Utf8Array::from_data`] but does not check for offsets nor utf8 validity. /// # Safety - /// `values` buffer must contain valid utf8 between every `offset` + /// * `offsets` MUST be monotonically increasing; and + /// * every slice of `values` constructed from `offsets` MUST be valid utf8 + /// # Panics + /// This function panics iff: + /// * The `data_type`'s physical type is not consistent with the offset `O`. + /// * The last element of `offsets` is different from `values.len()`. + /// * The validity is not `None` and its length is different from `offsets.len() - 1`. pub unsafe fn from_data_unchecked( data_type: DataType, offsets: Buffer, values: Buffer, validity: Option, ) -> Self { - check_offsets(&offsets, values.len()); + check_offsets_minimal(&offsets, values.len()); + if let Some(ref validity) = validity { + assert_eq!(offsets.len() - 1, validity.len()); + } if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") @@ -120,19 +134,31 @@ impl Utf8Array { /// Returns the element at index `i` as &str /// # Safety - /// This function is safe `iff` `i < self.len`. + /// This function is safe iff `i < self.len`. pub unsafe fn value_unchecked(&self, i: usize) -> &str { // soundness: the invariant of the function - let offset = *self.offsets.get_unchecked(i); - let offset_1 = *self.offsets.get_unchecked(i + 1); - let length = (offset_1 - offset).to_usize(); - let offset = offset.to_usize(); + let start = self.offsets.get_unchecked(i).to_usize(); + let end = self.offsets.get_unchecked(i + 1).to_usize(); + + // soundness: the invariant of the struct + let slice = self.values.get_unchecked(start..end); - let slice = &self.values.as_slice()[offset..offset + length]; // soundness: the invariant of the struct std::str::from_utf8_unchecked(slice) } + /// Returns the element at index `i` + pub fn value(&self, i: usize) -> &str { + let start = self.offsets[i].to_usize(); + let end = self.offsets[i + 1].to_usize(); + + // soundness: the invariant of the struct + let slice = unsafe { self.values.get_unchecked(start..end) }; + + // soundness: we always check for utf8 soundness on constructors. + unsafe { std::str::from_utf8_unchecked(slice) } + } + /// Returns a slice of this [`Utf8Array`]. /// # Implementation /// This operation is `O(1)` as it amounts to essentially increase two ref counts. @@ -163,19 +189,6 @@ impl Utf8Array { arr } - /// Returns the element at index `i` as &str - pub fn value(&self, i: usize) -> &str { - let offsets = self.offsets.as_slice(); - let offset = offsets[i]; - let offset_1 = offsets[i + 1]; - let length = (offset_1 - offset).to_usize(); - let offset = offset.to_usize(); - - let slice = &self.values.as_slice()[offset..offset + length]; - // soundness: we always check for utf8 soundness on constructors. - unsafe { std::str::from_utf8_unchecked(slice) } - } - /// Returns the offsets of this [`Utf8Array`]. #[inline] pub fn offsets(&self) -> &Buffer { diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index cbc3904e022..ab1bb0744f1 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -2,7 +2,7 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ - specification::{check_offsets, check_offsets_and_utf8}, + specification::{check_offsets_and_utf8, check_offsets_minimal}, Array, MutableArray, Offset, TryExtend, TryPush, }, bitmap::MutableBitmap, @@ -93,7 +93,7 @@ impl MutableUtf8Array { values: MutableBuffer, validity: Option, ) -> Self { - check_offsets(&offsets, values.len()); + check_offsets_minimal(&offsets, values.len()); if let Some(ref validity) = validity { assert_eq!(offsets.len() - 1, validity.len()); } @@ -267,7 +267,7 @@ impl MutableUtf8Array { } /// Extends [`MutableUtf8Array`] from an iterator of trusted len. - /// #Safety + /// # Safety /// The iterator must be trusted len. #[inline] pub unsafe fn extend_trusted_len_unchecked(&mut self, iterator: I)