From ab38982965ece4ae55631d0f80216387275b2f58 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 4 Dec 2022 07:48:28 +0000 Subject: [PATCH] Added Offsets and OffsetsBuffer --- src/array/binary/ffi.rs | 13 +- src/array/binary/mod.rs | 123 +----- src/array/binary/mutable.rs | 45 +- src/array/binary/mutable_values.rs | 70 ++- src/array/growable/binary.rs | 26 +- src/array/growable/list.rs | 77 +--- src/array/growable/utf8.rs | 26 +- src/array/growable/utils.rs | 10 - src/array/list/ffi.rs | 9 +- src/array/list/mod.rs | 98 +--- src/array/list/mutable.rs | 154 ++----- src/array/physical_binary.rs | 140 ++---- src/array/utf8/ffi.rs | 9 +- src/array/utf8/mod.rs | 45 +- src/array/utf8/mutable.rs | 14 +- src/array/utf8/mutable_values.rs | 31 +- src/compute/cast/binary_to.rs | 22 +- src/compute/cast/mod.rs | 29 +- src/compute/cast/primitive_to.rs | 12 +- src/compute/cast/utf8_to.rs | 27 +- src/compute/substring.rs | 12 +- src/compute/take/generic_binary.rs | 36 +- src/io/avro/read/nested.rs | 33 +- src/io/ipc/read/array/binary.rs | 2 +- src/io/ipc/read/array/list.rs | 2 +- src/io/ipc/read/array/utf8.rs | 2 +- src/io/ipc/write/serialize.rs | 4 +- src/io/json/read/deserialize.rs | 80 ++-- src/io/json_integration/read/array.rs | 8 +- src/io/odbc/read/deserialize.rs | 12 +- src/io/orc/read/mod.rs | 37 +- .../parquet/read/deserialize/binary/basic.rs | 20 +- .../read/deserialize/binary/dictionary.rs | 5 +- .../parquet/read/deserialize/binary/utils.rs | 53 +-- src/io/parquet/read/deserialize/mod.rs | 14 +- .../parquet/read/deserialize/nested_utils.rs | 30 +- src/io/parquet/read/statistics/list.rs | 11 +- src/io/parquet/write/pages.rs | 2 +- src/offset.rs | 417 ++++++++++++++++++ src/types/index.rs | 8 + 40 files changed, 842 insertions(+), 926 deletions(-) diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index f592773f56e..70af1ddd9eb 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -2,7 +2,7 @@ use crate::{ array::{FromFfi, ToFfi}, bitmap::align, ffi, - offset::Offset, + offset::{Offset, OffsetsBuffer}, }; use crate::error::Result; @@ -19,7 +19,7 @@ unsafe impl ToFfi for BinaryArray { } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -32,7 +32,7 @@ unsafe impl ToFfi for BinaryArray { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -59,8 +59,9 @@ impl FromFfi for BinaryArray { let offsets = unsafe { array.buffer::(1) }?; let values = unsafe { array.buffer::(2) }?; - Ok(Self::from_data_unchecked( - data_type, offsets, values, validity, - )) + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + + Ok(Self::new(data_type, offsets, values, validity)) } } diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 612d988215e..b870d19ebee 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -6,16 +6,13 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::Error, - offset::Offset, + offset::{Offset, OffsetsBuffer}, trusted_len::TrustedLen, }; use either::Either; -use super::{ - specification::{try_check_offsets, try_check_offsets_bounds}, - Array, GenericBinaryArray, -}; +use super::{specification::try_check_offsets_bounds, Array, GenericBinaryArray}; mod 
ffi; pub(super) mod fmt; @@ -60,7 +57,7 @@ pub use mutable::*; #[derive(Clone)] pub struct BinaryArray { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, } @@ -70,7 +67,6 @@ impl BinaryArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * the validity's length is not equal to `offsets.len() - 1`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. @@ -78,11 +74,11 @@ impl BinaryArray { /// This function is `O(N)` - checking monotinicity is `O(N)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { - try_check_offsets(&offsets, values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if validity .as_ref() @@ -131,7 +127,7 @@ impl BinaryArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Returns the element at index `i` @@ -170,7 +166,7 @@ impl BinaryArray { /// Returns the offsets of this [`BinaryArray`]. #[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } @@ -251,21 +247,16 @@ impl BinaryArray { match bitmap.into_mut() { // Safety: invariants are preserved Left(bitmap) => Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - self.offsets, - self.values, - Some(bitmap), - ) + BinaryArray::new(self.data_type, self.offsets, self.values, Some(bitmap)) }), Right(mutable_bitmap) => match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { (None, None) => { // Safety: invariants are preserved Left(unsafe { - BinaryArray::new_unchecked( + BinaryArray::new( self.data_type, self.offsets, self.values, @@ -276,7 +267,7 @@ impl BinaryArray { (None, Some(offsets)) => { // Safety: invariants are preserved Left(unsafe { - BinaryArray::new_unchecked( + BinaryArray::new( self.data_type, offsets.into(), self.values, @@ -287,7 +278,7 @@ impl BinaryArray { (Some(mutable_values), None) => { // Safety: invariants are preserved Left(unsafe { - BinaryArray::new_unchecked( + BinaryArray::new( self.data_type, self.offsets, mutable_values.into(), @@ -308,16 +299,16 @@ impl BinaryArray { } else { match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { (None, None) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, self.offsets, self.values, None) + BinaryArray::new(self.data_type, self.offsets, self.values, None) }), (None, Some(offsets)) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, offsets.into(), self.values, None) + BinaryArray::new(self.data_type, offsets.into(), self.values, None) }), (Some(values), None) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, self.offsets, values.into(), None) + BinaryArray::new(self.data_type, self.offsets, values.into(), None) }), (Some(values), Some(offsets)) => Right(unsafe { MutableBinaryArray::from_data(self.data_type, offsets, values, None) @@ -328,12 +319,7 @@ impl BinaryArray { /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero. 
pub fn new_empty(data_type: DataType) -> Self { - Self::new( - data_type, - Buffer::from(vec![O::zero()]), - Buffer::new(), - None, - ) + Self::new(data_type, OffsetsBuffer::new(), Buffer::new(), None) } /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`. @@ -341,7 +327,7 @@ impl BinaryArray { pub fn new_null(data_type: DataType, length: usize) -> Self { Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::default(); 1 + length].try_into().unwrap(), Buffer::new(), Some(Bitmap::new_zeroed(length)), ) @@ -356,72 +342,16 @@ impl BinaryArray { } } - /// Creates a new [`BinaryArray`] without checking for offsets monotinicity. - /// - /// # Errors - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn try_new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Result { - try_check_offsets_bounds(&offsets, values.len())?; - - if validity - .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) - { - return Err(Error::oos( - "validity mask length must match the number of values", - )); - } - - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - return Err(Error::oos( - "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary", - )); - } - - Ok(Self { - data_type, - offsets, - values, - validity, - }) - } - /// Alias for unwrapping [`Self::try_new`] pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { Self::try_new(data_type, offsets, values, validity).unwrap() } - /// Alias for unwrapping [`Self::try_new_unchecked`] - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() - } - /// Returns a [`BinaryArray`] from an iterator of trusted length. 
/// /// The [`BinaryArray`] is guaranteed to not have a validity @@ -487,23 +417,10 @@ impl BinaryArray { unsafe { Self::try_from_trusted_len_iter_unchecked(iter) } } - /// Alias for [`Self::new_unchecked`] - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - pub unsafe fn from_data_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::new_unchecked(data_type, offsets, values, validity) - } - /// Alias for `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 36f31eee5a5..811e07419ce 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -8,7 +8,7 @@ use crate::{ }, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -54,7 +54,6 @@ impl MutableBinaryArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * the validity's length is not equal to `offsets.len() - 1`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. @@ -62,7 +61,7 @@ impl MutableBinaryArray { /// This function is `O(N)` - checking monotinicity is `O(N)` pub fn try_new( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Result { @@ -80,26 +79,6 @@ impl MutableBinaryArray { Ok(Self { values, validity }) } - /// Create a [`MutableBinaryArray`] out of its inner attributes. - /// # Safety - /// The caller must ensure that every value between offsets is a valid utf8. - /// # Panics - /// This function panics iff: - /// * The `offsets` and `values` are inconsistent - /// * The validity is not `None` and its length is different from `offsets`'s length minus one. - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Vec, - values: Vec, - validity: Option, - ) -> Self { - let values = MutableBinaryValuesArray::new_unchecked(data_type, offsets, values); - if let Some(ref validity) = validity { - assert_eq!(values.len(), validity.len()); - } - Self { values, validity } - } - /// Creates a new [`MutableBinaryArray`] from a slice of optional `&[u8]`. // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { @@ -185,7 +164,7 @@ impl MutableBinaryArray { /// Equivalent to `Self::try_new(...).unwrap()` pub fn from_data( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -200,7 +179,7 @@ impl MutableBinaryArray { } /// returns its offsets. - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { self.values.offsets() } @@ -229,14 +208,12 @@ impl MutableArray for MutableBinaryArray { // `MutableBinaryArray` has the same invariants as `BinaryArray` and thus // `BinaryArray` can be safely created from `MutableBinaryArray` without checks. 
let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); - unsafe { - BinaryArray::new_unchecked( - data_type, - offsets.into(), - values.into(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - } + BinaryArray::new( + data_type, + offsets.into(), + values.into(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) .boxed() } @@ -246,7 +223,7 @@ impl MutableArray for MutableBinaryArray { // `BinaryArray` can be safely created from `MutableBinaryArray` without checks. let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); unsafe { - BinaryArray::new_unchecked( + BinaryArray::new( data_type, offsets.into(), values.into(), diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 1d608b7403f..b633a11e685 100644 --- a/src/array/binary/mutable_values.rs +++ b/src/array/binary/mutable_values.rs @@ -2,13 +2,13 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ - specification::{check_offsets_minimal, try_check_offsets}, + specification::{check_offsets_minimal, try_check_offsets_bounds}, Array, ArrayAccessor, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -20,23 +20,18 @@ use crate::array::physical_binary::*; #[derive(Debug, Clone)] pub struct MutableBinaryValuesArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, } impl From> for BinaryArray { fn from(other: MutableBinaryValuesArray) -> Self { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. - unsafe { - BinaryArray::::from_data_unchecked( - other.data_type, - other.offsets.into(), - other.values.into(), - None, - ) - } + BinaryArray::::new( + other.data_type, + other.offsets.into(), + other.values.into(), + None, + ) } } @@ -45,12 +40,7 @@ impl From> for MutableBinaryArray { // Safety: // `MutableBinaryValuesArray` has the same invariants as `MutableBinaryArray` unsafe { - MutableBinaryArray::::new_unchecked( - other.data_type, - other.offsets, - other.values, - None, - ) + MutableBinaryArray::::from_data(other.data_type, other.offsets, other.values, None) } } } @@ -66,7 +56,7 @@ impl MutableBinaryValuesArray { pub fn new() -> Self { Self { data_type: Self::default_data_type(), - offsets: vec![O::default()], + offsets: Offsets::new(), values: Vec::::new(), } } @@ -75,13 +65,12 @@ impl MutableBinaryValuesArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. 
/// # Implementation /// This function is `O(N)` - checking monotinicity is `O(N)` - pub fn try_new(data_type: DataType, offsets: Vec, values: Vec) -> Result { - try_check_offsets(&offsets, values.len())?; + pub fn try_new(data_type: DataType, offsets: Offsets, values: Vec) -> Result { + try_check_offsets_bounds(&offsets, values.len())?; if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { return Err(Error::oos( "MutableBinaryValuesArray can only be initialized with DataType::Binary or DataType::LargeBinary", @@ -106,7 +95,7 @@ impl MutableBinaryValuesArray { /// * the offsets are monotonically increasing /// # Implementation /// This function is `O(1)` - pub unsafe fn new_unchecked(data_type: DataType, offsets: Vec, values: Vec) -> Self { + pub unsafe fn new_unchecked(data_type: DataType, offsets: Offsets, values: Vec) -> Self { check_offsets_minimal(&offsets, values.len()); if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { @@ -133,12 +122,9 @@ impl MutableBinaryValuesArray { /// Initializes a new [`MutableBinaryValuesArray`] with a pre-allocated capacity of items and values. pub fn with_capacities(capacity: usize, values: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { data_type: Self::default_data_type(), - offsets, + offsets: Offsets::::with_capacity(capacity), values: Vec::::with_capacity(values), } } @@ -151,26 +137,26 @@ impl MutableBinaryValuesArray { /// returns its offsets. #[inline] - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } /// Reserves `additional` elements and `additional_values` on the values. #[inline] pub fn reserve(&mut self, additional: usize, additional_values: usize) { - self.offsets.reserve(additional + 1); + self.offsets.reserve(additional); self.values.reserve(additional_values); } /// Returns the capacity in number of items pub fn capacity(&self) -> usize { - self.offsets.capacity() - 1 + self.offsets.capacity() } /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Pushes a new item to the array. @@ -188,7 +174,7 @@ impl MutableBinaryValuesArray { return None; } self.offsets.pop()?; - let start = self.offsets.last()?.to_usize(); + let start = self.offsets.last().to_usize(); let value = self.values.split_off(start); Some(value.to_vec()) } @@ -227,7 +213,7 @@ impl MutableBinaryValuesArray { } /// Extract the low-end APIs from the [`MutableBinaryValuesArray`]. - pub fn into_inner(self) -> (DataType, Vec, Vec) { + pub fn into_inner(self) -> (DataType, Offsets, Vec) { (self.data_type, self.offsets, self.values) } } @@ -246,8 +232,7 @@ impl MutableArray for MutableBinaryValuesArray { // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. let (data_type, offsets, values) = std::mem::take(self).into_inner(); - unsafe { BinaryArray::from_data_unchecked(data_type, offsets.into(), values.into(), None) } - .boxed() + unsafe { BinaryArray::new(data_type, offsets.into(), values.into(), None) }.boxed() } fn as_arc(&mut self) -> Arc { @@ -255,8 +240,7 @@ impl MutableArray for MutableBinaryValuesArray { // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. 
let (data_type, offsets, values) = std::mem::take(self).into_inner(); - unsafe { BinaryArray::from_data_unchecked(data_type, offsets.into(), values.into(), None) } - .arced() + unsafe { BinaryArray::new(data_type, offsets.into(), values.into(), None) }.arced() } fn data_type(&self) -> &DataType { @@ -388,11 +372,7 @@ impl> TryPush for MutableBinaryValuesArray { fn try_push(&mut self, value: T) -> Result<()> { let bytes = value.as_ref(); self.values.extend_from_slice(bytes); - - let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; - - self.offsets.push(size); - Ok(()) + self.offsets.try_push_usize(bytes.len()) } } @@ -413,6 +393,6 @@ unsafe impl<'a, O: Offset> ArrayAccessor<'a> for MutableBinaryValuesArray { impl TryExtendFromSelf for MutableBinaryValuesArray { fn try_extend_from_self(&mut self, other: &Self) -> Result<()> { self.values.extend_from_slice(&other.values); - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } diff --git a/src/array/growable/binary.rs b/src/array/growable/binary.rs index fd91590a25d..ad70425321e 100644 --- a/src/array/growable/binary.rs +++ b/src/array/growable/binary.rs @@ -4,11 +4,11 @@ use crate::{ array::{Array, BinaryArray}, bitmap::MutableBitmap, datatypes::DataType, - offset::Offset, + offset::{Offset, Offsets}, }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, Growable, }; @@ -18,8 +18,7 @@ pub struct GrowableBinary<'a, O: Offset> { data_type: DataType, validity: MutableBitmap, values: Vec, - offsets: Vec, - length: O, // always equal to the last offset at `offsets`. + offsets: Offsets, extend_null_bits: Vec>, } @@ -41,16 +40,11 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { .map(|array| build_extend_null_bits(*array, use_validity)) .collect(); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays, data_type, values: Vec::with_capacity(0), - offsets, - length, + offsets: Offsets::with_capacity(capacity), validity: MutableBitmap::with_capacity(capacity), extend_null_bits, } @@ -74,18 +68,16 @@ impl<'a, O: Offset> Growable<'a> for GrowableBinary<'a, O> { let offsets = array.offsets(); let values = array.values(); - extend_offsets::( - &mut self.offsets, - &mut self.length, - &offsets[start..start + len + 1], - ); + self.offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + // values extend_offset_values::(&mut self.values, offsets, values, start, len); } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.length); + self.offsets.extend_constant(additional); self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/list.rs b/src/array/growable/list.rs index 3fcbe3c4539..378fd6d93d3 100644 --- a/src/array/growable/list.rs +++ b/src/array/growable/list.rs @@ -3,12 +3,12 @@ use std::sync::Arc; use crate::{ array::{Array, ListArray}, bitmap::MutableBitmap, - offset::Offset, + offset::{Offset, Offsets}, }; use super::{ make_growable, - utils::{build_extend_null_bits, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, ExtendNullBits}, Growable, }; @@ -21,37 +21,15 @@ fn extend_offset_values( let array = growable.arrays[index]; let offsets = array.offsets(); - if array.null_count() == 0 { - // offsets - extend_offsets::( - &mut growable.offsets, - &mut growable.last_offset, - 
&offsets[start..start + len + 1], - ); - - let end = offsets[start + len].to_usize(); - let start = offsets[start].to_usize(); - let len = end - start; - growable.values.extend(index, start, len) - } else { - growable.offsets.reserve(len); - - let new_offsets = &mut growable.offsets; - let inner_values = &mut growable.values; - let last_offset = &mut growable.last_offset; - (start..start + len).for_each(|i| { - if array.is_valid(i) { - let len = offsets[i + 1] - offsets[i]; - // compute the new offset - *last_offset += len; - - // append value - inner_values.extend(index, offsets[i].to_usize(), len.to_usize()); - } - // append offset - new_offsets.push(*last_offset); - }) - } + growable + .offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + + let end = offsets[start + len].to_usize(); + let start = offsets[start].to_usize(); + let len = end - start; + growable.values.extend(index, start, len); } /// Concrete [`Growable`] for the [`ListArray`]. @@ -59,8 +37,7 @@ pub struct GrowableList<'a, O: Offset> { arrays: Vec<&'a ListArray>, validity: MutableBitmap, values: Box + 'a>, - offsets: Vec, - last_offset: O, // always equal to the last offset at `offsets`. + offsets: Offsets, extend_null_bits: Vec>, } @@ -86,16 +63,11 @@ impl<'a, O: Offset> GrowableList<'a, O> { .collect::>(); let values = make_growable(&inner, use_validity, 0); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays, - offsets, + offsets: Offsets::with_capacity(capacity), values, validity: MutableBitmap::with_capacity(capacity), - last_offset: O::default(), extend_null_bits, } } @@ -105,20 +77,12 @@ impl<'a, O: Offset> GrowableList<'a, O> { let offsets = std::mem::take(&mut self.offsets); let values = self.values.as_box(); - #[cfg(debug_assertions)] - { - crate::array::specification::try_check_offsets(&offsets, values.len()).unwrap(); - } - - // Safety - the invariant of this struct ensures that this is up-held - unsafe { - ListArray::::new_unchecked( - self.arrays[0].data_type().clone(), - offsets.into(), - values, - validity.into(), - ) - } + ListArray::::new( + self.arrays[0].data_type().clone(), + offsets.into(), + values, + validity.into(), + ) } } @@ -129,8 +93,7 @@ impl<'a, O: Offset> Growable<'a> for GrowableList<'a, O> { } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.last_offset); + self.offsets.extend_constant(additional); self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index eed8ba30159..a34bc9a9584 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -2,12 +2,12 @@ use std::sync::Arc; use crate::{ array::{Array, Utf8Array}, - offset::Offset, bitmap::MutableBitmap, + offset::{Offset, Offsets}, }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, Growable, }; @@ -16,8 +16,7 @@ pub struct GrowableUtf8<'a, O: Offset> { arrays: Vec<&'a Utf8Array>, validity: MutableBitmap, values: Vec, - offsets: Vec, - length: O, // always equal to the last offset at `offsets`. 
+ offsets: Offsets, extend_null_bits: Vec>, } @@ -37,15 +36,10 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { .map(|array| build_extend_null_bits(*array, use_validity)) .collect(); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays: arrays.to_vec(), values: Vec::with_capacity(0), - offsets, - length, + offsets: Offsets::with_capacity(capacity), validity: MutableBitmap::with_capacity(capacity), extend_null_bits, } @@ -81,18 +75,16 @@ impl<'a, O: Offset> Growable<'a> for GrowableUtf8<'a, O> { let offsets = array.offsets(); let values = array.values(); - extend_offsets::( - &mut self.offsets, - &mut self.length, - &offsets[start..start + len + 1], - ); + self.offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + // values extend_offset_values::(&mut self.values, offsets, values, start, len); } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.length); + self.offsets.extend_constant(additional); self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/utils.rs b/src/array/growable/utils.rs index d06c1116d48..06a85cd9ad4 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -1,15 +1,5 @@ use crate::{array::Array, bitmap::MutableBitmap, offset::Offset}; -pub(super) fn extend_offsets(buffer: &mut Vec, last_offset: &mut T, offsets: &[T]) { - buffer.reserve(offsets.len() - 1); - offsets.windows(2).for_each(|offsets| { - // compute the new offset - let length = offsets[1] - offsets[0]; - *last_offset += length; - buffer.push(*last_offset); - }); -} - // function used to extend nulls from arrays. This function's lifetime is bound to the array // because it reads nulls from it. pub(super) type ExtendNullBits<'a> = Box; diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 9d0b19a85e0..c670dbde903 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -1,6 +1,6 @@ use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; -use crate::offset::Offset; +use crate::offset::{Offset, OffsetsBuffer}; use super::super::{ffi::ToFfi, Array}; use super::ListArray; @@ -18,7 +18,7 @@ unsafe impl ToFfi for ListArray { } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -31,7 +31,7 @@ unsafe impl ToFfi for ListArray { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -58,6 +58,9 @@ impl FromFfi for ListArray { let child = unsafe { array.child(0)? 
}; let values = ffi::try_from(child)?; + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 7740307799d..7795d2dc428 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -1,17 +1,12 @@ use crate::{ bitmap::Bitmap, - buffer::Buffer, datatypes::{DataType, Field}, error::Error, - offset::Offset, + offset::{Offset, OffsetsBuffer}, }; use std::sync::Arc; -use super::{ - new_empty_array, - specification::{try_check_offsets, try_check_offsets_bounds}, - Array, -}; +use super::{new_empty_array, specification::try_check_offsets_bounds, Array}; mod ffi; pub(super) mod fmt; @@ -24,7 +19,7 @@ pub use mutable::*; #[derive(Clone)] pub struct ListArray { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, } @@ -34,7 +29,6 @@ impl ListArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * the validity's length is not equal to `offsets.len() - 1`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. @@ -43,11 +37,11 @@ impl ListArray { /// This function is `O(N)` - checking monotinicity is `O(N)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Result { - try_check_offsets(&offsets, values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if validity .as_ref() @@ -78,7 +72,6 @@ impl ListArray { /// /// # Panics /// This function panics iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * the validity's length is not equal to `offsets.len() - 1`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. @@ -87,7 +80,7 @@ impl ListArray { /// This function is `O(N)` - checking monotinicity is `O(N)` pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Self { @@ -97,7 +90,7 @@ impl ListArray { /// Alias of `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Self { @@ -107,7 +100,7 @@ impl ListArray { /// Returns a new empty [`ListArray`]. pub fn new_empty(data_type: DataType) -> Self { let values = new_empty_array(Self::get_child_type(&data_type).clone()); - Self::new(data_type, Buffer::from(vec![O::zero()]), values, None) + Self::new(data_type, OffsetsBuffer::default(), values, None) } /// Returns a new null [`ListArray`]. @@ -116,7 +109,7 @@ impl ListArray { let child = Self::get_child_type(&data_type).clone(); Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::zero(); 1 + length].try_into().unwrap(), new_empty_array(child), Some(Bitmap::new_zeroed(length)), ) @@ -133,77 +126,6 @@ impl ListArray { } } -// unsafe construtors -impl ListArray { - /// Creates a new [`ListArray`]. - /// - /// # Errors - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. 
- /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. - /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn try_new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Box, - validity: Option, - ) -> Result { - try_check_offsets_bounds(&offsets, values.len())?; - - if validity - .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) - { - return Err(Error::oos( - "validity mask length must match the number of values", - )); - } - - let child_data_type = Self::try_get_child(&data_type)?.data_type(); - let values_data_type = values.data_type(); - if child_data_type != values_data_type { - return Err(Error::oos( - format!("ListArray's child's DataType must match. However, the expected DataType is {child_data_type:?} while it got {values_data_type:?}."), - )); - } - - Ok(Self { - data_type, - offsets, - values, - validity, - }) - } - - /// Creates a new [`ListArray`]. - /// - /// # Panics - /// This function panics iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. - /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Box, - validity: Option, - ) -> Self { - Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() - } -} - impl ListArray { /// Returns a slice of this [`ListArray`]. /// # Panics @@ -295,7 +217,7 @@ impl ListArray { /// The offsets [`Buffer`]. 
#[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 97785a01740..7ff6044d7f9 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -2,14 +2,13 @@ use std::sync::Arc; use crate::{ array::{ - physical_binary::{extend_validity, try_extend_offsets}, - specification::try_check_offsets, - Array, MutableArray, TryExtend, TryExtendFromSelf, TryPush, + physical_binary::extend_validity, Array, MutableArray, TryExtend, TryExtendFromSelf, + TryPush, }, bitmap::MutableBitmap, datatypes::{DataType, Field}, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -19,7 +18,7 @@ use super::ListArray; #[derive(Debug, Clone)] pub struct MutableListArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: M, validity: Option, } @@ -37,8 +36,7 @@ impl MutableListArray { let values = M::default(); let data_type = ListArray::::default_datatype(values.data_type().clone()); - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); + let offsets = Offsets::::with_capacity(capacity); Self { data_type, offsets, @@ -56,16 +54,12 @@ impl Default for MutableListArray { impl From> for ListArray { fn from(mut other: MutableListArray) -> Self { - // Safety: - // MutableListArray has monotonically increasing offsets - unsafe { - ListArray::new_unchecked( - other.data_type, - other.offsets.into(), - other.values.as_box(), - other.validity.map(|x| x.into()), - ) - } + ListArray::new( + other.data_type, + other.offsets.into(), + other.values.as_box(), + other.validity.map(|x| x.into()), + ) } } @@ -113,16 +107,14 @@ where extend_validity(self.len(), &mut self.validity, &other.validity); self.values.try_extend_from_self(&other.values)?; - - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } impl MutableListArray { /// Creates a new [`MutableListArray`] from a [`MutableArray`] and capacity. pub fn new_from(values: M, data_type: DataType, capacity: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); + let offsets = Offsets::::with_capacity(capacity); assert_eq!(values.len(), 0); ListArray::::get_child_field(&data_type); Self { @@ -154,11 +146,11 @@ impl MutableListArray { /// Needs to be called when a valid value was extended to this array. /// This is a relatively low level function, prefer `try_push` when you can. pub fn try_push_valid(&mut self) -> Result<()> { - let size = self.values.len(); - let size = O::from_usize(size).ok_or(Error::Overflow)?; - assert!(size >= *self.offsets.last().unwrap()); + let total_length = self.values.len(); + let offset = self.offsets.last().to_usize(); + let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?; - self.offsets.push(size); + self.offsets.try_push_usize(length)?; if let Some(validity) = &mut self.validity { validity.push(true) } @@ -167,7 +159,7 @@ impl MutableListArray { #[inline] fn push_null(&mut self) { - self.offsets.push(self.last_offset()); + self.offsets.extend_constant(1); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), @@ -176,79 +168,30 @@ impl MutableListArray { /// Expand this array, using elements from the underlying backing array. /// Assumes the expansion begins at the highest previous offset, or zero if - /// this [MutableListArray] is currently empty. 
+ /// this [`MutableListArray`] is currently empty. /// /// Panics if: /// - the new offsets are not in monotonic increasing order. /// - any new offset is not in bounds of the backing array. /// - the passed iterator has no upper bound. #[allow(dead_code)] - pub(crate) fn extend_offsets(&mut self, expansion: II) - where - II: TrustedLen>, - { - let current_len = self.offsets.len(); - let (_, upper) = expansion.size_hint(); - let upper = upper.expect("iterator must have upper bound"); - if current_len == 0 && upper > 0 { - self.offsets.push(O::zero()); - } - // safety: checked below - unsafe { self.unsafe_extend_offsets(expansion) }; - if self.offsets.len() > current_len { - // check all inserted offsets - try_check_offsets(&self.offsets[current_len..], self.values.len()) - .expect("invalid offsets"); - } - // else expansion is empty, and this is trivially safe. - } - - /// Expand this array, using elements from the underlying backing array. - /// Assumes the expansion begins at the highest previous offset, or zero if - /// this [MutableListArray] is currently empty. - /// - /// # Safety - /// - /// Assumes that `offsets` are in order, and do not overrun the underlying - /// `values` backing array. - /// - /// Also assumes the expansion begins at the highest previous offset, or - /// zero if the array is currently empty. - /// - /// Panics if the passed iterator has no upper bound. - #[allow(dead_code)] - pub(crate) unsafe fn unsafe_extend_offsets(&mut self, expansion: II) + pub(crate) fn try_extend_from_lengths(&mut self, iterator: II) -> Result<()> where - II: TrustedLen>, + II: TrustedLen> + Clone, { - let (_, upper) = expansion.size_hint(); - let upper = upper.expect("iterator must have upper bound"); - let final_size = self.len() + upper; - self.offsets.reserve(upper); - - for item in expansion { - match item { - Some(offset) => { - self.offsets.push(offset); - if let Some(validity) = &mut self.validity { - validity.push(true); - } - } - None => self.push_null(), - } - - if let Some(validity) = &mut self.validity { - if validity.capacity() < final_size { - validity.reserve(final_size - validity.capacity()); - } - } + self.offsets + .try_extend_from_lengths(iterator.clone().map(|x| x.unwrap_or_default()))?; + if let Some(validity) = &mut self.validity { + validity.extend_from_trusted_len_iter(iterator.map(|x| x.is_some())) } + assert_eq!(self.offsets.last().to_usize(), self.values.len()); + Ok(()) } /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// The values @@ -257,7 +200,7 @@ impl MutableListArray { } /// The offsets - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } @@ -266,11 +209,6 @@ impl MutableListArray { &self.values } - #[inline] - fn last_offset(&self) -> O { - *self.offsets.last().unwrap() - } - fn init_validity(&mut self) { let len = self.offsets.len() - 1; @@ -320,29 +258,23 @@ impl MutableArray for MutableListArray Box { - // Safety: - // MutableListArray has monotonically increasing offsets - Box::new(unsafe { - ListArray::new_unchecked( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - self.values.as_box(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - }) + ListArray::new( + self.data_type.clone(), + std::mem::take(&mut self.offsets).into(), + self.values.as_box(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + .boxed() } fn as_arc(&mut self) -> Arc { - // Safety: - // MutableListArray has monotonically 
increasing offsets - Arc::new(unsafe { - ListArray::new_unchecked( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - self.values.as_box(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - }) + ListArray::new( + self.data_type.clone(), + std::mem::take(&mut self.offsets).into(), + self.values.as_box(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + .arced() } fn data_type(&self) -> &DataType { diff --git a/src/array/physical_binary.rs b/src/array/physical_binary.rs index 825ba01e5d0..adbf62d6c27 100644 --- a/src/array/physical_binary.rs +++ b/src/array/physical_binary.rs @@ -1,6 +1,5 @@ use crate::bitmap::MutableBitmap; -use crate::error::Error; -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; /// # Safety /// The caller must ensure that `iterator` is `TrustedLen`. @@ -8,7 +7,7 @@ use crate::offset::Offset; #[allow(clippy::type_complexity)] pub(crate) unsafe fn try_trusted_len_unzip( iterator: I, -) -> std::result::Result<(Option, Vec, Vec), E> +) -> std::result::Result<(Option, Offsets, Vec), E> where O: Offset, P: AsRef<[u8]>, @@ -45,7 +44,7 @@ where ); offsets.set_len(len + 1); - Ok((null.into(), offsets, values)) + Ok((null.into(), Offsets::new_unchecked(offsets), values)) } /// Creates [`MutableBitmap`] and two [`Vec`]s from an iterator of `Option`. @@ -56,7 +55,7 @@ where #[inline] pub(crate) unsafe fn trusted_len_unzip( iterator: I, -) -> (Option, Vec, Vec) +) -> (Option, Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -65,12 +64,10 @@ where let (_, upper) = iterator.size_hint(); let len = upper.expect("trusted_len_unzip requires an upper limit"); - let mut offsets = Vec::::with_capacity(len + 1); + let mut offsets = Offsets::::with_capacity(len); let mut values = Vec::::new(); let mut validity = MutableBitmap::new(); - offsets.push(O::default()); - extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator); let validity = if validity.unset_bits() > 0 { @@ -87,7 +84,7 @@ where /// # Safety /// The caller must ensure that `iterator` is [`TrustedLen`]. 
#[inline] -pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Vec, Vec) +pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -96,11 +93,9 @@ where let (_, upper) = iterator.size_hint(); let len = upper.expect("trusted_len_unzip requires an upper limit"); - let mut offsets = Vec::::with_capacity(len + 1); + let mut offsets = Offsets::::with_capacity(len); let mut values = Vec::::new(); - offsets.push(O::default()); - extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator); (offsets, values) @@ -112,7 +107,7 @@ where // The caller must ensure the `iterator` is [`TrustedLen`] #[inline] pub(crate) unsafe fn extend_from_trusted_len_values_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, iterator: I, ) where @@ -120,42 +115,13 @@ pub(crate) unsafe fn extend_from_trusted_len_values_iter( P: AsRef<[u8]>, I: Iterator, { - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); - - offsets.reserve(additional); - - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); - - // Get a mutable pointer to the `offsets`, and move the pointer - // to the position, where a new value will be written - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { + let lengths = iterator.map(|item| { let s = item.as_ref(); - - // Calculate the new offset value - length += O::from_usize(s.len()).unwrap(); - // Push new entries for both `values` and `offsets` buffer values.extend_from_slice(s); - std::ptr::write(dst, length); - - // Move to the next position in offset buffer - dst = dst.add(1); - } - - debug_assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "TrustedLen iterator's length was not accurately reported" - ); - - // We make sure to set the new length for the `offsets` buffer - offsets.set_len(offsets.len() + additional); + s.len() + }); + offsets.try_extend_from_lengths(lengths).unwrap(); } // Populates `offsets` and `values` [`Vec`]s with information extracted @@ -163,7 +129,7 @@ pub(crate) unsafe fn extend_from_trusted_len_values_iter( // the return value indicates how many items were added. 
#[inline] pub(crate) fn extend_from_values_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, iterator: I, ) -> usize @@ -176,18 +142,12 @@ where offsets.reserve(size_hint); - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); let start_index = offsets.len(); for item in iterator { - let s = item.as_ref(); - // Calculate the new offset value - length += O::from_usize(s.len()).unwrap(); - - values.extend_from_slice(s); - offsets.push(length); + let bytes = item.as_ref(); + values.extend_from_slice(bytes); + offsets.try_push_usize(bytes.len()).unwrap(); } offsets.len() - start_index } @@ -199,7 +159,7 @@ where // The caller must ensure that `iterator` is [`TrustedLen`] #[inline] pub(crate) unsafe fn extend_from_trusted_len_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, validity: &mut MutableBitmap, iterator: I, @@ -214,51 +174,24 @@ pub(crate) unsafe fn extend_from_trusted_len_iter( offsets.reserve(additional); validity.reserve(additional); - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); - - // Get a mutable pointer to the `offsets`, and move the pointer - // to the position, where a new value will be written - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { + let lengths = iterator.map(|item| { if let Some(item) = item { let bytes = item.as_ref(); - - // Calculate new offset value - length += O::from_usize(bytes.len()).unwrap(); - - // Push new values for `values` and `validity` buffer values.extend_from_slice(bytes); validity.push_unchecked(true); + bytes.len() } else { - // If `None`, update only `validity` validity.push_unchecked(false); + 0 } - - // Push new offset or old offset depending on the `item` - std::ptr::write(dst, length); - - // Move to the next position in offset buffer - dst = dst.add(1); - } - - debug_assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "TrustedLen iterator's length was not accurately reported" - ); - - // We make sure to set the new length for the `offsets` buffer - offsets.set_len(offsets.len() + additional); + }); + offsets.try_extend_from_lengths(lengths).unwrap(); } /// Creates two [`Vec`]s from an iterator of `&[u8]`. /// The first buffer corresponds to a offset buffer, the second to a values buffer. 
#[inline] -pub(crate) fn values_iter(iterator: I) -> (Vec, Vec) +pub(crate) fn values_iter(iterator: I) -> (Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -266,40 +199,17 @@ where { let (lower, _) = iterator.size_hint(); - let mut offsets = Vec::::with_capacity(lower + 1); + let mut offsets = Offsets::::with_capacity(lower); let mut values = Vec::::new(); - let mut length = O::default(); - offsets.push(length); - for item in iterator { let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); values.extend_from_slice(s); - - offsets.push(length) + offsets.try_push_usize(s.len()).unwrap(); } (offsets, values) } -/// Extends `offsets` with all offsets from `other` -#[inline] -pub(crate) fn try_extend_offsets(offsets: &mut Vec, other: &[O]) -> Result<(), Error> -where - O: Offset, -{ - let lengths = other.windows(2).map(|w| w[1] - w[0]); - let mut last = *offsets.last().unwrap(); - - offsets.reserve(other.len() - 1); - for length in lengths { - let r = last.checked_add(&length).ok_or(Error::Overflow)?; - last += length; - offsets.push(r) - } - Ok(()) -} - /// Extends `validity` with all items from `other` pub(crate) fn extend_validity( length: usize, diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 2152dfade45..97ff152a036 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -3,7 +3,7 @@ use crate::{ bitmap::align, error::Result, ffi, - offset::Offset, + offset::{Offset, OffsetsBuffer}, }; use super::Utf8Array; @@ -18,7 +18,7 @@ unsafe impl ToFfi for Utf8Array { } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -31,7 +31,7 @@ unsafe impl ToFfi for Utf8Array { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -57,6 +57,9 @@ impl FromFfi for Utf8Array { let offsets = unsafe { array.buffer::(1) }?; let values = unsafe { array.buffer::(2)? }; + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + Ok(Self::from_data_unchecked( data_type, offsets, values, validity, )) diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index f011183ce6e..2f3130696de 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -6,7 +6,7 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, OffsetsBuffer}, trusted_len::TrustedLen, }; @@ -69,7 +69,7 @@ impl> AsRef<[u8]> for StrAsBytes { #[derive(Clone)] pub struct Utf8Array { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, } @@ -89,7 +89,7 @@ impl Utf8Array { /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -145,7 +145,7 @@ impl Utf8Array { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Returns the value of the element at index `i`, ignoring the array's validity. @@ -187,7 +187,7 @@ impl Utf8Array { /// Returns the offsets of this [`Utf8Array`]. 
#[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } @@ -278,7 +278,7 @@ impl Utf8Array { }), Right(mutable_bitmap) => match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { (None, None) => { // Safety: invariants are preserved @@ -326,7 +326,7 @@ impl Utf8Array { } else { match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { (None, None) => Left(unsafe { Utf8Array::new_unchecked(self.data_type, self.offsets, self.values, None) @@ -349,14 +349,7 @@ impl Utf8Array { /// The array is guaranteed to have no elements nor validity. #[inline] pub fn new_empty(data_type: DataType) -> Self { - unsafe { - Self::from_data_unchecked( - data_type, - Buffer::from(vec![O::zero()]), - Buffer::new(), - None, - ) - } + unsafe { Self::from_data_unchecked(data_type, OffsetsBuffer::new(), Buffer::new(), None) } } /// Returns a new [`Utf8Array`] whose all slots are null / `None`. @@ -364,7 +357,7 @@ impl Utf8Array { pub fn new_null(data_type: DataType, length: usize) -> Self { Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::default(); 1 + length].try_into().unwrap(), Buffer::new(), Some(Bitmap::new_zeroed(length)), ) @@ -388,13 +381,12 @@ impl Utf8Array { /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// # Safety /// This function is unsound iff: - /// * the offsets are not monotonically increasing /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation /// This function is `O(1)` pub unsafe fn try_new_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -426,7 +418,6 @@ impl Utf8Array { /// Creates a new [`Utf8Array`]. /// # Panics /// This function panics iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * the validity's length is not equal to `offsets.len() - 1`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. 
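// Illustrative sketch (editor addition, not part of the patch): how the new
// `Offsets`/`OffsetsBuffer` pair introduced here is meant to be used, based only
// on the API surface visible in this diff (`Offsets::with_capacity`,
// `try_push_usize`, the `Offsets` -> `OffsetsBuffer` conversion via `.into()`,
// and the checked `Utf8Array::new` constructor). Module paths and exact
// signatures are assumptions until `src/offset.rs` itself is reviewed.
use arrow2::{
    array::Utf8Array,
    datatypes::DataType,
    error::Result,
    offset::{Offsets, OffsetsBuffer},
};

fn build_utf8(items: &[&str]) -> Result<Utf8Array<i32>> {
    // Lengths are pushed rather than raw offsets, so monotonicity holds by
    // construction and the O(N) `try_check_offsets` pass removed throughout this
    // patch is no longer needed at array-construction time.
    let mut offsets = Offsets::<i32>::with_capacity(items.len());
    let mut values = Vec::<u8>::new();
    for item in items {
        values.extend_from_slice(item.as_bytes());
        // checked push of a length: errors on i32 overflow instead of panicking
        offsets.try_push_usize(item.len())?;
    }
    // freeze the mutable `Offsets` into the immutable `OffsetsBuffer` stored by arrays
    let offsets: OffsetsBuffer<i32> = offsets.into();
    Ok(Utf8Array::new(DataType::Utf8, offsets, values.into(), None))
}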
@@ -435,7 +426,7 @@ impl Utf8Array { /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -457,7 +448,7 @@ impl Utf8Array { /// This function is `O(1)` pub unsafe fn new_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -530,7 +521,7 @@ impl Utf8Array { /// Alias for `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -540,11 +531,10 @@ impl Utf8Array { /// Alias for [`Self::new_unchecked`] /// # Safety /// This function is unsafe iff: - /// * the offsets are not monotonically increasing /// * The `values` between two consecutive `offsets` are not valid utf8 pub unsafe fn from_data_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -611,11 +601,6 @@ impl Default for Utf8Array { } else { DataType::Utf8 }; - Utf8Array::new( - data_type, - vec![O::from_usize(0).unwrap()].into(), - Default::default(), - None, - ) + Utf8Array::new(data_type, Default::default(), Default::default(), None) } } diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 4dc9b1304b8..d3e633648f2 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -9,7 +9,7 @@ use crate::{ }, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -62,7 +62,7 @@ impl MutableUtf8Array { /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` pub fn try_new( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Result { @@ -89,7 +89,7 @@ impl MutableUtf8Array { /// * The validity is not `None` and its length is different from `offsets`'s length minus one. pub unsafe fn new_unchecked( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -105,7 +105,7 @@ impl MutableUtf8Array { /// The caller must ensure that every value between offsets is a valid utf8. pub unsafe fn from_data_unchecked( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -120,7 +120,7 @@ impl MutableUtf8Array { /// * The validity is not `None` and its length is different from `offsets`'s length minus one. pub fn from_data( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -231,7 +231,7 @@ impl MutableUtf8Array { } /// Extract the low-end APIs from the [`MutableUtf8Array`]. - pub fn into_data(self) -> (DataType, Vec, Vec, Option) { + pub fn into_data(self) -> (DataType, Offsets, Vec, Option) { let (data_type, offsets, values) = self.values.into_inner(); (data_type, offsets, values, self.validity) } @@ -249,7 +249,7 @@ impl MutableUtf8Array { } /// returns its offsets. 
- pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { self.values.offsets() } } diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index 0da7ff8ff7c..15fbc586c42 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -8,7 +8,7 @@ use crate::{ bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -20,7 +20,7 @@ use crate::array::physical_binary::*; #[derive(Debug, Clone)] pub struct MutableUtf8ValuesArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, } @@ -66,7 +66,7 @@ impl MutableUtf8ValuesArray { pub fn new() -> Self { Self { data_type: Self::default_data_type(), - offsets: vec![O::default()], + offsets: Offsets::new(), values: Vec::::new(), } } @@ -81,7 +81,7 @@ impl MutableUtf8ValuesArray { /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` - pub fn try_new(data_type: DataType, offsets: Vec, values: Vec) -> Result { + pub fn try_new(data_type: DataType, offsets: Offsets, values: Vec) -> Result { try_check_offsets_and_utf8(&offsets, &values)?; if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { return Err(Error::oos( @@ -108,7 +108,7 @@ impl MutableUtf8ValuesArray { /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation /// This function is `O(1)` - pub unsafe fn new_unchecked(data_type: DataType, offsets: Vec, values: Vec) -> Self { + pub unsafe fn new_unchecked(data_type: DataType, offsets: Offsets, values: Vec) -> Self { check_offsets_minimal(&offsets, values.len()); if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { @@ -135,12 +135,9 @@ impl MutableUtf8ValuesArray { /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items and values. pub fn with_capacities(capacity: usize, values: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { data_type: Self::default_data_type(), - offsets, + offsets: Offsets::::with_capacity(capacity), values: Vec::::with_capacity(values), } } @@ -153,7 +150,7 @@ impl MutableUtf8ValuesArray { /// returns its offsets. #[inline] - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } @@ -172,7 +169,7 @@ impl MutableUtf8ValuesArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Pushes a new item to the array. @@ -190,7 +187,7 @@ impl MutableUtf8ValuesArray { return None; } self.offsets.pop()?; - let start = self.offsets.last()?.to_usize(); + let start = self.offsets.last().to_usize(); let value = self.values.split_off(start); // Safety: utf8 is validated on initialization Some(unsafe { String::from_utf8_unchecked(value) }) @@ -233,7 +230,7 @@ impl MutableUtf8ValuesArray { } /// Extract the low-end APIs from the [`MutableUtf8ValuesArray`]. 
- pub fn into_inner(self) -> (DataType, Vec, Vec) { + pub fn into_inner(self) -> (DataType, Offsets, Vec) { (self.data_type, self.offsets, self.values) } } @@ -401,17 +398,13 @@ impl> TryPush for MutableUtf8ValuesArray { fn try_push(&mut self, value: T) -> Result<()> { let bytes = value.as_ref().as_bytes(); self.values.extend_from_slice(bytes); - - let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; - - self.offsets.push(size); - Ok(()) + self.offsets.try_push_usize(bytes.len()) } } impl TryExtendFromSelf for MutableUtf8ValuesArray { fn try_extend_from_self(&mut self, other: &Self) -> Result<()> { self.values.extend_from_slice(&other.values); - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index d4b63c1e73d..98cf4105b4b 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -1,6 +1,4 @@ -use std::convert::TryFrom; - -use crate::error::{Error, Result}; +use crate::error::Result; use crate::offset::Offset; use crate::{array::*, datatypes::DataType, types::NativeType}; @@ -9,11 +7,9 @@ use super::CastOptions; /// Conversion of binary pub fn binary_to_large_binary(from: &BinaryArray, to_data_type: DataType) -> BinaryArray { let values = from.values().clone(); - let offsets = from.offsets().iter().map(|x| *x as i64).collect::>(); - // todo: use `new_unchecked` since all invariants are preserved BinaryArray::::new( to_data_type, - offsets.into(), + from.offsets().into(), values, from.validity().cloned(), ) @@ -25,13 +21,10 @@ pub fn binary_large_to_binary( to_data_type: DataType, ) -> Result> { let values = from.values().clone(); - let _ = i32::try_from(*from.offsets().last().unwrap()).map_err(Error::from_external_error)?; - - let offsets = from.offsets().iter().map(|x| *x as i32).collect::>(); - // todo: use `new_unchecked` since all invariants are preserved + let offsets = from.offsets().try_into()?; Ok(BinaryArray::::new( to_data_type, - offsets.into(), + offsets, values, from.validity().cloned(), )) @@ -58,12 +51,7 @@ pub fn binary_to_large_utf8( to_data_type: DataType, ) -> Result> { let values = from.values().clone(); - let offsets = from - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); + let offsets = from.offsets().into(); Utf8Array::::try_new(to_data_type, offsets, values, from.validity().cloned()) } diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index eda7bda78a0..9e27d0a1194 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -18,7 +18,7 @@ use crate::{ array::*, datatypes::*, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, }; /// options defining how Cast kernels behave @@ -324,28 +324,18 @@ fn cast_list( } fn cast_list_to_large_list(array: &ListArray, to_type: &DataType) -> ListArray { - let offets = array - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); + let offsets = array.offsets().into(); ListArray::::new( to_type.clone(), - offets, + offsets, array.values().clone(), array.validity().cloned(), ) } fn cast_large_to_list(array: &ListArray, to_type: &DataType) -> ListArray { - let offsets = array - .offsets() - .iter() - .map(|x| *x as i32) - .collect::>() - .into(); + let offsets = array.offsets().try_into().expect("Conver me to error"); ListArray::::new( to_type.clone(), @@ -366,14 +356,15 @@ fn cast_fixed_size_list_to_list( options, )?; - let offsets = (0..(fixed.len() + 1)) + let offsets = 
(0..=fixed.len()) .map(|ix| (ix * fixed.size()) as i32) - .collect::>() - .into(); + .collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; Ok(ListArray::::new( to_type.clone(), - offsets, + offsets.into(), new_values, fixed.validity().cloned(), )) @@ -478,6 +469,8 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu let values = cast(array, &to.data_type, options)?; // create offsets, where if array.len() = 2, we have [0,1,2] let offsets = (0..=array.len() as i32).collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 6c5c7bb8753..4feb5aaba6c 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -4,7 +4,7 @@ use num_traits::{AsPrimitive, Float, ToPrimitive}; use crate::datatypes::IntervalUnit; use crate::error::Result; -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; use crate::types::{days_ms, f16, months_days_ns}; use crate::{ array::*, @@ -42,7 +42,9 @@ pub fn primitive_to_binary( } values.set_len(offset); values.shrink_to_fit(); - BinaryArray::::from_data_unchecked( + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + BinaryArray::::new( BinaryArray::::default_data_type(), offsets.into(), values.into(), @@ -104,11 +106,13 @@ pub fn primitive_to_utf8( let len = lexical_core::write_unchecked(*x, bytes).len(); offset += len; - offsets.push(O::from_usize(offset as usize).unwrap()); + offsets.push(O::from_usize(offset).unwrap()); } values.set_len(offset); values.shrink_to_fit(); - Utf8Array::::from_data_unchecked( + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + Utf8Array::::new_unchecked( Utf8Array::::default_data_type(), offsets.into(), values.into(), diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 2625497ed67..165c24a1025 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -1,11 +1,9 @@ -use std::convert::TryFrom; - use chrono::Datelike; use crate::{ array::*, datatypes::DataType, - error::{Error, Result}, + error::Result, offset::Offset, temporal_conversions::{ utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_, @@ -150,13 +148,9 @@ pub fn utf8_to_large_utf8(from: &Utf8Array) -> Utf8Array { let data_type = Utf8Array::::default_data_type(); let validity = from.validity().cloned(); let values = from.values().clone(); - let offsets = from - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); - // Safety: sound because `offsets` fulfills the same invariants as `from.offsets()` + + let offsets = from.offsets().into(); + // Safety: sound because `values` fulfills the same invariants as `from.values()` unsafe { Utf8Array::::from_data_unchecked(data_type, offsets, values, validity) } } @@ -165,22 +159,17 @@ pub fn utf8_large_to_utf8(from: &Utf8Array) -> Result> { let data_type = Utf8Array::::default_data_type(); let validity = from.validity().cloned(); let values = from.values().clone(); - let _ = i32::try_from(*from.offsets().last().unwrap()).map_err(Error::from_external_error)?; + let offsets = from.offsets().try_into()?; - let offsets = from - .offsets() - .iter() - .map(|x| *x as i32) - .collect::>() 
- .into(); - // Safety: sound because `offsets` fulfills the same invariants as `from.offsets()` + // Safety: sound because `values` fulfills the same invariants as `from.values()` Ok(unsafe { Utf8Array::::from_data_unchecked(data_type, offsets, values, validity) }) } /// Conversion to binary pub fn utf8_to_binary(from: &Utf8Array, to_data_type: DataType) -> BinaryArray { + // Safety: erasure of an invariant is always safe unsafe { - BinaryArray::::new_unchecked( + BinaryArray::::new( to_data_type, from.offsets().clone(), from.values().clone(), diff --git a/src/compute/substring.rs b/src/compute/substring.rs index 1edab3a0cb1..2c5ddd078a6 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -21,7 +21,7 @@ use crate::{ array::*, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, }; fn utf8_substring(array: &Utf8Array, start: O, length: &Option) -> Utf8Array { @@ -77,12 +77,9 @@ fn binary_substring( let offsets = array.offsets(); let values = array.values(); - let mut new_offsets = Vec::::with_capacity(array.len() + 1); + let mut new_offsets = Offsets::::with_capacity(array.len()); let mut new_values = Vec::::new(); // we have no way to estimate how much this will be. - let mut length_so_far = O::zero(); - new_offsets.push(length_so_far); - offsets.windows(2).for_each(|windows| { let length_i: O = windows[1] - windows[0]; @@ -99,8 +96,9 @@ fn binary_substring( .unwrap_or(length_i) // .max(0) is not needed as it is guaranteed .min(windows[1] - start); // so we do not go beyond this entry - length_so_far += length; - new_offsets.push(length_so_far); + new_offsets + .try_push(length) + .expect("Substring is always smaller than original - overflow never happens"); // we need usize for ranges let start = start.to_usize(); diff --git a/src/compute/take/generic_binary.rs b/src/compute/take/generic_binary.rs index 3656aebc771..bb6a5aad44e 100644 --- a/src/compute/take/generic_binary.rs +++ b/src/compute/take/generic_binary.rs @@ -2,7 +2,7 @@ use crate::{ array::{GenericBinaryArray, PrimitiveArray}, bitmap::{Bitmap, MutableBitmap}, buffer::Buffer, - offset::Offset, + offset::{Offset, Offsets, OffsetsBuffer}, }; use super::Index; @@ -26,8 +26,8 @@ pub fn take_no_validity( offsets: &[O], values: &[u8], indices: &[I], -) -> (Buffer, Buffer, Option) { - let mut length = O::default(); +) -> (OffsetsBuffer, Buffer, Option) { + let mut length = O::zero(); let mut buffer = Vec::::new(); let offsets = indices.iter().map(|index| { let index = index.to_usize(); @@ -40,19 +40,20 @@ pub fn take_no_validity( buffer.extend_from_slice(&values[_start..end]); length }); - let offsets = std::iter::once(O::default()) + let offsets = std::iter::once(O::zero()) .chain(offsets) - .collect::>() - .into(); + .collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; - (offsets, buffer.into(), None) + (offsets.into(), buffer.into(), None) } // take implementation when only values contain nulls pub fn take_values_validity>( values: &A, indices: &[I], -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let validity_values = values.validity().unwrap(); let validity = indices .iter() @@ -75,8 +76,10 @@ pub fn take_values_validity>( let offsets = std::iter::once(O::default()) .chain(offsets) .collect::>(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; - let buffer = take_values(length, starts.as_slice(), 
offsets.as_slice(), values_values); + let buffer = take_values(length, starts.as_slice(), &offsets, values_values); (offsets.into(), buffer, validity.into()) } @@ -86,7 +89,7 @@ pub fn take_indices_validity( offsets: &[O], values: &[u8], indices: &PrimitiveArray, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let mut length = O::default(); let mut starts = Vec::::with_capacity(indices.len()); @@ -105,9 +108,10 @@ pub fn take_indices_validity( let offsets = std::iter::once(O::default()) .chain(offsets) .collect::>(); - let starts: Buffer = starts.into(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; - let buffer = take_values(length, starts.as_slice(), offsets.as_slice(), values); + let buffer = take_values(length, &starts, &offsets, values); (offsets.into(), buffer, indices.validity().cloned()) } @@ -116,7 +120,7 @@ pub fn take_indices_validity( pub fn take_values_indices_validity>( values: &A, indices: &PrimitiveArray, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let mut length = O::default(); let mut validity = MutableBitmap::with_capacity(indices.len()); @@ -148,10 +152,10 @@ pub fn take_values_indices_validity>(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; - let starts: Buffer = starts.into(); - - let buffer = take_values(length, starts.as_slice(), offsets.as_slice(), values_values); + let buffer = take_values(length, &starts, &offsets, values_values); (offsets.into(), buffer, validity.into()) } diff --git a/src/io/avro/read/nested.rs b/src/io/avro/read/nested.rs index 04d9bcf43b6..a14af9d4898 100644 --- a/src/io/avro/read/nested.rs +++ b/src/io/avro/read/nested.rs @@ -2,26 +2,24 @@ use crate::array::*; use crate::bitmap::*; use crate::datatypes::*; use crate::error::*; -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; /// Auxiliary struct #[derive(Debug)] pub struct DynMutableListArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Box, validity: Option, } impl DynMutableListArray { pub fn new_from(values: Box, data_type: DataType, capacity: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); assert_eq!(values.len(), 0); ListArray::::get_child_field(&data_type); Self { data_type, - offsets, + offsets: Offsets::::with_capacity(capacity), values, validity: None, } @@ -34,11 +32,11 @@ impl DynMutableListArray { #[inline] pub fn try_push_valid(&mut self) -> Result<()> { - let size = self.values.len(); - let size = O::from_usize(size).ok_or(Error::Overflow)?; - assert!(size >= *self.offsets.last().unwrap()); + let total_length = self.values.len(); + let offset = self.offsets.last().to_usize(); + let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?; - self.offsets.push(size); + self.offsets.try_push_usize(length)?; if let Some(validity) = &mut self.validity { validity.push(true) } @@ -47,18 +45,13 @@ impl DynMutableListArray { #[inline] fn push_null(&mut self) { - self.offsets.push(self.last_offset()); + self.offsets.extend_constant(1); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), } } - #[inline] - fn last_offset(&self) -> O { - *self.offsets.last().unwrap() - } - fn init_validity(&mut self) { let len = self.offsets.len() - 1; @@ -79,21 +72,23 @@ impl MutableArray for DynMutableListArray { } fn as_box(&mut self) -> Box { - 
Box::new(ListArray::new( + ListArray::new( self.data_type.clone(), std::mem::take(&mut self.offsets).into(), self.values.as_box(), std::mem::take(&mut self.validity).map(|x| x.into()), - )) + ) + .boxed() } fn as_arc(&mut self) -> std::sync::Arc { - std::sync::Arc::new(ListArray::new( + ListArray::new( self.data_type.clone(), std::mem::take(&mut self.offsets).into(), self.values.as_box(), std::mem::take(&mut self.validity).map(|x| x.into()), - )) + ) + .arced() } fn data_type(&self) -> &DataType { diff --git a/src/io/ipc/read/array/binary.rs b/src/io/ipc/read/array/binary.rs index eea120a5b2f..68c5b40d078 100644 --- a/src/io/ipc/read/array/binary.rs +++ b/src/io/ipc/read/array/binary.rs @@ -69,7 +69,7 @@ pub fn read_binary( scratch, )?; - BinaryArray::::try_new(data_type, offsets, values, validity) + BinaryArray::::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_binary( diff --git a/src/io/ipc/read/array/list.rs b/src/io/ipc/read/array/list.rs index 8824ed86fe1..1b45b10730d 100644 --- a/src/io/ipc/read/array/list.rs +++ b/src/io/ipc/read/array/list.rs @@ -85,7 +85,7 @@ where version, scratch, )?; - ListArray::try_new(data_type, offsets, values, validity) + ListArray::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_list( diff --git a/src/io/ipc/read/array/utf8.rs b/src/io/ipc/read/array/utf8.rs index 1ff056d6f8c..398184e3e55 100644 --- a/src/io/ipc/read/array/utf8.rs +++ b/src/io/ipc/read/array/utf8.rs @@ -70,7 +70,7 @@ pub fn read_utf8( scratch, )?; - Utf8Array::::try_new(data_type, offsets, values, validity) + Utf8Array::::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_utf8( diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index 7737cbca9cd..c19500d02b2 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -195,8 +195,8 @@ fn write_list( ); let first = *offsets.first().unwrap(); - let last = *offsets.last().unwrap(); - if first == O::default() { + let last = *offsets.last(); + if first == O::zero() { write_buffer( offsets, buffers, diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index 1d3997b1da1..73eac81d4f9 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -12,7 +12,7 @@ use crate::{ chunk::Chunk, datatypes::{DataType, Field, IntervalUnit, Schema}, error::Error, - offset::Offset, + offset::{Offset, Offsets}, types::{f16, NativeType}, }; @@ -227,24 +227,19 @@ fn deserialize_list<'a, O: Offset, A: Borrow>>( let child = ListArray::::get_child_type(&data_type); let mut validity = MutableBitmap::with_capacity(rows.len()); - let mut offsets = Vec::::with_capacity(rows.len() + 1); + let mut offsets = Offsets::::with_capacity(rows.len()); let mut inner = vec![]; - offsets.push(O::zero()); - rows.iter().fold(O::zero(), |mut length, row| { - match row.borrow() { - Value::Array(value) => { - inner.extend(value.iter()); - validity.push(true); - // todo make this an Err - length += O::from_usize(value.len()).expect("List offset is too large :/"); - offsets.push(length); - length - } - _ => { - validity.push(false); - offsets.push(length); - length - } + rows.iter().for_each(|row| match row.borrow() { + Value::Array(value) => { + inner.extend(value.iter()); + validity.push(true); + offsets + .try_push_usize(value.len()) + .expect("List offset is too large :/"); + } + _ => { + validity.push(false); + offsets.extend_constant(1); } }); @@ -259,39 +254,25 @@ fn deserialize_list_into<'a, O: Offset, A: Borrow>>( 
target: &mut MutableListArray>, rows: &[A], ) { - let start = { - let empty = vec![]; - let inner: Vec<_> = rows - .iter() - .flat_map(|row| match row.borrow() { - Value::Array(value) => value.iter(), - _ => empty.iter(), - }) - .collect(); - - let child = target.mut_values(); - let start_len = child.len(); - deserialize_into(child, &inner); + let empty = vec![]; + let inner: Vec<_> = rows + .iter() + .flat_map(|row| match row.borrow() { + Value::Array(value) => value.iter(), + _ => empty.iter(), + }) + .collect(); - // todo make this an Err - O::from_usize(start_len).expect("Child list size too large") - }; + deserialize_into(target.mut_values(), &inner); - let mut position = start; - let arrays = rows.iter().map(|row| { - match row.borrow() { - Value::Array(value) => { - // todo make this an Err - position += O::from_usize(value.len()).expect("List offset is too large :/"); - Some(position) - } - _ => None, - } + let lengths = rows.iter().map(|row| match row.borrow() { + Value::Array(value) => Some(value.len()), + _ => None, }); - // though this will always be safe, we cannot use unsafe_extend_offsets here - // due to `#![forbid(unsafe_code)]` on the io module - target.extend_offsets(arrays); + target + .try_extend_from_lengths(lengths) + .expect("Offsets overflow"); } fn deserialize_fixed_size_list_into<'a, A: Borrow>>( @@ -302,10 +283,7 @@ fn deserialize_fixed_size_list_into<'a, A: Borrow>>( match row.borrow() { Value::Array(value) => { if value.len() == target.size() { - { - let child = target.mut_values(); - deserialize_into(child, value); - } + deserialize_into(target.mut_values(), value); // unless alignment is already off, the if above should // prevent this from ever happening. target.try_push_valid().expect("unaligned backing array"); diff --git a/src/io/json_integration/read/array.rs b/src/io/json_integration/read/array.rs index 42fe220b96f..240add0ea95 100644 --- a/src/io/json_integration/read/array.rs +++ b/src/io/json_integration/read/array.rs @@ -190,7 +190,7 @@ fn to_binary(json_col: &ArrowJsonColumn, data_type: DataType) -> Box< .iter() .flat_map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap()) .collect(); - Box::new(BinaryArray::new(data_type, offsets, values, validity)) + BinaryArray::new(data_type, offsets.try_into().unwrap(), values, validity).boxed() } fn to_utf8(json_col: &ArrowJsonColumn, data_type: DataType) -> Box { @@ -203,7 +203,7 @@ fn to_utf8(json_col: &ArrowJsonColumn, data_type: DataType) -> Box( @@ -223,9 +223,7 @@ fn to_list( dictionaries, )?; let offsets = to_offsets::(json_col.offset.as_ref()); - Ok(Box::new(ListArray::::new( - data_type, offsets, values, validity, - ))) + Ok(ListArray::::new(data_type, offsets.try_into()?, values, validity).boxed()) } fn to_map( diff --git a/src/io/odbc/read/deserialize.rs b/src/io/odbc/read/deserialize.rs index b98596850c2..7ebf79b8b9a 100644 --- a/src/io/odbc/read/deserialize.rs +++ b/src/io/odbc/read/deserialize.rs @@ -6,6 +6,7 @@ use crate::array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}; use crate::bitmap::{Bitmap, MutableBitmap}; use crate::buffer::Buffer; use crate::datatypes::{DataType, TimeUnit}; +use crate::offset::{Offsets, OffsetsBuffer}; use crate::types::NativeType; use super::super::api::buffers::AnyColumnView; @@ -118,22 +119,23 @@ fn bool_optional(data_type: DataType, values: &[Bit], indicators: &[isize]) -> B fn binary_generic<'a>( iter: impl Iterator>, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let length = iter.size_hint().0; let 
mut validity = MutableBitmap::with_capacity(length); let mut values = Vec::::with_capacity(0); - let mut offsets = Vec::with_capacity(length + 1); - offsets.push(0i32); - + let mut offsets = Offsets::::with_capacity(length); for item in iter { if let Some(item) = item { values.extend_from_slice(item); + offsets + .try_push_usize(item.len()) + .expect("List to contain less than i32::MAX items."); validity.push(true); } else { + offsets.extend_constant(1); validity.push(false); } - offsets.push(values.len() as i32) } (offsets.into(), values.into(), validity.into()) diff --git a/src/io/orc/read/mod.rs b/src/io/orc/read/mod.rs index 4a365078236..3fe4abb7f63 100644 --- a/src/io/orc/read/mod.rs +++ b/src/io/orc/read/mod.rs @@ -5,7 +5,7 @@ use crate::array::{Array, BinaryArray, BooleanArray, Int64Array, PrimitiveArray, use crate::bitmap::{Bitmap, MutableBitmap}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::Error; -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; use crate::types::NativeType; use orc_format::proto::stream::Kind; @@ -250,23 +250,19 @@ where #[inline] fn try_extend, I: Iterator>( - offsets: &mut Vec, - length: &mut O, + offsets: &mut Offsets, iter: I, -) -> Result<(), orc_format::error::Error> { +) -> Result<(), Error> { for item in iter { - let item: O = item - .try_into() - .map_err(|_| orc_format::error::Error::OutOfSpec)?; - *length += item; - offsets.push(*length) + let length: O = item.try_into().map_err(|_| Error::Overflow)?; + offsets.try_push(length)? } Ok(()) } fn deserialize_binary_generic>( column: &Column, -) -> Result<(Vec, Vec, Option), Error> { +) -> Result<(Offsets, Vec, Option), Error> { let num_rows = column.number_of_rows(); let mut scratch = vec![]; @@ -274,9 +270,7 @@ fn deserialize_binary_generic>( let lengths = column.get_stream(Kind::Length, scratch)?; - let mut offsets = Vec::with_capacity(num_rows + 1); - let mut length = O::default(); - offsets.push(length); + let mut offsets = Offsets::with_capacity(num_rows); if let Some(validity) = &validity { let mut iter = decode::UnsignedRleV2Iter::new(lengths, validity.len() - validity.unset_bits(), vec![]); @@ -286,34 +280,35 @@ fn deserialize_binary_generic>( .next() .transpose()? 
.ok_or(orc_format::error::Error::OutOfSpec)?; - let item: O = item + let length: O = item .try_into() .map_err(|_| Error::ExternalFormat("value uncastable".to_string()))?; - length += item; + offsets.try_push(length)?; + } else { + offsets.extend_constant(1) } - offsets.push(length); } let (lengths, _) = iter.into_inner(); scratch = std::mem::take(&mut lengths.into_inner()); } else { let mut iter = decode::UnsignedRleV2RunIter::new(lengths, num_rows, vec![]); iter.try_for_each(|run| { - run.and_then(|run| match run { + run.map_err(Error::from).and_then(|run| match run { decode::UnsignedRleV2Run::Direct(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } decode::UnsignedRleV2Run::Delta(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } decode::UnsignedRleV2Run::ShortRepeat(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } }) })?; let (lengths, _) = iter.into_inner(); scratch = std::mem::take(&mut lengths.into_inner()); } - let length = length.to_usize(); + let length = offsets.last().to_usize(); let mut values = vec![0; length]; let mut data = column.get_stream(Kind::Data, scratch)?; diff --git a/src/io/parquet/read/deserialize/binary/basic.rs b/src/io/parquet/read/deserialize/binary/basic.rs index 0a2aa098c45..4cae2ddddfd 100644 --- a/src/io/parquet/read/deserialize/binary/basic.rs +++ b/src/io/parquet/read/deserialize/binary/basic.rs @@ -14,7 +14,7 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, OffsetsBuffer}, }; use super::super::utils::{ @@ -228,7 +228,7 @@ impl<'a> utils::PageState<'a> for State<'a> { pub trait TraitBinaryArray: Array + 'static { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result @@ -239,7 +239,7 @@ pub trait TraitBinaryArray: Array + 'static { impl TraitBinaryArray for BinaryArray { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -250,7 +250,7 @@ impl TraitBinaryArray for BinaryArray { impl TraitBinaryArray for Utf8Array { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -373,22 +373,18 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { let Binary { offsets, values: values_, - last_offset, } = values; - let offset = *last_offset; + let last_offset = *offsets.last(); extend_from_decoder( validity, page_validity, Some(additional), offsets, - page_values.lengths.by_ref().map(|x| { - *last_offset += O::from_usize(x).unwrap(); - *last_offset - }), + page_values.lengths.by_ref(), ); - let length = *last_offset - offset; + let length = last_offset - *offsets.last(); let (consumed, remaining) = page_values.values.split_at(length.to_usize()); page_values.values = remaining; @@ -486,7 +482,7 @@ pub(super) fn finish>( ) -> Result { A::try_new( data_type.clone(), - values.offsets.0.into(), + values.offsets.into(), values.values.into(), validity.into(), ) diff --git a/src/io/parquet/read/deserialize/binary/dictionary.rs b/src/io/parquet/read/deserialize/binary/dictionary.rs index 5cf3c07d97b..6f883528ef8 100644 --- a/src/io/parquet/read/deserialize/binary/dictionary.rs +++ b/src/io/parquet/read/deserialize/binary/dictionary.rs @@ -67,11 +67,10 @@ fn read_dict(data_type: DataType, dict: 
&DictPage) -> Box match data_type.to_physical_type() { PhysicalType::Utf8 | PhysicalType::LargeUtf8 => { - Utf8Array::::new(data_type, data.offsets.0.into(), data.values.into(), None).boxed() + Utf8Array::::new(data_type, data.offsets.into(), data.values.into(), None).boxed() } PhysicalType::Binary | PhysicalType::LargeBinary => { - BinaryArray::::new(data_type, data.offsets.0.into(), data.values.into(), None) - .boxed() + BinaryArray::::new(data_type, data.offsets.into(), data.values.into(), None).boxed() } _ => unreachable!(), } diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs index d886c1bfae6..ddf7abc2a06 100644 --- a/src/io/parquet/read/deserialize/binary/utils.rs +++ b/src/io/parquet/read/deserialize/binary/utils.rs @@ -1,4 +1,4 @@ -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; use super::super::utils::Pushable; @@ -7,70 +7,51 @@ use super::super::utils::Pushable; pub struct Binary { pub offsets: Offsets, pub values: Vec, - pub last_offset: O, } -#[derive(Debug)] -pub struct Offsets(pub Vec); - -impl Offsets { - #[inline] - pub fn extend_lengths>(&mut self, lengths: I) { - let mut last_offset = *self.0.last().unwrap(); - self.0.extend(lengths.map(|length| { - last_offset += O::from_usize(length).unwrap(); - last_offset - })); - } -} - -impl Pushable for Offsets { +impl Pushable for Offsets { fn reserve(&mut self, additional: usize) { - self.0.reserve(additional) + self.reserve(additional) } #[inline] fn len(&self) -> usize { - self.0.len() - 1 + self.len() } #[inline] - fn push(&mut self, value: O) { - self.0.push(value) + fn push(&mut self, value: usize) { + self.try_push_usize(value).unwrap() } #[inline] fn push_null(&mut self) { - self.0.push(*self.0.last().unwrap()) + self.extend_constant(1); } #[inline] - fn extend_constant(&mut self, additional: usize, value: O) { - self.0.extend_constant(additional, value) + fn extend_constant(&mut self, additional: usize, _: usize) { + self.extend_constant(additional) } } impl Binary { #[inline] pub fn with_capacity(capacity: usize) -> Self { - let mut offsets = Vec::with_capacity(1 + capacity); - offsets.push(O::default()); Self { - offsets: Offsets(offsets), + offsets: Offsets::with_capacity(capacity), values: Vec::with_capacity(capacity * 24), - last_offset: O::default(), } } #[inline] pub fn push(&mut self, v: &[u8]) { self.values.extend(v); - self.last_offset += O::from_usize(v.len()).unwrap(); - self.offsets.push(self.last_offset) + self.offsets.try_push_usize(v.len()).unwrap() } #[inline] pub fn extend_constant(&mut self, additional: usize) { - self.offsets.extend_constant(additional, self.last_offset); + self.offsets.extend_constant(additional); } #[inline] @@ -80,10 +61,10 @@ impl Binary { #[inline] pub fn extend_lengths>(&mut self, lengths: I, values: &mut &[u8]) { - let current_offset = self.last_offset; - self.offsets.extend_lengths(lengths); - self.last_offset = *self.offsets.0.last().unwrap(); // guaranteed to have one - let length = self.last_offset.to_usize() - current_offset.to_usize(); + let current_offset = *self.offsets.last(); + self.offsets.try_extend_from_lengths(lengths).unwrap(); + let new_offset = *self.offsets.last(); + let length = new_offset.to_usize() - current_offset.to_usize(); let (consumed, remaining) = values.split_at(length); *values = remaining; self.values.extend_from_slice(consumed); @@ -93,7 +74,7 @@ impl Binary { impl<'a, O: Offset> Pushable<&'a [u8]> for Binary { #[inline] fn reserve(&mut self, additional: usize) { - let 
avg_len = self.values.len() / std::cmp::max(self.last_offset.to_usize(), 1); + let avg_len = self.values.len() / std::cmp::max(self.offsets.last().to_usize(), 1); self.values.reserve(additional * avg_len); self.offsets.reserve(additional); } diff --git a/src/io/parquet/read/deserialize/mod.rs b/src/io/parquet/read/deserialize/mod.rs index b16d1a6e83d..5c606a8f5e7 100644 --- a/src/io/parquet/read/deserialize/mod.rs +++ b/src/io/parquet/read/deserialize/mod.rs @@ -18,6 +18,7 @@ use crate::{ array::{Array, DictionaryKey, FixedSizeListArray, ListArray}, datatypes::{DataType, Field, IntervalUnit}, error::Result, + offset::Offsets, }; use self::nested_utils::{InitNested, NestedArrayIter, NestedState}; @@ -50,9 +51,14 @@ fn create_list( let (mut offsets, validity) = nested.nested.pop().unwrap().inner(); match data_type.to_logical_type() { DataType::List(_) => { - offsets.push(values.len() as i64); + offsets + .try_push(values.len() as i64 - *offsets.last()) + .expect("Lenghts too large"); + + let offsets: Offsets = offsets + .try_into() + .expect("i64 offsets do not fit in i32 offsets"); - let offsets = offsets.iter().map(|x| *x as i32).collect::>(); Box::new(ListArray::::new( data_type, offsets.into(), @@ -61,7 +67,9 @@ fn create_list( )) } DataType::LargeList(_) => { - offsets.push(values.len() as i64); + offsets + .try_push(values.len() as i64 - *offsets.last()) + .expect("Lenghts too large"); Box::new(ListArray::::new( data_type, diff --git a/src/io/parquet/read/deserialize/nested_utils.rs b/src/io/parquet/read/deserialize/nested_utils.rs index f6f1a8baaa3..400da7c7bf7 100644 --- a/src/io/parquet/read/deserialize/nested_utils.rs +++ b/src/io/parquet/read/deserialize/nested_utils.rs @@ -6,7 +6,7 @@ use parquet2::{ read::levels::get_bit_width, }; -use crate::{array::Array, bitmap::MutableBitmap, error::Result}; +use crate::{array::Array, bitmap::MutableBitmap, error::Result, offset::Offsets, types::Index}; pub use super::utils::Zip; use super::utils::{DecodedState, MaybeNext}; @@ -14,7 +14,7 @@ use super::{super::Pages, utils::PageState}; /// trait describing deserialized repetition and definition levels pub trait Nested: std::fmt::Debug + Send + Sync { - fn inner(&mut self) -> (Vec, Option); + fn inner(&mut self) -> (Offsets, Option); fn push(&mut self, length: i64, is_valid: bool); @@ -50,7 +50,7 @@ impl NestedPrimitive { } impl Nested for NestedPrimitive { - fn inner(&mut self) -> (Vec, Option) { + fn inner(&mut self) -> (Offsets, Option) { (Default::default(), Default::default()) } @@ -78,11 +78,11 @@ impl Nested for NestedPrimitive { #[derive(Debug, Default)] pub struct NestedOptional { pub validity: MutableBitmap, - pub offsets: Vec, + pub offsets: Offsets, } impl Nested for NestedOptional { - fn inner(&mut self) -> (Vec, Option) { + fn inner(&mut self) -> (Offsets, Option) { let offsets = std::mem::take(&mut self.offsets); let validity = std::mem::take(&mut self.validity); (offsets, Some(validity)) @@ -102,7 +102,7 @@ impl Nested for NestedOptional { } fn push(&mut self, value: i64, is_valid: bool) { - self.offsets.push(value); + self.offsets.try_push(value).unwrap(); self.validity.push(is_valid); } @@ -111,13 +111,13 @@ impl Nested for NestedOptional { } fn num_values(&self) -> usize { - self.offsets.last().copied().unwrap_or(0) as usize + (*self.offsets.last()).to_usize() } } impl NestedOptional { pub fn with_capacity(capacity: usize) -> Self { - let offsets = Vec::::with_capacity(capacity + 1); + let offsets = Offsets::::with_capacity(capacity); let validity = 
MutableBitmap::with_capacity(capacity); Self { validity, offsets } } @@ -125,11 +125,11 @@ impl NestedOptional { #[derive(Debug, Default)] pub struct NestedValid { - pub offsets: Vec, + pub offsets: Offsets, } impl Nested for NestedValid { - fn inner(&mut self) -> (Vec, Option) { + fn inner(&mut self) -> (Offsets, Option) { let offsets = std::mem::take(&mut self.offsets); (offsets, None) } @@ -148,7 +148,7 @@ impl Nested for NestedValid { } fn push(&mut self, value: i64, _is_valid: bool) { - self.offsets.push(value); + self.offsets.try_push(value).unwrap(); } fn len(&self) -> usize { @@ -156,13 +156,13 @@ impl Nested for NestedValid { } fn num_values(&self) -> usize { - self.offsets.last().copied().unwrap_or(0) as usize + self.offsets.last().to_usize() } } impl NestedValid { pub fn with_capacity(capacity: usize) -> Self { - let offsets = Vec::::with_capacity(capacity + 1); + let offsets = Offsets::::with_capacity(capacity); Self { offsets } } } @@ -179,7 +179,7 @@ impl NestedStructValid { } impl Nested for NestedStructValid { - fn inner(&mut self) -> (Vec, Option) { + fn inner(&mut self) -> (Offsets, Option) { (Default::default(), None) } @@ -218,7 +218,7 @@ impl NestedStruct { } impl Nested for NestedStruct { - fn inner(&mut self) -> (Vec, Option) { + fn inner(&mut self) -> (Offsets, Option) { (Default::default(), Some(std::mem::take(&mut self.validity))) } diff --git a/src/io/parquet/read/statistics/list.rs b/src/io/parquet/read/statistics/list.rs index 9d0adbcb0cc..047b5e07700 100644 --- a/src/io/parquet/read/statistics/list.rs +++ b/src/io/parquet/read/statistics/list.rs @@ -1,6 +1,7 @@ use crate::array::*; use crate::datatypes::DataType; use crate::error::Result; +use crate::offset::Offsets; use super::make_mutable; @@ -40,19 +41,21 @@ impl MutableArray for DynMutableListArray { match self.data_type.to_logical_type() { DataType::List(_) => { - let offsets = (0..=inner.len() as i32).collect::>().into(); + let offsets = + Offsets::try_from_lengths(std::iter::repeat(1).take(inner.len())).unwrap(); Box::new(ListArray::::new( self.data_type.clone(), - offsets, + offsets.into(), inner, None, )) } DataType::LargeList(_) => { - let offsets = (0..=inner.len() as i64).collect::>().into(); + let offsets = + Offsets::try_from_lengths(std::iter::repeat(1).take(inner.len())).unwrap(); Box::new(ListArray::::new( self.data_type.clone(), - offsets, + offsets.into(), inner, None, )) diff --git a/src/io/parquet/write/pages.rs b/src/io/parquet/write/pages.rs index 2259647d79c..0b7ac9e5531 100644 --- a/src/io/parquet/write/pages.rs +++ b/src/io/parquet/write/pages.rs @@ -418,7 +418,7 @@ mod tests { let array = ListArray::new( DataType::List(Box::new(Field::new("l", array.data_type().clone(), true))), - vec![0i32, 2, 4].into(), + vec![0i32, 2, 4].try_into().unwrap(), Box::new(array), None, ); diff --git a/src/offset.rs b/src/offset.rs index edca7dc8b38..e093b4ebe3a 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -1,2 +1,419 @@ //! Contains the declaration of [`Offset`] +use std::hint::unreachable_unchecked; + +use crate::buffer::Buffer; +use crate::error::Error; pub use crate::types::Offset; + +/// A wrapper type of [`Vec`] representing the invariants of Arrow's offsets. +/// It is guaranteed to (sound to assume that): +/// * every element is `>= 0` +/// * element at position `i` is >= than element at position `i-1`. 
+#[derive(Debug, Clone)] +pub struct Offsets(Vec); + +impl Default for Offsets { + #[inline] + fn default() -> Self { + Self::new() + } +} + +impl std::ops::Deref for Offsets { + type Target = [O]; + + #[inline] + fn deref(&self) -> &[O] { + &self.0 + } +} + +impl TryFrom> for Offsets { + type Error = Error; + + #[inline] + fn try_from(offsets: Vec) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets)) + } +} + +impl TryFrom> for OffsetsBuffer { + type Error = Error; + + #[inline] + fn try_from(offsets: Buffer) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets)) + } +} + +impl TryFrom> for OffsetsBuffer { + type Error = Error; + + #[inline] + fn try_from(offsets: Vec) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets.into())) + } +} + +impl From> for OffsetsBuffer { + #[inline] + fn from(offsets: Offsets) -> Self { + Self(offsets.0.into()) + } +} + +impl Offsets { + /// Returns an empty [`Offsets`] (i.e. with a single element, the zero) + #[inline] + pub fn new() -> Self { + Self(vec![O::zero()]) + } + + /// Creates a new [`Offsets`] from an iterator of lengths + #[inline] + pub fn try_from_iter>(iter: I) -> Result { + let iterator = iter.into_iter(); + let (lower, _) = iterator.size_hint(); + let mut offsets = Self::with_capacity(lower); + for item in iterator { + offsets.try_push_usize(item)? + } + Ok(offsets) + } + + /// Returns a new [`Offsets`] with a capacity, allocating at least `capacity + 1` entries. + pub fn with_capacity(capacity: usize) -> Self { + let mut offsets = Vec::with_capacity(capacity + 1); + offsets.push(O::zero()); + Self(offsets) + } + + /// Returns the capacity of [`Offsets`]. + pub fn capacity(&self) -> usize { + self.0.capacity() - 1 + } + + /// Reserves `additional` entries. + pub fn reserve(&mut self, additional: usize) { + self.0.reserve(additional); + } + + /// Shrinks the capacity of self to fit. + pub fn shrink_to_fit(&mut self) { + self.0.shrink_to_fit(); + } + + /// Pushes a new element with a given length. + /// # Error + /// This function errors iff the new last item is larger than what `O` supports. + /// # Implementation + /// This function: + /// * checks that this length does not overflow + #[inline] + pub fn try_push(&mut self, length: O) -> Result<(), Error> { + let old_length = self.last(); + let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + self.0.push(new_length); + Ok(()) + } + + /// Pushes a new element with a given length. + /// # Error + /// This function errors iff the new last item is larger than what `O` supports. + /// # Implementation + /// This function: + /// * checks that this length does not overflow + #[inline] + pub fn try_push_usize(&mut self, length: usize) -> Result<(), Error> { + let length = O::from_usize(length).ok_or(Error::Overflow)?; + self.try_push(length) + } + + /// Returns [`Offsets`] assuming that `offsets` fulfills its invariants + /// # Safety + /// This is safe iff the invariants of this struct are guaranteed in `offsets`. + #[inline] + pub unsafe fn new_unchecked(offsets: Vec) -> Self { + Self(offsets) + } + + /// Returns the last offset of this container. 
+ #[inline] + pub fn last(&self) -> &O { + match self.0.last() { + Some(element) => element, + None => unsafe { unreachable_unchecked() }, + } + } + + /// Returns the length of this container + #[inline] + pub fn len(&self) -> usize { + self.0.len() - 1 + } + + /// Pops the last element + #[inline] + pub fn pop(&mut self) -> Option { + if self.len() == 0 { + None + } else { + self.0.pop() + } + } + + /// Extends itself with `additional` elements equal to the last offset. + /// This is useful to extend offsets with empty values, e.g. for null slots. + #[inline] + pub fn extend_constant(&mut self, additional: usize) { + let offset = *self.last(); + if additional == 1 { + self.0.push(offset) + } else { + self.0.resize(self.len() + additional, offset) + } + } + + /// Try to create a new [`Offsets`] from a sequence of `lengths` + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + #[inline] + pub fn try_from_lengths>(lengths: I) -> Result { + let mut self_ = Self::with_capacity(lengths.size_hint().0); + self_.try_extend_from_lengths(lengths)?; + Ok(self_) + } + + /// Try extend from an iterator of lengths + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + #[inline] + pub fn try_extend_from_lengths>( + &mut self, + lengths: I, + ) -> Result<(), Error> { + let mut total_length = 0; + let mut offset = *self.last(); + let original_offset = offset.to_usize(); + + let lengths = lengths.map(|length| { + total_length += length; + O::from_as_usize(length) + }); + + let offsets = lengths.map(|length| { + offset += length; // this may overflow, checked below + offset + }); + self.0.extend(offsets); + + let last_offset = original_offset + .checked_add(total_length) + .ok_or(Error::Overflow)?; + O::from_usize(last_offset).ok_or(Error::Overflow)?; + Ok(()) + } + + /// Extends itself from another [`Offsets`] + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + pub fn try_extend_from_self(&mut self, other: &Self) -> Result<(), Error> { + let mut length = *self.last(); + let other_length = *other.last(); + // check if the operation would overflow + length.checked_add(&other_length).ok_or(Error::Overflow)?; + + let lengths = other.windows(2).map(|w| w[1] - w[0]); + let offsets = lengths.map(|new_length| { + length += new_length; + length + }); + self.0.extend(offsets); + Ok(()) + } + + /// Extends itself from another [`Offsets`] sliced by `start, length` + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + pub fn try_extend_from_slice( + &mut self, + other: &OffsetsBuffer, + start: usize, + length: usize, + ) -> Result<(), Error> { + if length == 0 { + return Ok(()); + } + let other = &other.0[start..start + length + 1]; + let other_length = other.last().expect("Length to be non-zero"); + let mut length = *self.last(); + // check if the operation would overflow + length.checked_add(other_length).ok_or(Error::Overflow)?; + + let lengths = other.windows(2).map(|w| w[1] - w[0]); + let offsets = lengths.map(|new_length| { + length += new_length; + length + }); + self.0.extend(offsets); + Ok(()) + } + + /// Returns the inner [`Vec`]. + #[inline] + pub fn into_inner(self) -> Vec { + self.0 + } +} + +/// Checks that `offsets` is monotonically increasing. +fn try_check_offsets(offsets: &[O]) -> Result<(), Error> { + // this code is carefully constructed to auto-vectorize, don't change naively! 
+ match offsets.first() { + None => Err(Error::oos("offsets must have at least one element")), + Some(first) => { + if *first < O::zero() { + return Err(Error::oos("offsets must be larger than 0")); + } + let mut previous = *first; + let mut any_invalid = false; + + // This loop will auto-vectorize because there is not any break, + // an invalid value will be returned once the whole offsets buffer is processed. + for offset in offsets { + if previous > *offset { + any_invalid = true + } + previous = *offset; + } + + if any_invalid { + Err(Error::oos("offsets must be monotonically increasing")) + } else { + Ok(()) + } + } + } +} + +/// A wrapper type of [`Buffer`] that is guaranteed to: +/// * Always contain an element +/// * element at position `i` is >= than element at position `i-1`. +#[derive(Clone)] +pub struct OffsetsBuffer(Buffer); + +impl Default for OffsetsBuffer { + #[inline] + fn default() -> Self { + Self(vec![O::zero()].into()) + } +} + +impl OffsetsBuffer { + /// # Safety + /// This is safe iff the invariants of this struct are guaranteed in `offsets`. + #[inline] + pub unsafe fn new_unchecked(offsets: Buffer) -> Self { + Self(offsets) + } + + /// Returns an empty [`OffsetsBuffer`] (i.e. with a single element, the zero) + #[inline] + pub fn new() -> Self { + Self(vec![O::zero()].into()) + } + + /// Copy-on-write API to convert [`OffsetsBuffer`] into [`Offsets`]. + #[inline] + pub fn get_mut(&mut self) -> Option> { + self.0 + .get_mut() + .map(|x| { + let mut new = vec![O::zero()]; + std::mem::swap(x, &mut new); + new + }) + // Safety: Offsets and OffsetsBuffer share invariants + .map(|offsets| unsafe { Offsets::new_unchecked(offsets) }) + } + + /// Returns a reference to its internal [`Buffer`]. + #[inline] + pub fn buffer(&self) -> &Buffer { + &self.0 + } + + /// Returns the last offset of this container, which is guaranteed to exist. + #[inline] + pub fn last(&self) -> &O { + match self.0.last() { + Some(element) => element, + None => unsafe { unreachable_unchecked() }, + } + } + + /// Returns a new [`OffsetsBuffer`] that is a slice of this buffer starting at `offset`. + /// Doing so allows the same memory region to be shared between buffers. + /// # Safety + /// The caller must ensure `offset + length <= self.len()` + #[inline] + pub unsafe fn slice_unchecked(self, offset: usize, length: usize) -> Self { + Self(self.0.slice_unchecked(offset, length)) + } + + /// Returns the inner [`Buffer`]. 
+    #[inline]
+    pub fn into_inner(self) -> Buffer<O> {
+        self.0
+    }
+}
+
+impl<O: Offset> std::ops::Deref for OffsetsBuffer<O> {
+    type Target = [O];
+
+    #[inline]
+    fn deref(&self) -> &[O] {
+        self.0.as_slice()
+    }
+}
+
+impl From<&OffsetsBuffer<i32>> for OffsetsBuffer<i64> {
+    fn from(offsets: &OffsetsBuffer<i32>) -> Self {
+        // this conversion is lossless and upholds all invariants
+        Self(offsets.iter().map(|x| *x as i64).collect::<Vec<_>>().into())
+    }
+}
+
+impl TryFrom<&OffsetsBuffer<i64>> for OffsetsBuffer<i32> {
+    type Error = Error;
+
+    fn try_from(offsets: &OffsetsBuffer<i64>) -> Result<Self, Self::Error> {
+        i32::try_from(*offsets.last()).map_err(|_| Error::Overflow)?;
+
+        // this conversion is lossless and upholds all invariants
+        Ok(Self(
+            offsets.iter().map(|x| *x as i32).collect::<Vec<_>>().into(),
+        ))
+    }
+}
+
+impl From<Offsets<i32>> for Offsets<i64> {
+    fn from(offsets: Offsets<i32>) -> Self {
+        // this conversion is lossless and upholds all invariants
+        Self(offsets.iter().map(|x| *x as i64).collect::<Vec<_>>())
+    }
+}
+
+impl TryFrom<Offsets<i64>> for Offsets<i32> {
+    type Error = Error;
+
+    fn try_from(offsets: Offsets<i64>) -> Result<Self, Self::Error> {
+        i32::try_from(*offsets.last()).map_err(|_| Error::Overflow)?;
+
+        // this conversion is lossless and upholds all invariants
+        Ok(Self(offsets.iter().map(|x| *x as i32).collect::<Vec<_>>()))
+    }
+}
diff --git a/src/types/index.rs b/src/types/index.rs
index b44b3957e79..264720fbe7a 100644
--- a/src/types/index.rs
+++ b/src/types/index.rs
@@ -21,6 +21,9 @@ pub trait Index:
     /// Convert itself from [`usize`].
     fn from_usize(index: usize) -> Option<Self>;
 
+    /// Convert itself from [`usize`] via an `as` cast (may truncate).
+    fn from_as_usize(index: usize) -> Self;
+
     /// An iterator from (inclusive) `start` to (exclusive) `end`.
     fn range(start: usize, end: usize) -> Option<IndexRange<Self>> {
         let start = Self::from_usize(start);
@@ -44,6 +47,11 @@ macro_rules! index {
             fn from_usize(value: usize) -> Option<Self> {
                 Self::try_from(value).ok()
             }
+
+            #[inline]
+            fn from_as_usize(value: usize) -> Self {
+                value as $t
+            }
         }
     };
 }
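// ----------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how the growable `Offsets` and
// the frozen `OffsetsBuffer` added in src/offset.rs are meant to be used by
// the kernels rewritten above. Offsets grow by *lengths*, with overflow checked
// on every push. Crate name `arrow2` and a public `offset` module are assumed
// from the file layout; the i32 <-> i64 conversions follow the impls above.

use arrow2::error::Result;
use arrow2::offset::{Offsets, OffsetsBuffer};

fn offsets_roundtrip() -> Result<()> {
    // starts as `[0]`; each push appends `last + length`
    let mut offsets = Offsets::<i32>::with_capacity(3);
    offsets.try_push_usize(5)?; // slot 0 covers 5 bytes
    offsets.extend_constant(1); // slot 1 is empty/null: repeats the last offset
    offsets.try_push_usize(3)?; // slot 2 covers 3 bytes
    assert_eq!(&offsets[..], &[0, 5, 5, 8]);
    assert_eq!(offsets.len(), 3); // number of slots, not of stored offsets

    // the same offsets, built in one go from an iterator of lengths
    let same = Offsets::<i32>::try_from_lengths([5usize, 0, 3].into_iter())?;
    assert_eq!(&same[..], &offsets[..]);

    // freeze into the immutable, shareable form consumed by the arrays
    let frozen: OffsetsBuffer<i32> = offsets.into();
    assert_eq!(*frozen.last(), 8);

    // widening i32 -> i64 is infallible; narrowing checks the last offset
    let wide: OffsetsBuffer<i64> = (&frozen).into();
    let _narrow: OffsetsBuffer<i32> = (&wide).try_into()?;
    Ok(())
}
// ----------------------------------------------------------------------------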