diff --git a/Cargo.toml b/Cargo.toml index a8e5933d2fe..1650387caf1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,10 @@ num-traits = "0.2" dyn-clone = "1" bytemuck = { version = "1", features = ["derive"] } chrono = { version = "0.4.31", default_features = false, features = ["std"] } +atoi_simd = "0.15.5" +itoa = "1.0.6" +ryu = "1.0.13" +fast-float = { version = "0.2" } # for decimal i256 ethnum = "1" @@ -57,7 +61,7 @@ indexmap = { version = "^1.6", optional = true } # used to print columns in a nice columnar format comfy-table = { version = "6.0", optional = true, default-features = false } -arrow-format = { version = "0.8", optional = true, features = ["ipc"] } +arrow-format = { package = "polars-arrow-format", version = "0.1.0", optional = true, features = ["ipc"] } hex = { version = "^0.4", optional = true } @@ -184,7 +188,7 @@ io_json_write = ["streaming-iterator", "fallible-streaming-iterator", "lexical-c io_ipc = ["arrow-format"] io_ipc_write_async = ["io_ipc", "futures"] io_ipc_read_async = ["io_ipc", "futures", "async-stream"] -io_ipc_compression = ["lz4", "zstd"] +io_ipc_compression = ["lz4", "zstd", "io_ipc"] io_flight = ["io_ipc", "arrow-format/flight-data"] # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format. 
diff --git a/examples/ipc_file_mmap.rs b/examples/ipc_file_mmap.rs index e51b49de5be..166f752099e 100644 --- a/examples/ipc_file_mmap.rs +++ b/examples/ipc_file_mmap.rs @@ -29,7 +29,7 @@ fn write( let options = arrow2::io::ipc::write::WriteOptions { compression }; let mut writer = arrow2::io::ipc::write::FileWriter::try_new( result, - schema.clone(), + schema.clone().into(), ipc_fields.clone(), options, )?; diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 6f971c4226f..598aa87074d 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -13,8 +13,8 @@ unsafe impl ToFfi for BinaryArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.buffer().as_ptr().cast::()), - Some(self.values.as_ptr().cast::()), + Some(self.offsets.buffer().storage_ptr().cast::()), + Some(self.values.storage_ptr().cast::()), ] } @@ -62,6 +62,6 @@ impl FromFfi for BinaryArray { // assumption that data from FFI is well constructed let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; - Ok(Self::new(data_type, offsets, values, validity)) + Self::try_new(data_type, offsets, values, validity) } } diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs new file mode 100644 index 00000000000..a03b5a28e7e --- /dev/null +++ b/src/array/binview/ffi.rs @@ -0,0 +1,101 @@ +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; + +use super::BinaryViewArrayGeneric; + +use crate::error::Result; +use crate::array::binview::{View, ViewType}; +use crate::array::{FromFfi, ToFfi}; +use crate::bitmap::align; +use crate::ffi; + +unsafe impl ToFfi for BinaryViewArrayGeneric { + fn buffers(&self) -> Vec> { + let mut buffers = Vec::with_capacity(self.buffers.len() + 2); + buffers.push(self.validity.as_ref().map(|x| x.as_ptr())); + buffers.push(Some(self.views.storage_ptr().cast::())); + buffers.extend(self.buffers.iter().map(|b| Some(b.storage_ptr()))); + buffers + } + + fn offset(&self) -> Option { + let 
offset = self.views.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.views.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + views: self.views.clone(), + buffers: self.buffers.clone(), + raw_buffers: self.raw_buffers.clone(), + phantom: Default::default(), + total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)), + total_buffer_len: self.total_buffer_len, + } + } +} + +impl FromFfi for BinaryViewArrayGeneric { + unsafe fn try_from_ffi(array: A) -> Result { + let data_type = array.data_type().clone(); + + let validity = unsafe { array.validity() }?; + let views = unsafe { array.buffer::(1) }?; + + // 2 - validity + views + let n_buffers = array.n_buffers(); + let mut remaining_buffers = n_buffers - 2; + if remaining_buffers <= 1 { + return Ok(Self::new_unchecked_unknown_md( + data_type, + views, + Arc::from([]), + validity, + None, + )); + } + + let n_variadic_buffers = remaining_buffers - 1; + let variadic_buffer_offset = n_buffers - 1; + + let variadic_buffer_sizes = + array.buffer_known_len::(variadic_buffer_offset, n_variadic_buffers)?; + remaining_buffers -= 1; + + let mut variadic_buffers = Vec::with_capacity(remaining_buffers); + + let offset = 2; + for (i, &size) in (offset..remaining_buffers + offset).zip(variadic_buffer_sizes.iter()) { + let values = unsafe { array.buffer_known_len::(i, size as usize) }?; + variadic_buffers.push(values); + } + + Ok(Self::new_unchecked_unknown_md( + data_type, + views, + Arc::from(variadic_buffers), + validity, + None, + )) + } +} diff --git a/src/array/binview/fmt.rs b/src/array/binview/fmt.rs new file mode 100644 index 00000000000..53a0f71dd4b --- /dev/null 
+++ b/src/array/binview/fmt.rs @@ -0,0 +1,36 @@ +use std::fmt::{Debug, Formatter, Result, Write}; + +use super::super::fmt::write_vec; +use super::BinaryViewArrayGeneric; +use crate::array::binview::ViewType; +use crate::array::{Array, BinaryViewArray, Utf8ViewArray}; + +pub fn write_value<'a, T: ViewType + ?Sized, W: Write>( + array: &'a BinaryViewArrayGeneric, + index: usize, + f: &mut W, +) -> Result +where + &'a T: Debug, +{ + let bytes = array.value(index).to_bytes(); + let writer = |f: &mut W, index| write!(f, "{}", bytes[index]); + + write_vec(f, writer, None, bytes.len(), "None", false) +} + +impl Debug for BinaryViewArray { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let writer = |f: &mut Formatter, index| write_value(self, index, f); + write!(f, "BinaryViewArray")?; + write_vec(f, writer, self.validity(), self.len(), "None", false) + } +} + +impl Debug for Utf8ViewArray { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let writer = |f: &mut Formatter, index| write!(f, "{}", self.value(index)); + write!(f, "Utf8ViewArray")?; + write_vec(f, writer, self.validity(), self.len(), "None", false) + } +} diff --git a/src/array/binview/iterator.rs b/src/array/binview/iterator.rs new file mode 100644 index 00000000000..26587d5c1b7 --- /dev/null +++ b/src/array/binview/iterator.rs @@ -0,0 +1,47 @@ +use super::BinaryViewArrayGeneric; +use crate::array::binview::ViewType; +use crate::array::{ArrayAccessor, ArrayValuesIter, MutableBinaryViewArray}; +use crate::bitmap::utils::{BitmapIter, ZipValidity}; + +unsafe impl<'a, T: ViewType + ?Sized> ArrayAccessor<'a> for BinaryViewArrayGeneric { + type Item = &'a T; + + #[inline] + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) + } + + #[inline] + fn len(&self) -> usize { + self.views.len() + } +} + +/// Iterator of values of an [`BinaryArray`]. 
+pub type BinaryViewValueIter<'a, T> = ArrayValuesIter<'a, BinaryViewArrayGeneric>; + +impl<'a, T: ViewType + ?Sized> IntoIterator for &'a BinaryViewArrayGeneric { + type Item = Option<&'a T>; + type IntoIter = ZipValidity<&'a T, BinaryViewValueIter<'a, T>, BitmapIter<'a>>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +unsafe impl<'a, T: ViewType + ?Sized> ArrayAccessor<'a> for MutableBinaryViewArray { + type Item = &'a T; + + #[inline] + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) + } + + #[inline] + fn len(&self) -> usize { + self.views().len() + } +} + +/// Iterator of values of an [`MutableBinaryViewArray`]. +pub type MutableBinaryViewValueIter<'a, T> = ArrayValuesIter<'a, MutableBinaryViewArray>; diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs new file mode 100644 index 00000000000..45adb1e4c2d --- /dev/null +++ b/src/array/binview/mod.rs @@ -0,0 +1,513 @@ +//! See thread: https://lists.apache.org/thread/w88tpz76ox8h3rxkjl4so6rg3f1rv7wt +mod ffi; +pub(super) mod fmt; +mod iterator; +mod mutable; +mod view; + +use std::any::Any; +use std::fmt::Debug; +use std::marker::PhantomData; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; + +use crate::array::Array; +use crate::bitmap::Bitmap; +use crate::buffer::Buffer; +use crate::datatypes::DataType; + +mod private { + pub trait Sealed: Send + Sync {} + + impl Sealed for str {} + impl Sealed for [u8] {} +} +pub use iterator::BinaryViewValueIter; +pub use mutable::MutableBinaryViewArray; +use private::Sealed; + +use crate::array::binview::view::{ + validate_binary_view, validate_utf8_only, validate_utf8_view, +}; +use crate::bitmap::utils::{BitmapIter, ZipValidity}; +use crate::error::{Error, Result}; + +pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>; +pub type Utf8ViewArray = BinaryViewArrayGeneric; +pub use view::View; + +pub type MutablePlString = MutableBinaryViewArray; +pub type MutablePlBinary = 
MutableBinaryViewArray<[u8]>; + +static BIN_VIEW_TYPE: DataType = DataType::BinaryView; +static UTF8_VIEW_TYPE: DataType = DataType::Utf8View; + +pub trait ViewType: Sealed + 'static + PartialEq + AsRef { + const IS_UTF8: bool; + const DATA_TYPE: DataType; + type Owned: Debug + Clone + Sync + Send + AsRef; + + /// # Safety + /// The caller must ensure `index < self.len()`. + unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self; + + fn to_bytes(&self) -> &[u8]; + + #[allow(clippy::wrong_self_convention)] + fn into_owned(&self) -> Self::Owned; + + fn dtype() -> &'static DataType; +} + +impl ViewType for str { + const IS_UTF8: bool = true; + const DATA_TYPE: DataType = DataType::Utf8View; + type Owned = String; + + #[inline(always)] + unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self { + std::str::from_utf8_unchecked(slice) + } + + #[inline(always)] + fn to_bytes(&self) -> &[u8] { + self.as_bytes() + } + + fn into_owned(&self) -> Self::Owned { + self.to_string() + } + fn dtype() -> &'static DataType { + &UTF8_VIEW_TYPE + } +} + +impl ViewType for [u8] { + const IS_UTF8: bool = false; + const DATA_TYPE: DataType = DataType::BinaryView; + type Owned = Vec; + + #[inline(always)] + unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self { + slice + } + + #[inline(always)] + fn to_bytes(&self) -> &[u8] { + self + } + + fn into_owned(&self) -> Self::Owned { + self.to_vec() + } + + fn dtype() -> &'static DataType { + &BIN_VIEW_TYPE + } +} + +pub struct BinaryViewArrayGeneric { + data_type: DataType, + views: Buffer, + buffers: Arc<[Buffer]>, + // Raw buffer access. (pointer, len). + raw_buffers: Arc<[(*const u8, usize)]>, + validity: Option, + phantom: PhantomData, + /// Total bytes length if we would concatenate them all. 
+ total_bytes_len: AtomicU64, + /// Total bytes in the buffer (excluding remaining capacity) + total_buffer_len: usize, +} + +impl PartialEq for BinaryViewArrayGeneric { + fn eq(&self, other: &Self) -> bool { + self.into_iter().zip(other).all(|(l, r)| l == r) + } +} + +impl Clone for BinaryViewArrayGeneric { + fn clone(&self) -> Self { + Self { + data_type: self.data_type.clone(), + views: self.views.clone(), + buffers: self.buffers.clone(), + raw_buffers: self.raw_buffers.clone(), + validity: self.validity.clone(), + phantom: Default::default(), + total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)), + total_buffer_len: self.total_buffer_len, + } + } +} + +unsafe impl Send for BinaryViewArrayGeneric {} +unsafe impl Sync for BinaryViewArrayGeneric {} + +fn buffers_into_raw(buffers: &[Buffer]) -> Arc<[(*const T, usize)]> { + buffers + .iter() + .map(|buf| (buf.storage_ptr(), buf.len())) + .collect() +} +const UNKNOWN_LEN: u64 = u64::MAX; + +impl BinaryViewArrayGeneric { + /// # Safety + /// The caller must ensure + /// - the data is valid utf8 (if required) + /// - The offsets match the buffers. + pub unsafe fn new_unchecked( + data_type: DataType, + views: Buffer, + buffers: Arc<[Buffer]>, + validity: Option, + total_bytes_len: usize, + total_buffer_len: usize, + ) -> Self { + let raw_buffers = buffers_into_raw(&buffers); + Self { + data_type, + views, + buffers, + raw_buffers, + validity, + phantom: Default::default(), + total_bytes_len: AtomicU64::new(total_bytes_len as u64), + total_buffer_len, + } + } + + /// Create a new BinaryViewArray but initialize a statistics compute. 
+ /// # Safety + /// The caller must ensure the invariants + pub unsafe fn new_unchecked_unknown_md( + data_type: DataType, + views: Buffer, + buffers: Arc<[Buffer]>, + validity: Option, + total_buffer_len: Option, + ) -> Self { + let total_bytes_len = UNKNOWN_LEN as usize; + let total_buffer_len = + total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum()); + Self::new_unchecked( + data_type, + views, + buffers, + validity, + total_bytes_len, + total_buffer_len, + ) + } + + pub fn data_buffers(&self) -> &Arc<[Buffer]> { + &self.buffers + } + + pub fn variadic_buffer_lengths(&self) -> Vec { + self.buffers.iter().map(|buf| buf.len() as i64).collect() + } + + pub fn views(&self) -> &Buffer { + &self.views + } + + pub fn try_new( + data_type: DataType, + views: Buffer, + buffers: Arc<[Buffer]>, + validity: Option, + ) -> Result { + if T::IS_UTF8 { + validate_utf8_view(views.as_ref(), buffers.as_ref())?; + } else { + validate_binary_view(views.as_ref(), buffers.as_ref())?; + } + + + if validity + .as_ref() + .map_or(false, |validity| validity.len() != views.len()) + { + return Err(Error::oos( + "validity mask length must match the number of values", + )); + } + + unsafe { + Ok(Self::new_unchecked_unknown_md( + data_type, views, buffers, validity, None, + )) + } + } + + /// Creates an empty [`BinaryViewArrayGeneric`], i.e. whose `.len` is zero. + #[inline] + pub fn new_empty(data_type: DataType) -> Self { + unsafe { Self::new_unchecked(data_type, Buffer::new(), Arc::from([]), None, 0, 0) } + } + + /// Returns a new null [`BinaryViewArrayGeneric`] of `length`. 
+ #[inline] + pub fn new_null(data_type: DataType, length: usize) -> Self { + let validity = Some(Bitmap::new_zeroed(length)); + unsafe { + Self::new_unchecked( + data_type, + Buffer::zeroed(length), + Arc::from([]), + validity, + 0, + 0, + ) + } + } + + /// Returns the element at index `i` + /// # Panics + /// iff `i >= self.len()` + #[inline] + pub fn value(&self, i: usize) -> &T { + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } + } + + /// Returns the element at index `i` + /// # Safety + /// Assumes that the `i < self.len`. + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &T { + let v = *self.views.get_unchecked(i); + let len = v.length; + + // view layout: + // length: 4 bytes + // prefix: 4 bytes + // buffer_index: 4 bytes + // offset: 4 bytes + + // inlined layout: + // length: 4 bytes + // data: 12 bytes + + let bytes = if len <= 12 { + let ptr = self.views.as_ptr() as *const u8; + std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) + } else { + let (data_ptr, data_len) = *self.raw_buffers.get_unchecked(v.buffer_idx as usize); + let data = std::slice::from_raw_parts(data_ptr, data_len); + let offset = v.offset as usize; + data.get_unchecked(offset..offset + len as usize) + }; + T::from_bytes_unchecked(bytes) + } + + /// Returns an iterator of `Option<&T>` over every element of this array. 
+ pub fn iter(&self) -> ZipValidity<&T, BinaryViewValueIter, BitmapIter> { + ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref()) + } + + /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity + pub fn values_iter(&self) -> BinaryViewValueIter { + BinaryViewValueIter::new(self) + } + + pub fn len_iter(&self) -> impl Iterator + '_ { + self.views.iter().map(|v| v.length) + } + + impl_sliced!(); + impl_mut_validity!(); + impl_into_array!(); + + pub fn from_slice, P: AsRef<[Option]>>(slice: P) -> Self { + let mutable = MutableBinaryViewArray::from_iterator( + slice.as_ref().iter().map(|opt_v| opt_v.as_ref()), + ); + mutable.into() + } + + pub fn from_slice_values, P: AsRef<[S]>>(slice: P) -> Self { + let mutable = + MutableBinaryViewArray::from_values_iter(slice.as_ref().iter().map(|v| v.as_ref())); + mutable.into() + } + + /// Get the total length of bytes that it would take to concatenate all binary/str values in this array. + pub fn total_bytes_len(&self) -> usize { + let total = self.total_bytes_len.load(Ordering::Relaxed); + if total == UNKNOWN_LEN { + let total = self.len_iter().map(|v| v as usize).sum::(); + self.total_bytes_len.store(total as u64, Ordering::Relaxed); + total + } else { + total as usize + } + } + + /// Get the length of bytes that are stored in the variadic buffers. 
+ pub fn total_buffer_len(&self) -> usize { + self.total_buffer_len + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.views.len() + } + + /// Garbage collect + pub fn gc(self) -> Self { + if self.buffers.is_empty() { + return self; + } + let mut mutable = MutableBinaryViewArray::with_capacity(self.len()); + let buffers = self.raw_buffers.as_ref(); + + for view in self.views.as_ref() { + unsafe { mutable.push_view(*view, buffers) } + } + mutable.freeze().with_validity(self.validity) + } + + pub fn is_sliced(&self) -> bool { + self.views.as_ptr() != self.views.storage_ptr() + } + + pub fn maybe_gc(self) -> Self { + const GC_MINIMUM_SAVINGS: usize = 16 * 1024; // At least 16 KiB. + + if self.total_buffer_len <= GC_MINIMUM_SAVINGS { + return self; + } + + // Subtract the maximum amount of inlined strings to get a lower bound + // on the number of buffer bytes needed (assuming no dedup). + let total_bytes_len = self.total_bytes_len(); + let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12); + + let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound; + let cur_mem_usage = self.len() * 16 + self.total_buffer_len(); + let savings_upper_bound = cur_mem_usage.saturating_sub(lower_bound_mem_usage_post_gc); + + if savings_upper_bound >= GC_MINIMUM_SAVINGS + && cur_mem_usage >= 4 * lower_bound_mem_usage_post_gc + { + self.gc() + } else { + self + } + } + + pub fn make_mut(self) -> MutableBinaryViewArray { + let views = self.views.make_mut(); + let completed_buffers = self.buffers.to_vec(); + let validity = self.validity.map(|bitmap| bitmap.make_mut()); + MutableBinaryViewArray { + views, + completed_buffers, + in_progress_buffer: vec![], + validity, + phantom: Default::default(), + total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize, + total_buffer_len: self.total_buffer_len, + } + } +} + +impl BinaryViewArray { + /// Validate the underlying bytes on UTF-8. 
+ pub fn validate_utf8(&self) -> Result<()> { + // SAFETY: views are correct + unsafe { validate_utf8_only(&self.views, &self.buffers) } + } + + /// Convert [`BinaryViewArray`] to [`Utf8ViewArray`]. + pub fn to_utf8view(&self) -> Result { + self.validate_utf8()?; + unsafe { Ok(self.to_utf8view_unchecked()) } + } + + /// Convert [`BinaryViewArray`] to [`Utf8ViewArray`] without checking UTF-8. + /// + /// # Safety + /// The caller must ensure the underlying data is valid UTF-8. + pub unsafe fn to_utf8view_unchecked(&self) -> Utf8ViewArray { + Utf8ViewArray::new_unchecked( + DataType::Utf8View, + self.views.clone(), + self.buffers.clone(), + self.validity.clone(), + self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_buffer_len, + ) + } +} + +impl Utf8ViewArray { + pub fn to_binview(&self) -> BinaryViewArray { + // SAFETY: same invariants. + unsafe { + BinaryViewArray::new_unchecked( + DataType::BinaryView, + self.views.clone(), + self.buffers.clone(), + self.validity.clone(), + self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_buffer_len, + ) + } + } +} + +impl Array for BinaryViewArrayGeneric { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + #[inline(always)] + fn len(&self) -> usize { + BinaryViewArrayGeneric::len(self) + } + + fn data_type(&self) -> &DataType { + T::dtype() + } + + fn validity(&self) -> Option<&Bitmap> { + self.validity.as_ref() + } + + fn slice(&mut self, offset: usize, length: usize) { + assert!( + offset + length <= self.len(), + "the offset of the new Buffer cannot exceed the existing length" + ); + unsafe { self.slice_unchecked(offset, length) } + } + + unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { + debug_assert!(offset + length <= self.len()); + self.validity = self + .validity + .take() + .map(|bitmap| bitmap.sliced_unchecked(offset, length)) + .filter(|bitmap| bitmap.unset_bits() > 0); + self.views.slice_unchecked(offset, 
length); + self.total_bytes_len.store(UNKNOWN_LEN, Ordering::Relaxed) + } + + fn with_validity(&self, validity: Option) -> Box { + let mut new = self.clone(); + new.validity = validity; + Box::new(new) + } + + fn to_boxed(&self) -> Box { + Box::new(self.clone()) + } +} diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs new file mode 100644 index 00000000000..ccb9de83c28 --- /dev/null +++ b/src/array/binview/mutable.rs @@ -0,0 +1,422 @@ +use std::any::Any; +use std::fmt::{Debug, Formatter}; +use std::sync::Arc; + +use crate::array::binview::iterator::MutableBinaryViewValueIter; +use crate::array::binview::view::validate_utf8_only; +use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; +use crate::array::{Array, MutableArray, View}; +use crate::bitmap::MutableBitmap; +use crate::buffer::Buffer; +use crate::error::Result; +use crate::datatypes::DataType; +use crate::trusted_len::TrustedLen; +use crate::types::NativeType; + +const DEFAULT_BLOCK_SIZE: usize = 8 * 1024; + +pub struct MutableBinaryViewArray { + pub(super) views: Vec, + pub(super) completed_buffers: Vec>, + pub(super) in_progress_buffer: Vec, + pub(super) validity: Option, + pub(super) phantom: std::marker::PhantomData, + /// Total bytes length if we would concatenate them all. 
+ pub(super) total_bytes_len: usize, + /// Total bytes in the buffer (excluding remaining capacity) + pub(super) total_buffer_len: usize, +} + +impl Clone for MutableBinaryViewArray { + fn clone(&self) -> Self { + Self { + views: self.views.clone(), + completed_buffers: self.completed_buffers.clone(), + in_progress_buffer: self.in_progress_buffer.clone(), + validity: self.validity.clone(), + phantom: Default::default(), + total_bytes_len: self.total_bytes_len, + total_buffer_len: self.total_buffer_len, + } + } +} + +impl Debug for MutableBinaryViewArray { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "mutable-binview{:?}", T::DATA_TYPE) + } +} + +impl Default for MutableBinaryViewArray { + fn default() -> Self { + Self::with_capacity(0) + } +} + +impl From> for BinaryViewArrayGeneric { + fn from(mut value: MutableBinaryViewArray) -> Self { + value.finish_in_progress(); + unsafe { + Self::new_unchecked( + T::DATA_TYPE, + value.views.into(), + Arc::from(value.completed_buffers), + value.validity.map(|b| b.into()), + value.total_bytes_len, + value.total_buffer_len, + ) + } + } +} + +impl MutableBinaryViewArray { + pub fn new() -> Self { + Self::default() + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + views: Vec::with_capacity(capacity), + completed_buffers: vec![], + in_progress_buffer: vec![], + validity: None, + phantom: Default::default(), + total_buffer_len: 0, + total_bytes_len: 0, + } + } + + #[inline] + pub fn views_mut(&mut self) -> &mut Vec { + &mut self.views + } + + #[inline] + pub fn views(&self) -> &[View] { + &self.views + } + + pub fn validity(&mut self) -> Option<&mut MutableBitmap> { + self.validity.as_mut() + } + + /// Reserves `additional` elements and `additional_buffer` on the buffer. 
+ pub fn reserve(&mut self, additional: usize) { + self.views.reserve(additional); + } + + #[inline] + pub fn len(&self) -> usize { + self.views.len() + } + + #[inline] + pub fn capacity(&self) -> usize { + self.views.capacity() + } + + fn init_validity(&mut self, unset_last: bool) { + let mut validity = MutableBitmap::with_capacity(self.views.capacity()); + validity.extend_constant(self.len(), true); + if unset_last { + validity.set(self.len() - 1, false); + } + self.validity = Some(validity); + } + + /// # Safety + /// - caller must allocate enough capacity + /// - caller must ensure the view and buffers match. + #[inline] + pub unsafe fn push_view(&mut self, v: View, buffers: &[(*const u8, usize)]) { + let len = v.length; + self.total_bytes_len += len as usize; + if len <= 12 { + debug_assert!(self.views.capacity() > self.views.len()); + self.views.push(v) + } else { + self.total_buffer_len += len as usize; + let (data_ptr, data_len) = *buffers.get_unchecked(v.buffer_idx as usize); + let data = std::slice::from_raw_parts(data_ptr, data_len); + let offset = v.offset as usize; + let bytes = data.get_unchecked(offset..offset + len as usize); + let t = T::from_bytes_unchecked(bytes); + self.push_value_ignore_validity(t) + } + } + + pub fn push_value_ignore_validity>(&mut self, value: V) { + let value = value.as_ref(); + let bytes = value.to_bytes(); + self.total_bytes_len += bytes.len(); + let len: u32 = bytes.len().try_into().unwrap(); + let mut payload = [0; 16]; + payload[0..4].copy_from_slice(&len.to_le_bytes()); + + if len <= 12 { + payload[4..4 + bytes.len()].copy_from_slice(bytes); + } else { + self.total_buffer_len += bytes.len(); + let required_cap = self.in_progress_buffer.len() + bytes.len(); + if self.in_progress_buffer.capacity() < required_cap { + let new_capacity = (self.in_progress_buffer.capacity() * 2) + .clamp(DEFAULT_BLOCK_SIZE, 16 * 1024 * 1024) + .max(bytes.len()); + let in_progress = Vec::with_capacity(new_capacity); + let flushed = 
std::mem::replace(&mut self.in_progress_buffer, in_progress); + if !flushed.is_empty() { + self.completed_buffers.push(flushed.into()) + } + } + let offset = self.in_progress_buffer.len() as u32; + self.in_progress_buffer.extend_from_slice(bytes); + + unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) }; + let buffer_idx: u32 = self.completed_buffers.len().try_into().unwrap(); + payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); + payload[12..16].copy_from_slice(&offset.to_le_bytes()); + } + let value = View::from_le_bytes(payload); + self.views.push(value); + } + + pub fn push_value>(&mut self, value: V) { + if let Some(validity) = &mut self.validity { + validity.push(true) + } + self.push_value_ignore_validity(value) + } + + pub fn push>(&mut self, value: Option) { + if let Some(value) = value { + self.push_value(value) + } else { + self.push_null() + } + } + + pub fn push_null(&mut self) { + self.views.push(View::default()); + match &mut self.validity { + Some(validity) => validity.push(false), + None => self.init_validity(true), + } + } + + pub fn extend_null(&mut self, additional: usize) { + if self.validity.is_none() && additional > 0 { + self.init_validity(false); + } + self.views + .extend(std::iter::repeat(View::default()).take(additional)); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } + } + + pub fn extend_constant>(&mut self, additional: usize, value: Option) { + if value.is_none() && self.validity.is_none() { + self.init_validity(false); + } + + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, value.is_some()) + } + + // Push and pop to get the properly encoded value. 
+ // For long string this leads to a dictionary encoding, + // as we push the string only once in the buffers + let view_value = value + .map(|v| { + self.push_value_ignore_validity(v); + self.views.pop().unwrap() + }) + .unwrap_or_default(); + self.views + .extend(std::iter::repeat(view_value).take(additional)); + } + + impl_mutable_array_mut_validity!(); + + #[inline] + pub fn extend_values(&mut self, iterator: I) + where + I: Iterator, + P: AsRef, + { + self.reserve(iterator.size_hint().0); + for v in iterator { + self.push_value(v) + } + } + + #[inline] + pub fn extend_trusted_len_values(&mut self, iterator: I) + where + I: TrustedLen, + P: AsRef, + { + self.extend_values(iterator) + } + + #[inline] + pub fn extend(&mut self, iterator: I) + where + I: Iterator>, + P: AsRef, + { + self.reserve(iterator.size_hint().0); + for p in iterator { + self.push(p) + } + } + + #[inline] + pub fn extend_trusted_len(&mut self, iterator: I) + where + I: TrustedLen>, + P: AsRef, + { + self.extend(iterator) + } + + #[inline] + pub fn from_iterator(iterator: I) -> Self + where + I: Iterator>, + P: AsRef, + { + let mut mutable = Self::with_capacity(iterator.size_hint().0); + mutable.extend(iterator); + mutable + } + + pub fn from_values_iter(iterator: I) -> Self + where + I: Iterator, + P: AsRef, + { + let mut mutable = Self::with_capacity(iterator.size_hint().0); + mutable.extend_values(iterator); + mutable + } + + pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + Self::from_iterator(slice.as_ref().iter().map(|opt_v| opt_v.as_ref())) + } + + fn finish_in_progress(&mut self) { + if !self.in_progress_buffer.is_empty() { + self.completed_buffers + .push(std::mem::take(&mut self.in_progress_buffer).into()); + } + } + + #[inline] + pub fn freeze(self) -> BinaryViewArrayGeneric { + self.into() + } + + /// Returns the element at index `i` + /// # Safety + /// Assumes that the `i < self.len`. 
+ #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &T { + let v = *self.views.get_unchecked(i); + let len = v.length; + + // view layout: + // length: 4 bytes + // prefix: 4 bytes + // buffer_index: 4 bytes + // offset: 4 bytes + + // inlined layout: + // length: 4 bytes + // data: 12 bytes + let bytes = if len <= 12 { + let ptr = self.views.as_ptr() as *const u8; + std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) + } else { + let buffer_idx = v.buffer_idx as usize; + let offset = v.offset; + + let data = if buffer_idx == self.completed_buffers.len() { + self.in_progress_buffer.as_slice() + } else { + self.completed_buffers.get_unchecked(buffer_idx) + }; + + let offset = offset as usize; + data.get_unchecked(offset..offset + len as usize) + }; + T::from_bytes_unchecked(bytes) + } + + /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity + pub fn values_iter(&self) -> MutableBinaryViewValueIter { + MutableBinaryViewValueIter::new(self) + } +} + +impl MutableBinaryViewArray<[u8]> { + pub fn validate_utf8(&mut self) -> Result<()> { + self.finish_in_progress(); + // views are correct + unsafe { validate_utf8_only(&self.views, &self.completed_buffers) } + } +} + +impl> Extend> for MutableBinaryViewArray { + #[inline] + fn extend>>(&mut self, iter: I) { + Self::extend(self, iter.into_iter()) + } +} + +impl> FromIterator> for MutableBinaryViewArray { + #[inline] + fn from_iter>>(iter: I) -> Self { + Self::from_iterator(iter.into_iter()) + } +} + +impl MutableArray for MutableBinaryViewArray { + fn data_type(&self) -> &DataType { + T::dtype() + } + + fn len(&self) -> usize { + MutableBinaryViewArray::len(self) + } + + fn validity(&self) -> Option<&MutableBitmap> { + self.validity.as_ref() + } + + fn as_box(&mut self) -> Box { + let mutable = std::mem::take(self); + let arr: BinaryViewArrayGeneric = mutable.into(); + arr.boxed() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> 
&mut dyn Any { + self + } + + fn push_null(&mut self) { + MutableBinaryViewArray::push_null(self) + } + + fn reserve(&mut self, additional: usize) { + MutableBinaryViewArray::reserve(self, additional) + } + + fn shrink_to_fit(&mut self) { + self.views.shrink_to_fit() + } +} diff --git a/src/array/binview/view.rs b/src/array/binview/view.rs new file mode 100644 index 00000000000..1bfad9ee2df --- /dev/null +++ b/src/array/binview/view.rs @@ -0,0 +1,196 @@ +use crate::buffer::Buffer; +use crate::error::{Error, Result}; +use std::fmt::{Display, Formatter}; +use std::ops::Add; + +use bytemuck::{Pod, Zeroable}; + +use crate::datatypes::PrimitiveType; +use crate::types::NativeType; + +// We use this instead of u128 because we want alignment of <= 8 bytes. +#[derive(Debug, Copy, Clone, Default)] +#[repr(C)] +pub struct View { + /// The length of the string/bytes. + pub length: u32, + /// First 4 bytes of string/bytes data. + pub prefix: u32, + /// The buffer index. + pub buffer_idx: u32, + /// The offset into the buffer. 
+ pub offset: u32, +} + +impl View { + #[inline(always)] + pub fn as_u128(self) -> u128 { + unsafe { std::mem::transmute(self) } + } +} + +impl Display for View { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + +unsafe impl Zeroable for View {} + +unsafe impl Pod for View {} + +impl Add for View { + type Output = View; + + fn add(self, _rhs: Self) -> Self::Output { + unimplemented!() + } +} + +impl num_traits::Zero for View { + fn zero() -> Self { + Default::default() + } + + fn is_zero(&self) -> bool { + *self == Self::zero() + } +} + +impl PartialEq for View { + fn eq(&self, other: &Self) -> bool { + self.as_u128() == other.as_u128() + } +} + +impl NativeType for View { + const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128; + type Bytes = [u8; 16]; + + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + self.as_u128().to_le_bytes() + } + + #[inline] + fn to_be_bytes(&self) -> Self::Bytes { + self.as_u128().to_be_bytes() + } + + #[inline] + fn from_le_bytes(bytes: Self::Bytes) -> Self { + Self::from(u128::from_le_bytes(bytes)) + } + + #[inline] + fn from_be_bytes(bytes: Self::Bytes) -> Self { + Self::from(u128::from_be_bytes(bytes)) + } +} + +impl From for View { + #[inline] + fn from(value: u128) -> Self { + unsafe { std::mem::transmute(value) } + } +} + +impl From for u128 { + #[inline] + fn from(value: View) -> Self { + value.as_u128() + } +} + +fn validate_view(views: &[View], buffers: &[Buffer], validate_bytes: F) -> Result<()> +where + F: Fn(&[u8]) -> Result<()>, +{ + for view in views { + let len = view.length; + if len <= 12 { + if len < 12 && view.as_u128() >> (32 + len * 8) != 0 { + return Err(Error::InvalidArgumentError(format!( + "View contained non-zero padding for string of length {len}", + ))); + } + + validate_bytes(&view.to_le_bytes()[4..4 + len as usize])?; + } else { + let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| { + Error::InvalidArgumentError(format!( + "Invalid buffer 
index: got index {} but only has {} buffers", + view.buffer_idx, + buffers.len() + )) + })?; + + let start = view.offset as usize; + let end = start + len as usize; + let b = data + .as_slice() + .get(start..end) + .ok_or_else(|| { + Error::InvalidArgumentError(format!( + "Invalid buffer slice: got {start}..{end} but buffer {} has length {}", + view.buffer_idx, + data.len() + )) + })?; + + if !b.starts_with(&view.prefix.to_le_bytes()) { + return Err(Error::InvalidArgumentError( + "Mismatch between embedded prefix and data".to_string(), + )); + } + validate_bytes(b)?; + }; + } + + Ok(()) +} + +pub(super) fn validate_binary_view(views: &[View], buffers: &[Buffer]) -> Result<()> { + validate_view(views, buffers, |_| Ok(())) +} + +fn validate_utf8(b: &[u8]) -> Result<()> { + match simdutf8::basic::from_utf8(b) { + Ok(_) => Ok(()), + Err(e) => Err(Error::InvalidArgumentError(format!( + "Encountered non-UTF-8 data {e}" + ))) + } +} + +pub(super) fn validate_utf8_view(views: &[View], buffers: &[Buffer]) -> Result<()> { + validate_view(views, buffers, validate_utf8) +} + +/// # Safety +/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB. 
+pub(super) unsafe fn validate_utf8_only( + views: &[View], + buffers: &[Buffer], +) -> Result<()> { + for view in views { + let len = view.length; + if len <= 12 { + validate_utf8( + view.to_le_bytes() + .get_unchecked(4..4 + len as usize), + )?; + } else { + let buffer_idx = view.buffer_idx; + let offset = view.offset; + let data = buffers.get_unchecked(buffer_idx as usize); + + let start = offset as usize; + let end = start + len as usize; + let b = &data.as_slice().get_unchecked(start..end); + validate_utf8(b)?; + }; + } + + Ok(()) +} diff --git a/src/array/dictionary/typed_iterator.rs b/src/array/dictionary/typed_iterator.rs index 0e90a1cf4d8..48b49b28ca1 100644 --- a/src/array/dictionary/typed_iterator.rs +++ b/src/array/dictionary/typed_iterator.rs @@ -1,5 +1,5 @@ -use crate::array::{Array, PrimitiveArray, Utf8Array}; use crate::error::{Error, Result}; +use crate::array::{Array, PrimitiveArray, Utf8Array, Utf8ViewArray}; use crate::trusted_len::TrustedLen; use crate::types::Offset; @@ -48,6 +48,34 @@ impl DictValue for Utf8Array { } } +impl DictValue for Utf8ViewArray { + type IterValue<'a> = &'a str; + + unsafe fn get_unchecked(&self, item: usize) -> Self::IterValue<'_> { + self.value_unchecked(item) + } + + fn downcast_values(array: &dyn Array) -> Result<&Self> + where + Self: Sized, + { + array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::InvalidArgumentError( + "could not convert array to dictionary value".into(), + )) + .map(|arr| { + assert_eq!( + arr.null_count(), + 0, + "null values in values not supported in iteration" + ); + arr + }) + } +} + /// Iterator of values of an `ListArray`. 
pub struct DictionaryValuesIterTyped<'a, K: DictionaryKey, V: DictValue> { keys: &'a PrimitiveArray, diff --git a/src/array/equal/binary_view.rs b/src/array/equal/binary_view.rs new file mode 100644 index 00000000000..546e3e2a181 --- /dev/null +++ b/src/array/equal/binary_view.rs @@ -0,0 +1,9 @@ +use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; +use crate::array::Array; + +pub(super) fn equal( + lhs: &BinaryViewArrayGeneric, + rhs: &BinaryViewArrayGeneric, +) -> bool { + lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) +} diff --git a/src/array/equal/mod.rs b/src/array/equal/mod.rs index 2bb3ba77f1f..19b09c77763 100644 --- a/src/array/equal/mod.rs +++ b/src/array/equal/mod.rs @@ -4,6 +4,7 @@ use crate::types::NativeType; use super::*; mod binary; +mod binary_view; mod boolean; mod dictionary; mod fixed_size_binary; @@ -283,6 +284,16 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); map::equal(lhs, rhs) - } + }, + BinaryView => { + let lhs = lhs.as_any().downcast_ref().unwrap(); + let rhs = rhs.as_any().downcast_ref().unwrap(); + binary_view::equal::<[u8]>(lhs, rhs) + }, + Utf8View => { + let lhs = lhs.as_any().downcast_ref().unwrap(); + let rhs = rhs.as_any().downcast_ref().unwrap(); + binary_view::equal::(lhs, rhs) + }, } } diff --git a/src/array/ffi.rs b/src/array/ffi.rs index 141cab327e4..e6bb31ead25 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -71,6 +71,8 @@ pub fn offset_buffers_children_dictionary(array: &dyn Array) -> BuffersChildren Struct => ffi_dyn!(array, StructArray), Union => ffi_dyn!(array, UnionArray), Map => ffi_dyn!(array, MapArray), + BinaryView => ffi_dyn!(array, BinaryViewArray), + Utf8View => ffi_dyn!(array, Utf8ViewArray), Dictionary(key_type) => { match_integer_type!(key_type, |$T| { let array = array.as_any().downcast_ref::>().unwrap(); diff --git 
a/src/array/fixed_size_binary/ffi.rs b/src/array/fixed_size_binary/ffi.rs index 444f3c3996e..d749944e020 100644 --- a/src/array/fixed_size_binary/ffi.rs +++ b/src/array/fixed_size_binary/ffi.rs @@ -11,7 +11,7 @@ unsafe impl ToFfi for FixedSizeBinaryArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.values.as_ptr().cast::()), + Some(self.values.storage_ptr().cast::()), ] } diff --git a/src/array/fmt.rs b/src/array/fmt.rs index 4f2c6896beb..bb47f66040c 100644 --- a/src/array/fmt.rs +++ b/src/array/fmt.rs @@ -91,6 +91,20 @@ pub fn get_value_display<'a, F: Write + 'a>( Map => Box::new(move |f, index| { super::map::fmt::write_value(array.as_any().downcast_ref().unwrap(), index, null, f) }), + BinaryView => Box::new(move |f, index| { + super::binview::fmt::write_value::<[u8], _>( + array.as_any().downcast_ref().unwrap(), + index, + f, + ) + }), + Utf8View => Box::new(move |f, index| { + super::binview::fmt::write_value::( + array.as_any().downcast_ref().unwrap(), + index, + f, + ) + }), Dictionary(key_type) => match_integer_type!(key_type, |$T| { Box::new(move |f, index| { super::dictionary::fmt::write_value::<$T,_>(array.as_any().downcast_ref().unwrap(), index, null, f) diff --git a/src/array/growable/binary.rs b/src/array/growable/binary.rs index 53ff0ae4feb..06f7ce6867f 100644 --- a/src/array/growable/binary.rs +++ b/src/array/growable/binary.rs @@ -8,18 +8,18 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, + utils::extend_offset_values, Growable, }; +use crate::array::growable::utils::{extend_validity, prepare_validity}; /// Concrete [`Growable`] for the [`BinaryArray`]. 
pub struct GrowableBinary<'a, O: Offset> { arrays: Vec<&'a BinaryArray>, data_type: DataType, - validity: MutableBitmap, + validity: Option, values: Vec, offsets: Offsets, - extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableBinary<'a, O> { @@ -35,18 +35,12 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - Self { arrays, data_type, values: Vec::with_capacity(0), offsets: Offsets::with_capacity(capacity), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -56,15 +50,20 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { let offsets = std::mem::take(&mut self.offsets); let values = std::mem::take(&mut self.values); - BinaryArray::::new(data_type, offsets.into(), values.into(), validity.into()) + BinaryArray::::new( + data_type, + offsets.into(), + values.into(), + validity.map(|v| v.into()), + ) } } impl<'a, O: Offset> Growable<'a> for GrowableBinary<'a, O> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + let offsets = array.offsets(); let values = array.values(); @@ -78,7 +77,9 @@ impl<'a, O: Offset> Growable<'a> for GrowableBinary<'a, O> { fn extend_validity(&mut self, additional: usize) { self.offsets.extend_constant(additional); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -101,7 +102,7 @@ impl<'a, O: Offset> From> for BinaryArray { val.data_type, val.offsets.into(), val.values.into(), - val.validity.into(), + val.validity.map(|v| v.into()), ) } } diff --git a/src/array/growable/binview.rs b/src/array/growable/binview.rs new file mode 100644 index 
00000000000..92f66b8d52f --- /dev/null +++ b/src/array/growable/binview.rs @@ -0,0 +1,177 @@ +use std::sync::Arc; + +use super::Growable; +use crate::array::binview::{BinaryViewArrayGeneric, View, ViewType}; +use crate::array::growable::utils::{extend_validity, prepare_validity}; +use crate::array::Array; +use crate::bitmap::MutableBitmap; +use crate::buffer::Buffer; +use crate::datatypes::DataType; + +/// Concrete [`Growable`] for the [`BinaryArray`]. +pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> { + arrays: Vec<&'a BinaryViewArrayGeneric>, + data_type: DataType, + validity: Option, + views: Vec, + buffers: Vec>, + buffers_idx_offsets: Vec, + total_bytes_len: usize, + total_buffer_len: usize, +} + +impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { + /// Creates a new [`GrowableBinaryViewArray`] bound to `arrays` with a pre-allocated `capacity`. + /// # Panics + /// If `arrays` is empty. + pub fn new( + arrays: Vec<&'a BinaryViewArrayGeneric>, + mut use_validity: bool, + capacity: usize, + ) -> Self { + let data_type = arrays[0].data_type().clone(); + + // if any of the arrays has nulls, insertions from any array requires setting bits + // as there is at least one array with nulls. 
+ if !use_validity & arrays.iter().any(|array| array.null_count() > 0) { + use_validity = true; + }; + + let mut cum_sum = 0; + let cum_offset = arrays + .iter() + .map(|binview| { + let out = cum_sum; + cum_sum += binview.data_buffers().len() as u32; + out + }) + .collect::>(); + + let buffers = arrays + .iter() + .flat_map(|array| array.data_buffers().as_ref()) + .cloned() + .collect::>(); + let total_buffer_len = arrays + .iter() + .map(|arr| arr.data_buffers().len()) + .sum::(); + + Self { + arrays, + data_type, + validity: prepare_validity(use_validity, capacity), + views: Vec::with_capacity(capacity), + buffers, + buffers_idx_offsets: cum_offset, + total_bytes_len: 0, + total_buffer_len, + } + } + + fn to(&mut self) -> BinaryViewArrayGeneric { + let views = std::mem::take(&mut self.views); + let buffers = std::mem::take(&mut self.buffers); + let validity = self.validity.take(); + unsafe { + BinaryViewArrayGeneric::::new_unchecked( + self.data_type.clone(), + views.into(), + Arc::from(buffers), + validity.map(|v| v.into()), + self.total_bytes_len, + self.total_buffer_len, + ) + .maybe_gc() + } + } + + /// # Safety + /// doesn't check bounds + pub unsafe fn extend_unchecked(&mut self, index: usize, start: usize, len: usize) { + let array = *self.arrays.get_unchecked(index); + + extend_validity(&mut self.validity, array, start, len); + + let range = start..start + len; + + self.views + .extend(array.views().get_unchecked(range).iter().map(|view| { + let mut view = *view; + let len = view.length as usize; + self.total_bytes_len += len; + + if len > 12 { + let buffer_idx = *self.buffers_idx_offsets.get_unchecked(index); + view.buffer_idx += buffer_idx; + } + view + })); + } + + #[inline] + pub(crate) unsafe fn extend_unchecked_no_buffers( + &mut self, + index: usize, + start: usize, + len: usize, + ) { + let array = *self.arrays.get_unchecked(index); + + extend_validity(&mut self.validity, array, start, len); + + let range = start..start + len; + + self.views + 
.extend(array.views().get_unchecked(range).iter().map(|view| { + let len = view.length as usize; + self.total_bytes_len += len; + + *view + })) + } +} + +impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { + fn extend(&mut self, index: usize, start: usize, len: usize) { + assert!(index < self.arrays.len()); + unsafe { self.extend_unchecked(index, start, len) } + } + + fn extend_validity(&mut self, additional: usize) { + self.views + .extend(std::iter::repeat(View::default()).take(additional)); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } + } + + #[inline] + fn len(&self) -> usize { + self.views.len() + } + + fn as_arc(&mut self) -> Arc { + self.to().arced() + } + + fn as_box(&mut self) -> Box { + self.to().boxed() + } +} + +impl<'a, T: ViewType + ?Sized> From> for BinaryViewArrayGeneric { + fn from(val: GrowableBinaryViewArray<'a, T>) -> Self { + unsafe { + BinaryViewArrayGeneric::::new_unchecked( + val.data_type, + val.views.into(), + Arc::from(val.buffers), + val.validity.map(|v| v.into()), + val.total_bytes_len, + val.total_buffer_len, + ) + .maybe_gc() + } + } +} diff --git a/src/array/growable/boolean.rs b/src/array/growable/boolean.rs index 0cb1213403f..09a2bc632c9 100644 --- a/src/array/growable/boolean.rs +++ b/src/array/growable/boolean.rs @@ -7,7 +7,7 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; @@ -15,9 +15,8 @@ use super::{ pub struct GrowableBoolean<'a> { arrays: Vec<&'a BooleanArray>, data_type: DataType, - validity: MutableBitmap, + validity: Option, values: MutableBitmap, - extend_null_bits: Vec>, } impl<'a> GrowableBoolean<'a> { @@ -33,33 +32,31 @@ impl<'a> GrowableBoolean<'a> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - Self { arrays, data_type, values: 
MutableBitmap::with_capacity(capacity), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } fn to(&mut self) -> BooleanArray { - let validity = std::mem::take(&mut self.validity); + let validity = self.validity.take(); let values = std::mem::take(&mut self.values); - BooleanArray::new(self.data_type.clone(), values.into(), validity.into()) + BooleanArray::new( + self.data_type.clone(), + values.into(), + validity.map(|v| v.into()), + ) } } impl<'a> Growable<'a> for GrowableBoolean<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + let values = array.values(); let (slice, offset, _) = values.as_slice(); @@ -72,7 +69,9 @@ impl<'a> Growable<'a> for GrowableBoolean<'a> { fn extend_validity(&mut self, additional: usize) { self.values.extend_constant(additional, false); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -91,6 +90,10 @@ impl<'a> Growable<'a> for GrowableBoolean<'a> { impl<'a> From> for BooleanArray { fn from(val: GrowableBoolean<'a>) -> Self { - BooleanArray::new(val.data_type, val.values.into(), val.validity.into()) + BooleanArray::new( + val.data_type, + val.values.into(), + val.validity.map(|v| v.into()), + ) } } diff --git a/src/array/growable/dictionary.rs b/src/array/growable/dictionary.rs index f550304c852..44f2aab00c5 100644 --- a/src/array/growable/dictionary.rs +++ b/src/array/growable/dictionary.rs @@ -8,7 +8,7 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; @@ -18,12 +18,11 @@ use super::{ /// the values of each [`DictionaryArray`] one after the other. 
pub struct GrowableDictionary<'a, K: DictionaryKey> { data_type: DataType, - keys_values: Vec<&'a [K]>, + keys: Vec<&'a PrimitiveArray>, key_values: Vec, - key_validity: MutableBitmap, + validity: Option, offsets: Vec, values: Box, - extend_null_bits: Vec>, } fn concatenate_values( @@ -55,16 +54,6 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { }; let arrays_keys = arrays.iter().map(|array| array.keys()).collect::>(); - let keys_values = arrays_keys - .iter() - .map(|array| array.values().as_slice()) - .collect::>(); - - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(array.keys(), use_validity)) - .collect(); - let arrays_values = arrays .iter() .map(|array| array.values().as_ref()) @@ -76,24 +65,26 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { data_type, offsets, values, - keys_values, + keys: arrays_keys, key_values: Vec::with_capacity(capacity), - key_validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } #[inline] fn to(&mut self) -> DictionaryArray { - let validity = std::mem::take(&mut self.key_validity); + let validity = self.validity.take(); let key_values = std::mem::take(&mut self.key_values); #[cfg(debug_assertions)] { crate::array::specification::check_indexes(&key_values, self.values.len()).unwrap(); } - let keys = - PrimitiveArray::::new(T::PRIMITIVE.into(), key_values.into(), validity.into()); + let keys = PrimitiveArray::::new( + T::PRIMITIVE.into(), + key_values.into(), + validity.map(|v| v.into()), + ); // Safety - the invariant of this struct ensures that this is up-held unsafe { @@ -110,9 +101,10 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { impl<'a, T: DictionaryKey> Growable<'a> for GrowableDictionary<'a, T> { #[inline] fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.key_validity, start, len); + let keys_array = self.keys[index]; + 
extend_validity(&mut self.validity, keys_array, start, len); - let values = &self.keys_values[index][start..start + len]; + let values = &keys_array.values()[start..start + len]; let offset = self.offsets[index]; self.key_values.extend( values @@ -141,7 +133,9 @@ impl<'a, T: DictionaryKey> Growable<'a> for GrowableDictionary<'a, T> { fn extend_validity(&mut self, additional: usize) { self.key_values .resize(self.key_values.len() + additional, T::default()); - self.key_validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] diff --git a/src/array/growable/fixed_binary.rs b/src/array/growable/fixed_binary.rs index 763bd59c817..7014e75cc8e 100644 --- a/src/array/growable/fixed_binary.rs +++ b/src/array/growable/fixed_binary.rs @@ -6,16 +6,15 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; /// Concrete [`Growable`] for the [`FixedSizeBinaryArray`]. 
pub struct GrowableFixedSizeBinary<'a> { arrays: Vec<&'a FixedSizeBinaryArray>, - validity: MutableBitmap, + validity: Option, values: Vec, - extend_null_bits: Vec>, size: usize, // just a cache } @@ -34,17 +33,11 @@ impl<'a> GrowableFixedSizeBinary<'a> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let size = FixedSizeBinaryArray::get_size(arrays[0].data_type()); Self { arrays, values: Vec::with_capacity(0), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), size, } } @@ -56,16 +49,16 @@ impl<'a> GrowableFixedSizeBinary<'a> { FixedSizeBinaryArray::new( self.arrays[0].data_type().clone(), values.into(), - validity.into(), + validity.map(|v| v.into()), ) } } impl<'a> Growable<'a> for GrowableFixedSizeBinary<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + let values = array.values(); self.values @@ -75,7 +68,9 @@ impl<'a> Growable<'a> for GrowableFixedSizeBinary<'a> { fn extend_validity(&mut self, additional: usize) { self.values .extend_from_slice(&vec![0; self.size * additional]); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -97,7 +92,7 @@ impl<'a> From> for FixedSizeBinaryArray { FixedSizeBinaryArray::new( val.arrays[0].data_type().clone(), val.values.into(), - val.validity.into(), + val.validity.map(|v| v.into()), ) } } diff --git a/src/array/growable/fixed_size_list.rs b/src/array/growable/fixed_size_list.rs index a70695f4554..37b1519d60f 100644 --- a/src/array/growable/fixed_size_list.rs +++ b/src/array/growable/fixed_size_list.rs @@ -8,16 +8,15 @@ use crate::{ use super::{ make_growable, - 
utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; /// Concrete [`Growable`] for the [`FixedSizeListArray`]. pub struct GrowableFixedSizeList<'a> { arrays: Vec<&'a FixedSizeListArray>, - validity: MutableBitmap, + validity: Option, values: Box + 'a>, - extend_null_bits: Vec>, size: usize, } @@ -45,11 +44,6 @@ impl<'a> GrowableFixedSizeList<'a> { unreachable!("`GrowableFixedSizeList` expects `DataType::FixedSizeList`") }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let inner = arrays .iter() .map(|array| array.values().as_ref()) @@ -59,8 +53,7 @@ impl<'a> GrowableFixedSizeList<'a> { Self { arrays, values, - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), size, } } @@ -69,20 +62,28 @@ impl<'a> GrowableFixedSizeList<'a> { let validity = std::mem::take(&mut self.validity); let values = self.values.as_box(); - FixedSizeListArray::new(self.arrays[0].data_type().clone(), values, validity.into()) + FixedSizeListArray::new( + self.arrays[0].data_type().clone(), + values, + validity.map(|v| v.into()), + ) } } impl<'a> Growable<'a> for GrowableFixedSizeList<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + self.values .extend(index, start * self.size, len * self.size); } fn extend_validity(&mut self, additional: usize) { self.values.extend_validity(additional * self.size); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -107,7 +108,7 @@ impl<'a> From> for FixedSizeListArray { Self::new( val.arrays[0].data_type().clone(), values, - val.validity.into(), + val.validity.map(|v| 
v.into()), ) } } diff --git a/src/array/growable/list.rs b/src/array/growable/list.rs index c0abb26dd72..e8506ecc522 100644 --- a/src/array/growable/list.rs +++ b/src/array/growable/list.rs @@ -8,7 +8,7 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; @@ -35,10 +35,9 @@ fn extend_offset_values( /// Concrete [`Growable`] for the [`ListArray`]. pub struct GrowableList<'a, O: Offset> { arrays: Vec<&'a ListArray>, - validity: MutableBitmap, + validity: Option, values: Box + 'a>, offsets: Offsets, - extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableList<'a, O> { @@ -52,11 +51,6 @@ impl<'a, O: Offset> GrowableList<'a, O> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let inner = arrays .iter() .map(|array| array.values().as_ref()) @@ -67,8 +61,7 @@ impl<'a, O: Offset> GrowableList<'a, O> { arrays, offsets: Offsets::with_capacity(capacity), values, - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -81,20 +74,23 @@ impl<'a, O: Offset> GrowableList<'a, O> { self.arrays[0].data_type().clone(), offsets.into(), values, - validity.into(), + validity.map(|v| v.into()), ) } } impl<'a, O: Offset> Growable<'a> for GrowableList<'a, O> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); extend_offset_values::(self, index, start, len); } fn extend_validity(&mut self, additional: usize) { self.offsets.extend_constant(additional); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] diff --git a/src/array/growable/map.rs 
b/src/array/growable/map.rs index 0919b4821ba..27238f69ca2 100644 --- a/src/array/growable/map.rs +++ b/src/array/growable/map.rs @@ -8,7 +8,7 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; @@ -30,10 +30,9 @@ fn extend_offset_values(growable: &mut GrowableMap<'_>, index: usize, start: usi /// Concrete [`Growable`] for the [`MapArray`]. pub struct GrowableMap<'a> { arrays: Vec<&'a MapArray>, - validity: MutableBitmap, + validity: Option, values: Box + 'a>, offsets: Offsets, - extend_null_bits: Vec>, } impl<'a> GrowableMap<'a> { @@ -47,11 +46,6 @@ impl<'a> GrowableMap<'a> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let inner = arrays .iter() .map(|array| array.field().as_ref()) @@ -62,8 +56,7 @@ impl<'a> GrowableMap<'a> { arrays, offsets: Offsets::with_capacity(capacity), values, - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -76,20 +69,23 @@ impl<'a> GrowableMap<'a> { self.arrays[0].data_type().clone(), offsets.into(), values, - validity.into(), + validity.map(|v| v.into()), ) } } impl<'a> Growable<'a> for GrowableMap<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); extend_offset_values(self, index, start, len); } fn extend_validity(&mut self, additional: usize) { self.offsets.extend_constant(additional); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index 45f79405307..2da64e4f5b4 100644 --- a/src/array/growable/mod.rs +++ 
b/src/array/growable/mod.rs @@ -30,6 +30,8 @@ pub use utf8::GrowableUtf8; mod dictionary; pub use dictionary::GrowableDictionary; +mod binview; +pub use binview::GrowableBinaryViewArray; mod utils; /// Describes a struct that can be extended from slices of other pre-existing [`Array`]s. @@ -118,14 +120,22 @@ pub fn make_growable<'a>( use_validity, capacity ), - Union => { - let arrays = arrays - .iter() - .map(|array| array.as_any().downcast_ref().unwrap()) - .collect::>(); - Box::new(union::GrowableUnion::new(arrays, capacity)) - } - Map => dyn_growable!(map::GrowableMap, arrays, use_validity, capacity), + BinaryView => { + dyn_growable!( + binview::GrowableBinaryViewArray::<[u8]>, + arrays, + use_validity, + capacity + ) + }, + Utf8View => { + dyn_growable!( + binview::GrowableBinaryViewArray::, + arrays, + use_validity, + capacity + ) + }, Dictionary(key_type) => { match_integer_type!(key_type, |$T| { let arrays = arrays @@ -143,6 +153,14 @@ pub fn make_growable<'a>( capacity, )) }) - } + }, + Map => dyn_growable!(map::GrowableMap, arrays, use_validity, capacity), + Union => { + let arrays = arrays + .iter() + .map(|array| array.as_any().downcast_ref().unwrap()) + .collect::>(); + Box::new(union::GrowableUnion::new(arrays, capacity)) + }, } } diff --git a/src/array/growable/primitive.rs b/src/array/growable/primitive.rs index e443756cb95..7fb0939407d 100644 --- a/src/array/growable/primitive.rs +++ b/src/array/growable/primitive.rs @@ -8,17 +8,16 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; /// Concrete [`Growable`] for the [`PrimitiveArray`]. 
pub struct GrowablePrimitive<'a, T: NativeType> { data_type: DataType, - arrays: Vec<&'a [T]>, - validity: MutableBitmap, + arrays: Vec<&'a PrimitiveArray>, + validity: Option, values: Vec, - extend_null_bits: Vec>, } impl<'a, T: NativeType> GrowablePrimitive<'a, T> { @@ -38,22 +37,11 @@ impl<'a, T: NativeType> GrowablePrimitive<'a, T> { let data_type = arrays[0].data_type().clone(); - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - - let arrays = arrays - .iter() - .map(|array| array.values().as_slice()) - .collect::>(); - Self { data_type, arrays, values: Vec::with_capacity(capacity), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -62,16 +50,21 @@ impl<'a, T: NativeType> GrowablePrimitive<'a, T> { let validity = std::mem::take(&mut self.validity); let values = std::mem::take(&mut self.values); - PrimitiveArray::::new(self.data_type.clone(), values.into(), validity.into()) + PrimitiveArray::::new( + self.data_type.clone(), + values.into(), + validity.map(|v| v.into()), + ) } } impl<'a, T: NativeType> Growable<'a> for GrowablePrimitive<'a, T> { #[inline] fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); - let values = self.arrays[index]; + let values = array.values().as_slice(); self.values.extend_from_slice(&values[start..start + len]); } @@ -79,7 +72,9 @@ impl<'a, T: NativeType> Growable<'a> for GrowablePrimitive<'a, T> { fn extend_validity(&mut self, additional: usize) { self.values .resize(self.values.len() + additional, T::default()); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -101,6 +96,10 @@ impl<'a, T: NativeType> 
Growable<'a> for GrowablePrimitive<'a, T> { impl<'a, T: NativeType> From> for PrimitiveArray { #[inline] fn from(val: GrowablePrimitive<'a, T>) -> Self { - PrimitiveArray::::new(val.data_type, val.values.into(), val.validity.into()) + PrimitiveArray::::new( + val.data_type, + val.values.into(), + val.validity.map(|v| v.into()), + ) } } diff --git a/src/array/growable/structure.rs b/src/array/growable/structure.rs index b1242e08a4f..ccfe4399435 100644 --- a/src/array/growable/structure.rs +++ b/src/array/growable/structure.rs @@ -7,16 +7,15 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; /// Concrete [`Growable`] for the [`StructArray`]. pub struct GrowableStruct<'a> { arrays: Vec<&'a StructArray>, - validity: MutableBitmap, + validity: Option, values: Vec + 'a>>, - extend_null_bits: Vec>, } impl<'a> GrowableStruct<'a> { @@ -32,11 +31,6 @@ impl<'a> GrowableStruct<'a> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref::().unwrap()) @@ -59,8 +53,7 @@ impl<'a> GrowableStruct<'a> { Self { arrays, values, - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -69,15 +62,19 @@ impl<'a> GrowableStruct<'a> { let values = std::mem::take(&mut self.values); let values = values.into_iter().map(|mut x| x.as_box()).collect(); - StructArray::new(self.arrays[0].data_type().clone(), values, validity.into()) + StructArray::new( + self.arrays[0].data_type().clone(), + values, + validity.map(|v| v.into()), + ) } } impl<'a> Growable<'a> for GrowableStruct<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut 
self.validity, array, start, len); + if array.null_count() == 0 { self.values .iter_mut() @@ -101,18 +98,17 @@ impl<'a> Growable<'a> for GrowableStruct<'a> { self.values .iter_mut() .for_each(|child| child.extend_validity(additional)); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] fn len(&self) -> usize { - // All children should have the same indexing, so just use the first - // one. If we don't have children, we might still have a validity - // array, so use that. - if let Some(child) = self.values.get(0) { + if let Some(child) = self.values.first() { child.len() } else { - self.validity.len() + unreachable!() } } @@ -132,7 +128,7 @@ impl<'a> From> for StructArray { StructArray::new( val.arrays[0].data_type().clone(), values, - val.validity.into(), + val.validity.map(|v| v.into()), ) } } diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index cd71da0a264..f65709f961e 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -7,17 +7,16 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, + utils::{extend_validity, prepare_validity, extend_offset_values}, Growable, }; /// Concrete [`Growable`] for the [`Utf8Array`]. 
pub struct GrowableUtf8<'a, O: Offset> { arrays: Vec<&'a Utf8Array>, - validity: MutableBitmap, + validity: Option, values: Vec, offsets: Offsets, - extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableUtf8<'a, O> { @@ -31,17 +30,11 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - Self { arrays: arrays.to_vec(), values: Vec::with_capacity(0), offsets: Offsets::with_capacity(capacity), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -60,7 +53,7 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { self.arrays[0].data_type().clone(), offsets.into(), values.into(), - validity.into(), + validity.map(|v| v.into()), ) .unwrap() } @@ -69,9 +62,9 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { impl<'a, O: Offset> Growable<'a> for GrowableUtf8<'a, O> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + let offsets = array.offsets(); let values = array.values(); @@ -85,7 +78,9 @@ impl<'a, O: Offset> Growable<'a> for GrowableUtf8<'a, O> { fn extend_validity(&mut self, additional: usize) { self.offsets.extend_constant(additional); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] diff --git a/src/array/growable/utils.rs b/src/array/growable/utils.rs index 3e0c25a4ee2..3da39275bb4 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -1,28 +1,5 @@ use crate::{array::Array, bitmap::MutableBitmap, offset::Offset}; -// function used to extend nulls from arrays. This function's lifetime is bound to the array -// because it reads nulls from it. 
-pub(super) type ExtendNullBits<'a> = Box; - -pub(super) fn build_extend_null_bits(array: &dyn Array, use_validity: bool) -> ExtendNullBits { - if let Some(bitmap) = array.validity() { - Box::new(move |validity, start, len| { - debug_assert!(start + len <= bitmap.len()); - let (slice, offset, _) = bitmap.as_slice(); - // safety: invariant offset + length <= slice.len() - unsafe { - validity.extend_from_slice_unchecked(slice, start + offset, len); - } - }) - } else if use_validity { - Box::new(|validity, _, len| { - validity.extend_constant(len, true); - }) - } else { - Box::new(|_, _, _| {}) - } -} - #[inline] pub(super) fn extend_offset_values( buffer: &mut Vec, @@ -36,3 +13,32 @@ pub(super) fn extend_offset_values( let new_values = &values[start_values..end_values]; buffer.extend_from_slice(new_values); } + +pub(super) fn prepare_validity(use_validity: bool, capacity: usize) -> Option { + if use_validity { + Some(MutableBitmap::with_capacity(capacity)) + } else { + None + } +} + +pub(super) fn extend_validity( + mutable_validity: &mut Option, + array: &dyn Array, + start: usize, + len: usize, +) { + if let Some(mutable_validity) = mutable_validity { + match array.validity() { + None => mutable_validity.extend_constant(len, true), + Some(validity) => { + debug_assert!(start + len <= validity.len()); + let (slice, offset, _) = validity.as_slice(); + // safety: invariant offset + length <= slice.len() + unsafe { + mutable_validity.extend_from_slice_unchecked(slice, start + offset, len); + } + }, + } + } +} diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 2b6be75e782..8edcb1ef4c9 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -9,7 +9,7 @@ unsafe impl ToFfi for ListArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.buffer().as_ptr().cast::()), + Some(self.offsets.buffer().storage_ptr().cast::()), ] } @@ -61,6 +61,6 @@ impl FromFfi for ListArray { // assumption that data from FFI 
is well constructed let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; - Ok(Self::new(data_type, offsets, values, validity)) + Self::try_new(data_type, offsets, values, validity) } } diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index 09920419c21..7da6c73b0cb 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -7,7 +7,7 @@ unsafe impl ToFfi for MapArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.buffer().as_ptr().cast::()), + Some(self.offsets.buffer().storage_ptr().cast::()), ] } diff --git a/src/array/mod.rs b/src/array/mod.rs index 02735c3d0bb..eeec8b77380 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -308,6 +308,8 @@ macro_rules! with_match_primitive_type {( Float16 => __with_ty__! { f16 }, Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! { f64 }, + _ => panic!("operator does not support primitive `{:?}`", + $key_type) } })} @@ -320,6 +322,8 @@ impl std::fmt::Debug for dyn Array + '_ { Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { fmt_dyn!(self, PrimitiveArray<$T>, f) }), + BinaryView => fmt_dyn!(self, BinaryViewArray, f), + Utf8View => fmt_dyn!(self, Utf8ViewArray, f), Binary => fmt_dyn!(self, BinaryArray, f), LargeBinary => fmt_dyn!(self, BinaryArray, f), FixedSizeBinary => fmt_dyn!(self, FixedSizeBinaryArray, f), @@ -360,6 +364,8 @@ pub fn new_empty_array(data_type: DataType) -> Box { Struct => Box::new(StructArray::new_empty(data_type)), Union => Box::new(UnionArray::new_empty(data_type)), Map => Box::new(MapArray::new_empty(data_type)), + Utf8View => Box::new(Utf8ViewArray::new_empty(data_type)), + BinaryView => Box::new(BinaryViewArray::new_empty(data_type)), Dictionary(key_type) => { match_integer_type!(key_type, |$T| { Box::new(DictionaryArray::<$T>::new_empty(data_type)) @@ -390,6 +396,8 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box { Struct => Box::new(StructArray::new_null(data_type, length)), 
Union => Box::new(UnionArray::new_null(data_type, length)), Map => Box::new(MapArray::new_null(data_type, length)), + BinaryView => Box::new(BinaryViewArray::new_null(data_type, length)), + Utf8View => Box::new(Utf8ViewArray::new_null(data_type, length)), Dictionary(key_type) => { match_integer_type!(key_type, |$T| { Box::new(DictionaryArray::<$T>::new_null(data_type, length)) @@ -472,6 +480,7 @@ pub fn to_data(array: &dyn Array) -> arrow_data::ArrayData { }) } Map => to_data_dyn!(array, MapArray), + BinaryView | Utf8View => todo!(), } } @@ -502,6 +511,7 @@ pub fn from_data(data: &arrow_data::ArrayData) -> Box { }) } Map => Box::new(MapArray::from_data(data)), + BinaryView | Utf8View => todo!(), } } @@ -687,6 +697,8 @@ pub fn clone(array: &dyn Array) -> Box { Struct => clone_dyn!(array, StructArray), Union => clone_dyn!(array, UnionArray), Map => clone_dyn!(array, MapArray), + BinaryView => clone_dyn!(array, BinaryViewArray), + Utf8View => clone_dyn!(array, Utf8ViewArray), Dictionary(key_type) => { match_integer_type!(key_type, |$T| { clone_dyn!(array, DictionaryArray::<$T>) @@ -724,6 +736,7 @@ mod fmt; pub mod indexable; mod iterator; +mod binview; pub mod growable; pub mod ord; @@ -734,6 +747,10 @@ pub use equal::equal; pub use fmt::{get_display, get_value_display}; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; +pub use binview::{ + BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, MutablePlBinary, + MutablePlString, Utf8ViewArray, View, ViewType, +}; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; pub use fixed_size_binary::{FixedSizeBinaryArray, MutableFixedSizeBinaryArray}; diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index de5d6a70584..93c3939938d 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -13,7 +13,7 @@ unsafe impl ToFfi for PrimitiveArray { fn 
buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.values.as_ptr().cast::()), + Some(self.values.storage_ptr().cast::()), ] } diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 04b74a3529b..a5bfc3b3b86 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -96,6 +96,20 @@ impl PrimitiveArray { }) } + /// # Safety + /// Doesn't check invariants + pub unsafe fn new_unchecked( + data_type: DataType, + values: Buffer, + validity: Option, + ) -> Self { + Self { + data_type, + values, + validity, + } + } + /// Returns a new [`PrimitiveArray`] with a different logical type. /// /// This function is useful to assign a different [`DataType`] to the array. diff --git a/src/array/primitive/mutable.rs b/src/array/primitive/mutable.rs index 4432ab2e33f..e2f3a4c2f52 100644 --- a/src/array/primitive/mutable.rs +++ b/src/array/primitive/mutable.rs @@ -286,6 +286,10 @@ impl MutablePrimitiveArray { pub fn capacity(&self) -> usize { self.values.capacity() } + + pub fn freeze(self) -> PrimitiveArray { + self.into() + } } /// Accessors diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index 89cee93e4d3..87403cf1a3c 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -7,11 +7,11 @@ unsafe impl ToFfi for UnionArray { fn buffers(&self) -> Vec> { if let Some(offsets) = &self.offsets { vec![ - Some(self.types.as_ptr().cast::()), - Some(offsets.as_ptr().cast::()), + Some(self.types.storage_ptr().cast::()), + Some(offsets.storage_ptr().cast::()), ] } else { - vec![Some(self.types.as_ptr().cast::())] + vec![Some(self.types.storage_ptr().cast::())] } } diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 3611678da57..99ffe75faf2 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -12,8 +12,8 @@ unsafe impl ToFfi for Utf8Array { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.buffer().as_ptr().cast::()), - 
Some(self.values.as_ptr().cast::()), + Some(self.offsets.buffer().storage_ptr().cast::()), + Some(self.values.storage_ptr().cast::()), ] } diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 9440ae43304..648e23f8a25 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -12,10 +12,7 @@ use crate::{ use either::Either; -use super::{ - specification::{try_check_offsets_bounds, try_check_utf8}, - Array, GenericBinaryArray, -}; +use super::{specification::{try_check_offsets_bounds, try_check_utf8}, Array, GenericBinaryArray, BinaryArray}; #[cfg(feature = "arrow")] mod data; @@ -513,6 +510,18 @@ impl Utf8Array { self.set_validity(Some(f(validity))) } } + + // Convert this [`Utf8Array`] to a [`BinaryArray`]. + pub fn to_binary(&self) -> BinaryArray { + unsafe { + BinaryArray::new( + BinaryArray::::default_data_type(), + self.offsets.clone(), + self.values.clone(), + self.validity.clone(), + ) + } + } } impl Array for Utf8Array { diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 31834f21657..13b76209a4c 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -2,7 +2,7 @@ use std::hint::unreachable_unchecked; use std::iter::FromIterator; use std::sync::Arc; -use crate::bitmap::utils::{merge_reversed, set_bit_unchecked}; +use crate::bitmap::utils::{get_bit_unchecked, merge_reversed, set_bit_unchecked}; use crate::error::Error; use crate::trusted_len::TrustedLen; @@ -115,7 +115,7 @@ impl MutableBitmap { if self.length % 8 == 0 { self.buffer.push(0); } - let byte = self.buffer.as_mut_slice().last_mut().unwrap(); + let byte = unsafe { self.buffer.as_mut_slice().last_mut().unwrap_unchecked() }; *byte = set(*byte, self.length % 8, value); self.length += 1; } @@ -129,7 +129,7 @@ impl MutableBitmap { } self.length -= 1; - let value = self.get(self.length); + let value = unsafe { self.get_unchecked(self.length) }; if self.length % 8 == 0 { self.buffer.pop(); } @@ -144,6 +144,15 @@ impl MutableBitmap { get_bit(&self.buffer, index) } + 
/// Returns whether the position `index` is set. + /// + /// # Safety + /// The caller must ensure `index < self.len()`. + #[inline] + pub unsafe fn get_unchecked(&self, index: usize) -> bool { + get_bit_unchecked(&self.buffer, index) + } + /// Sets the position `index` to `value` /// # Panics /// Panics iff `index >= self.len()`. @@ -325,6 +334,10 @@ impl MutableBitmap { pub(crate) fn bitchunks_exact_mut(&mut self) -> BitChunksExactMut { BitChunksExactMut::new(&mut self.buffer, self.length) } + + pub fn freeze(self) -> Bitmap { + self.into() + } } impl From for Bitmap { diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 0da4a41ace4..2d2a38e3ce8 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -1,9 +1,10 @@ use std::{iter::FromIterator, ops::Deref, sync::Arc, usize}; use either::Either; +use num_traits::Zero; -use super::Bytes; -use super::IntoIter; +use super::{Bytes, IntoIter}; +use crate::array::ArrayAccessor; /// [`Buffer`] is a contiguous memory region that can be shared across /// thread boundaries. @@ -36,17 +37,19 @@ use super::IntoIter; /// ``` #[derive(Clone)] pub struct Buffer { - /// the internal byte buffer. - data: Arc>, + /// The internal byte buffer. + storage: Arc>, - /// The offset into the buffer. - offset: usize, + /// A pointer into the buffer where our data starts. + ptr: *const T, - // the length of the buffer. Given a region `data` of N bytes, [offset..offset+length] is visible - // to this buffer. + // The length of the buffer. 
length: usize, } +unsafe impl Sync for Buffer {} +unsafe impl Send for Buffer {} + impl PartialEq for Buffer { #[inline] fn eq(&self, other: &Self) -> bool { @@ -76,10 +79,11 @@ impl Buffer { /// Auxiliary method to create a new Buffer pub(crate) fn from_bytes(bytes: Bytes) -> Self { + let ptr = bytes.as_ptr(); let length = bytes.len(); Buffer { - data: Arc::new(bytes), - offset: 0, + storage: Arc::new(bytes), + ptr, length, } } @@ -93,14 +97,14 @@ impl Buffer { /// Returns whether the buffer is empty. #[inline] pub fn is_empty(&self) -> bool { - self.len() == 0 + self.length == 0 } /// Returns whether underlying data is sliced. /// If sliced the [`Buffer`] is backed by /// more data than the length of `Self`. pub fn is_sliced(&self) -> bool { - self.data.len() != self.length + self.storage.len() != self.length } /// Returns the byte slice stored in this buffer @@ -108,11 +112,8 @@ impl Buffer { pub fn as_slice(&self) -> &[T] { // Safety: // invariant of this struct `offset + length <= data.len()` - debug_assert!(self.offset + self.length <= self.data.len()); - unsafe { - self.data - .get_unchecked(self.offset..self.offset + self.length) - } + debug_assert!(self.offset() + self.length <= self.storage.len()); + unsafe { std::slice::from_raw_parts(self.ptr, self.length) } } /// Returns the byte slice stored in this buffer @@ -123,7 +124,7 @@ impl Buffer { // Safety: // invariant of this function debug_assert!(index < self.length); - unsafe { self.data.get_unchecked(self.offset + index) } + unsafe { &*self.ptr.add(index) } } /// Returns a new [`Buffer`] that is a slice of this buffer starting at `offset`. @@ -169,20 +170,24 @@ impl Buffer { /// The caller must ensure `offset + length <= self.len()` #[inline] pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { - self.offset += offset; + self.ptr = self.ptr.add(offset); self.length = length; } - /// Returns a pointer to the start of this buffer. 
+ /// Returns a pointer to the start of the storage underlying this buffer. #[inline] - pub(crate) fn as_ptr(&self) -> *const T { - self.data.deref().as_ptr() + pub(crate) fn storage_ptr(&self) -> *const T { + self.storage.as_ptr() } - /// Returns the offset of this buffer. + /// Returns the start offset of this buffer within the underlying storage. #[inline] pub fn offset(&self) -> usize { - self.offset + unsafe { + let ret = self.ptr.offset_from(self.storage.as_ptr()) as usize; + debug_assert!(ret <= self.storage.len()); + ret + } } /// # Safety @@ -196,10 +201,14 @@ impl Buffer { /// /// This operation returns [`Either::Right`] iff this [`Buffer`]: /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) - /// * has not been imported from the c data interface (FFI) + /// * has not been imported from the C data interface (FFI) #[inline] pub fn into_mut(mut self) -> Either> { - match Arc::get_mut(&mut self.data) + // We lose information if the data is sliced. + if self.is_sliced() { + return Either::Left(self); + } + match Arc::get_mut(&mut self.storage) .and_then(|b| b.get_vec()) .map(std::mem::take) { @@ -208,65 +217,42 @@ impl Buffer { } } - /// Returns a mutable reference to its underlying `Vec`, if possible. - /// Note that only `[self.offset(), self.offset() + self.len()[` in this vector is visible - /// by this buffer. - /// - /// This operation returns [`Some`] iff this [`Buffer`]: - /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) - /// * has not been imported from the c data interface (FFI) - /// # Safety - /// The caller must ensure that the vector in the mutable reference keeps a length of at least `self.offset() + self.len() - 1`. - #[inline] - pub unsafe fn get_mut(&mut self) -> Option<&mut Vec> { - Arc::get_mut(&mut self.data).and_then(|b| b.get_vec()) - } - /// Returns a mutable reference to its slice, if possible. /// /// This operation returns [`Some`] iff this [`Buffer`]: /// * has not been cloned (i.e. 
[`Arc`]`::get_mut` yields [`Some`]) - /// * has not been imported from the c data interface (FFI) + /// * has not been imported from the C data interface (FFI) #[inline] pub fn get_mut_slice(&mut self) -> Option<&mut [T]> { - Arc::get_mut(&mut self.data) - .and_then(|b| b.get_vec()) - // Safety: the invariant of this struct - .map(|x| unsafe { x.get_unchecked_mut(self.offset..self.offset + self.length) }) + let offset = self.offset(); + let unique = Arc::get_mut(&mut self.storage)?; + let vec = unique.get_vec()?; + Some(unsafe { vec.get_unchecked_mut(offset..offset + self.length) }) } /// Get the strong count of underlying `Arc` data buffer. pub fn shared_count_strong(&self) -> usize { - Arc::strong_count(&self.data) + Arc::strong_count(&self.storage) } /// Get the weak count of underlying `Arc` data buffer. pub fn shared_count_weak(&self) -> usize { - Arc::weak_count(&self.data) + Arc::weak_count(&self.storage) } +} - /// Returns its internal representation - #[must_use] - pub fn into_inner(self) -> (Arc>, usize, usize) { - let Self { - data, - offset, - length, - } = self; - (data, offset, length) +impl Buffer { + pub fn make_mut(self) -> Vec { + match self.into_mut() { + Either::Right(v) => v, + Either::Left(same) => same.as_slice().to_vec(), + } } +} - /// Creates a `[Bitmap]` from its internal representation. - /// This is the inverted from `[Bitmap::into_inner]` - /// - /// # Safety - /// Callers must ensure all invariants of this struct are upheld. 
- pub unsafe fn from_inner_unchecked(data: Arc>, offset: usize, length: usize) -> Self { - Self { - data, - offset, - length, - } +impl Buffer { + pub fn zeroed(len: usize) -> Self { + vec![T::zero(); len].into() } } @@ -274,10 +260,12 @@ impl From> for Buffer { #[inline] fn from(p: Vec) -> Self { let bytes: Bytes = p.into(); + let ptr = bytes.as_ptr(); + let length = bytes.len(); Self { - offset: 0, - length: bytes.len(), - data: Arc::new(bytes), + storage: Arc::new(bytes), + ptr, + length, } } } @@ -318,9 +306,22 @@ impl From for Buffer { #[cfg(feature = "arrow")] impl From> for arrow_buffer::Buffer { fn from(value: Buffer) -> Self { - crate::buffer::to_buffer(value.data).slice_with_length( - value.offset * std::mem::size_of::(), + let offset = value.offset(); + crate::buffer::to_buffer(value.storage).slice_with_length( + offset * std::mem::size_of::(), value.length * std::mem::size_of::(), ) } } + +unsafe impl<'a, T: 'a> ArrayAccessor<'a> for Buffer { + type Item = &'a T; + + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + unsafe { &*self.ptr.add(index) } + } + + fn len(&self) -> usize { + Buffer::len(self) + } +} diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index 46c0a4d64a3..0ec6ff22a8d 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -7,6 +7,9 @@ use crate::ffi::InternalArrowArray; use std::ops::Deref; pub(crate) enum BytesAllocator { + // Dead code lint is a false positive. + // remove once fixed in rustc + #[allow(dead_code)] InternalArrowArray(InternalArrowArray), #[cfg(feature = "arrow")] diff --git a/src/compute/aggregate/memory.rs b/src/compute/aggregate/memory.rs index 7e3218a828a..819cc03e0b7 100644 --- a/src/compute/aggregate/memory.rs +++ b/src/compute/aggregate/memory.rs @@ -22,6 +22,12 @@ macro_rules! 
dyn_binary { }}; } +fn binview_size(array: &BinaryViewArrayGeneric) -> usize { + array.views().len() * std::mem::size_of::() + + array.data_buffers().iter().map(|b| b.len()).sum::() + + validity_size(array.validity()) +} + /// Returns the total (heap) allocated size of the array in bytes. /// # Implementation /// This estimation is the sum of the size of its buffers, validity, including nested arrays. @@ -109,6 +115,8 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { .unwrap(); estimated_bytes_size(array.keys()) + estimated_bytes_size(array.values().as_ref()) }), + Utf8View => binview_size::(array.as_any().downcast_ref().unwrap()), + BinaryView => binview_size::<[u8]>(array.as_any().downcast_ref().unwrap()), Map => { let array = array.as_any().downcast_ref::().unwrap(); let offsets = array.offsets().len_proxy() * std::mem::size_of::(); diff --git a/src/compute/arithmetics/basic/mod.rs b/src/compute/arithmetics/basic/mod.rs index 22ed09baf6e..e78531fba7a 100644 --- a/src/compute/arithmetics/basic/mod.rs +++ b/src/compute/arithmetics/basic/mod.rs @@ -39,6 +39,7 @@ impl NativeArithmetics for i8 {} impl NativeArithmetics for i16 {} impl NativeArithmetics for i32 {} impl NativeArithmetics for i64 {} +impl NativeArithmetics for i128 {} impl NativeArithmetics for f32 {} impl NativeArithmetics for f64 {} diff --git a/src/compute/arithmetics/decimal/add.rs b/src/compute/arithmetics/decimal/add.rs index 9f6f529e887..93a850b89eb 100644 --- a/src/compute/arithmetics/decimal/add.rs +++ b/src/compute/arithmetics/decimal/add.rs @@ -2,7 +2,6 @@ use crate::{ array::PrimitiveArray, compute::{ - arithmetics::{ArrayAdd, ArrayCheckedAdd, ArraySaturatingAdd}, arity::{binary, binary_checked}, utils::{check_same_len, combine_validities}, }, @@ -134,27 +133,6 @@ pub fn checked_add(lhs: &PrimitiveArray, rhs: &PrimitiveArray) -> Pr binary_checked(lhs, rhs, lhs.data_type().clone(), op) } -// Implementation of ArrayAdd trait for PrimitiveArrays -impl ArrayAdd> for PrimitiveArray { - 
fn add(&self, rhs: &PrimitiveArray) -> Self { - add(self, rhs) - } -} - -// Implementation of ArrayCheckedAdd trait for PrimitiveArrays -impl ArrayCheckedAdd> for PrimitiveArray { - fn checked_add(&self, rhs: &PrimitiveArray) -> Self { - checked_add(self, rhs) - } -} - -// Implementation of ArraySaturatingAdd trait for PrimitiveArrays -impl ArraySaturatingAdd> for PrimitiveArray { - fn saturating_add(&self, rhs: &PrimitiveArray) -> Self { - saturating_add(self, rhs) - } -} - /// Adaptive addition of two decimal primitive arrays with different precision /// and scale. If the precision and scale is different, then the smallest scale /// and precision is adjusted to the largest precision and scale. If during the diff --git a/src/compute/arithmetics/decimal/div.rs b/src/compute/arithmetics/decimal/div.rs index 159c27de2b1..39c691d28e0 100644 --- a/src/compute/arithmetics/decimal/div.rs +++ b/src/compute/arithmetics/decimal/div.rs @@ -4,7 +4,6 @@ use crate::{ array::PrimitiveArray, compute::{ - arithmetics::{ArrayCheckedDiv, ArrayDiv}, arity::{binary, binary_checked, unary}, utils::{check_same_len, combine_validities}, }, @@ -199,20 +198,6 @@ pub fn checked_div(lhs: &PrimitiveArray, rhs: &PrimitiveArray) -> Pr binary_checked(lhs, rhs, lhs.data_type().clone(), op) } -// Implementation of ArrayDiv trait for PrimitiveArrays -impl ArrayDiv> for PrimitiveArray { - fn div(&self, rhs: &PrimitiveArray) -> Self { - div(self, rhs) - } -} - -// Implementation of ArrayCheckedDiv trait for PrimitiveArrays -impl ArrayCheckedDiv> for PrimitiveArray { - fn checked_div(&self, rhs: &PrimitiveArray) -> Self { - checked_div(self, rhs) - } -} - /// Adaptive division of two decimal primitive arrays with different precision /// and scale. If the precision and scale is different, then the smallest scale /// and precision is adjusted to the largest precision and scale. 
If during the diff --git a/src/compute/arithmetics/decimal/mul.rs b/src/compute/arithmetics/decimal/mul.rs index ac702d2cb3c..3301eac85b0 100644 --- a/src/compute/arithmetics/decimal/mul.rs +++ b/src/compute/arithmetics/decimal/mul.rs @@ -4,7 +4,6 @@ use crate::{ array::PrimitiveArray, compute::{ - arithmetics::{ArrayCheckedMul, ArrayMul, ArraySaturatingMul}, arity::{binary, binary_checked, unary}, utils::{check_same_len, combine_validities}, }, @@ -204,27 +203,6 @@ pub fn checked_mul(lhs: &PrimitiveArray, rhs: &PrimitiveArray) -> Pr binary_checked(lhs, rhs, lhs.data_type().clone(), op) } -// Implementation of ArrayMul trait for PrimitiveArrays -impl ArrayMul> for PrimitiveArray { - fn mul(&self, rhs: &PrimitiveArray) -> Self { - mul(self, rhs) - } -} - -// Implementation of ArrayCheckedMul trait for PrimitiveArrays -impl ArrayCheckedMul> for PrimitiveArray { - fn checked_mul(&self, rhs: &PrimitiveArray) -> Self { - checked_mul(self, rhs) - } -} - -// Implementation of ArraySaturatingMul trait for PrimitiveArrays -impl ArraySaturatingMul> for PrimitiveArray { - fn saturating_mul(&self, rhs: &PrimitiveArray) -> Self { - saturating_mul(self, rhs) - } -} - /// Adaptive multiplication of two decimal primitive arrays with different /// precision and scale. 
If the precision and scale is different, then the /// smallest scale and precision is adjusted to the largest precision and diff --git a/src/compute/arithmetics/decimal/sub.rs b/src/compute/arithmetics/decimal/sub.rs index 84afd205433..6759708fd70 100644 --- a/src/compute/arithmetics/decimal/sub.rs +++ b/src/compute/arithmetics/decimal/sub.rs @@ -3,7 +3,6 @@ use crate::{ array::PrimitiveArray, compute::{ - arithmetics::{ArrayCheckedSub, ArraySaturatingSub, ArraySub}, arity::{binary, binary_checked}, utils::{check_same_len, combine_validities}, }, @@ -97,26 +96,6 @@ pub fn saturating_sub( binary(lhs, rhs, lhs.data_type().clone(), op) } -// Implementation of ArraySub trait for PrimitiveArrays -impl ArraySub> for PrimitiveArray { - fn sub(&self, rhs: &PrimitiveArray) -> Self { - sub(self, rhs) - } -} - -// Implementation of ArrayCheckedSub trait for PrimitiveArrays -impl ArrayCheckedSub> for PrimitiveArray { - fn checked_sub(&self, rhs: &PrimitiveArray) -> Self { - checked_sub(self, rhs) - } -} - -// Implementation of ArraySaturatingSub trait for PrimitiveArrays -impl ArraySaturatingSub> for PrimitiveArray { - fn saturating_sub(&self, rhs: &PrimitiveArray) -> Self { - saturating_sub(self, rhs) - } -} /// Checked subtract of two decimal primitive arrays with the same precision /// and scale. If the precision and scale is different, then an /// InvalidArgumentError is returned. If the result from the sub is larger than diff --git a/src/compute/arithmetics/mod.rs b/src/compute/arithmetics/mod.rs index b1ec2a12bcc..33373564621 100644 --- a/src/compute/arithmetics/mod.rs +++ b/src/compute/arithmetics/mod.rs @@ -416,6 +416,7 @@ macro_rules! with_match_negatable {( UInt8 | UInt16 | UInt32 | UInt64 | Float16 => todo!(), Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! 
{ f64 }, + UInt128 => todo!(), } })} diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index 82f827e3f61..976fd86aeec 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -4,6 +4,47 @@ use crate::{array::*, datatypes::DataType, types::NativeType}; use super::CastOptions; +pub(super) trait Parse { + fn parse(val: &[u8]) -> Option + where + Self: Sized; +} + +macro_rules! impl_parse { + ($primitive_type:ident) => { + impl Parse for $primitive_type { + fn parse(val: &[u8]) -> Option { + atoi_simd::parse(val).ok() + } + } + }; +} +impl_parse!(i8); +impl_parse!(i16); +impl_parse!(i32); +impl_parse!(i64); +impl_parse!(u8); +impl_parse!(u16); +impl_parse!(u32); +impl_parse!(u64); + +impl Parse for f32 { + fn parse(val: &[u8]) -> Option + where + Self: Sized, + { + fast_float::parse(val).ok() + } +} +impl Parse for f64 { + fn parse(val: &[u8]) -> Option + where + Self: Sized, + { + fast_float::parse(val).ok() + } +} + /// Conversion of binary pub fn binary_to_large_binary(from: &BinaryArray, to_data_type: DataType) -> BinaryArray { let values = from.values().clone(); @@ -72,13 +113,11 @@ where } /// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. 
-pub fn binary_to_primitive(from: &BinaryArray, to: &DataType) -> PrimitiveArray +pub(super) fn binary_to_primitive(from: &BinaryArray, to: &DataType) -> PrimitiveArray where - T: NativeType + lexical_core::FromLexical, + T: NativeType + Parse, { - let iter = from - .iter() - .map(|x| x.and_then::(|x| lexical_core::parse(x).ok())); + let iter = from.iter().map(|x| x.and_then::(|x| T::parse(x))); PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } @@ -89,7 +128,7 @@ pub(super) fn binary_to_primitive_dyn( options: CastOptions, ) -> Result> where - T: NativeType + lexical_core::FromLexical, + T: NativeType + lexical_core::FromLexical + Parse, { let from = from.as_any().downcast_ref().unwrap(); if options.partial { @@ -145,6 +184,11 @@ pub fn fixed_size_binary_binary( ) } +pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray { + let mutable = MutableBinaryViewArray::from_values_iter(from.values_iter()); + mutable.freeze().with_validity(from.validity().cloned()) +} + /// Conversion of binary pub fn binary_to_list(from: &BinaryArray, to_data_type: DataType) -> ListArray { let values = from.values().clone(); diff --git a/src/compute/cast/binview_to.rs b/src/compute/cast/binview_to.rs new file mode 100644 index 00000000000..c0c50f1287f --- /dev/null +++ b/src/compute/cast/binview_to.rs @@ -0,0 +1,97 @@ +use chrono::Datelike; + +use crate::error::{Result}; +use crate::array::*; +use crate::compute::cast::binary_to::Parse; +use crate::compute::cast::CastOptions; +use crate::datatypes::{DataType, TimeUnit}; +use crate::offset::Offset; +use crate::temporal_conversions::EPOCH_DAYS_FROM_CE; +use crate::types::NativeType; + +pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; + +pub(super) fn view_to_binary(array: &BinaryViewArray) -> BinaryArray { + let len: usize = Array::len(array); + let mut mutable = MutableBinaryValuesArray::::with_capacities(len, array.total_bytes_len()); + for slice in array.values_iter() { + 
mutable.push(slice) + } + let out: BinaryArray = mutable.into(); + out.with_validity(array.validity().cloned()) +} + +pub fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array { + let array = array.to_binview(); + let out = view_to_binary::(&array); + + let dtype = Utf8Array::::default_data_type(); + unsafe { + Utf8Array::new_unchecked( + dtype, + out.offsets().clone(), + out.values().clone(), + out.validity().cloned(), + ) + } +} +/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. +pub(super) fn binview_to_primitive( + from: &BinaryViewArray, + to: &DataType, +) -> PrimitiveArray +where + T: NativeType + Parse, +{ + let iter = from.iter().map(|x| x.and_then::(|x| T::parse(x))); + + PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) +} + +pub(super) fn binview_to_primitive_dyn( + from: &dyn Array, + to: &DataType, + options: CastOptions, +) -> Result> +where + T: NativeType + Parse, +{ + let from = from.as_any().downcast_ref().unwrap(); + if options.partial { + unimplemented!() + } else { + Ok(Box::new(binview_to_primitive::(from, to))) + } +} + +pub(super) fn utf8view_to_naive_timestamp_dyn( + from: &dyn Array, + time_unit: TimeUnit, +) -> Result> { + let from = from.as_any().downcast_ref().unwrap(); + Ok(Box::new(utf8view_to_naive_timestamp(from, time_unit))) +} + +/// [`crate::temporal_conversions::utf8view_to_timestamp`] applied for RFC3339 formatting +pub fn utf8view_to_naive_timestamp( + from: &Utf8ViewArray, + time_unit: TimeUnit, +) -> PrimitiveArray { + crate::temporal_conversions::utf8view_to_naive_timestamp(from, RFC3339, time_unit) +} + +pub(super) fn utf8view_to_date32(from: &Utf8ViewArray) -> PrimitiveArray { + let iter = from.iter().map(|x| { + x.and_then(|x| { + x.parse::() + .ok() + .map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + }) + }); + PrimitiveArray::::from_trusted_len_iter(iter).to(DataType::Date32) +} + +pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> Result> { + let from = 
from.as_any().downcast_ref().unwrap(); + Ok(Box::new(utf8view_to_date32(from))) +} diff --git a/src/compute/cast/boolean_to.rs b/src/compute/cast/boolean_to.rs index 1ce45c87118..0f81ab1b40e 100644 --- a/src/compute/cast/boolean_to.rs +++ b/src/compute/cast/boolean_to.rs @@ -1,9 +1,9 @@ use crate::{ - array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, + array::{Array, BinaryViewArray, BooleanArray, PrimitiveArray, Utf8ViewArray}, error::Result, - offset::Offset, types::NativeType, }; +use crate::array::MutableBinaryViewArray; pub(super) fn boolean_to_primitive_dyn(array: &dyn Array) -> Result> where @@ -27,24 +27,26 @@ where PrimitiveArray::::new(T::PRIMITIVE.into(), values.into(), from.validity().cloned()) } -/// Casts the [`BooleanArray`] to a [`Utf8Array`], casting trues to `"1"` and falses to `"0"` -pub fn boolean_to_utf8(from: &BooleanArray) -> Utf8Array { - let iter = from.values().iter().map(|x| if x { "1" } else { "0" }); - Utf8Array::from_trusted_len_values_iter(iter) +pub fn boolean_to_utf8view(from: &BooleanArray) -> Utf8ViewArray { + unsafe { boolean_to_binaryview(from).to_utf8view_unchecked() } } -pub(super) fn boolean_to_utf8_dyn(array: &dyn Array) -> Result> { +pub(super) fn boolean_to_utf8view_dyn(array: &dyn Array) -> Result> { let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(boolean_to_utf8::(array))) + Ok(boolean_to_utf8view(array).boxed()) } /// Casts the [`BooleanArray`] to a [`BinaryArray`], casting trues to `"1"` and falses to `"0"` -pub fn boolean_to_binary(from: &BooleanArray) -> BinaryArray { - let iter = from.values().iter().map(|x| if x { b"1" } else { b"0" }); - BinaryArray::from_trusted_len_values_iter(iter) +pub fn boolean_to_binaryview(from: &BooleanArray) -> BinaryViewArray { + let iter = from.iter().map(|opt_b| match opt_b { + Some(true) => Some("true".as_bytes()), + Some(false) => Some("false".as_bytes()), + None => None, + }); + MutableBinaryViewArray::from_iter(iter).into() } -pub(super) fn 
boolean_to_binary_dyn(array: &dyn Array) -> Result> { +pub(super) fn boolean_to_binaryview_dyn(array: &dyn Array) -> Result> { let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(boolean_to_binary::(array))) + Ok(boolean_to_binaryview(array).boxed()) } diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 688291dd12b..8ac4d7935f8 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -1,6 +1,7 @@ //! Defines different casting operators such as [`cast`] or [`primitive_to_binary`]. mod binary_to; +mod binview_to; mod boolean_to; mod decimal_to; mod dictionary_to; @@ -8,6 +9,10 @@ mod primitive_to; mod utf8_to; pub use binary_to::*; +#[cfg(feature = "dtype-decimal")] +pub use binview_to::binview_to_decimal; +use binview_to::binview_to_primitive_dyn; +pub use binview_to::utf8view_to_utf8; pub use boolean_to::*; pub use decimal_to::*; pub use dictionary_to::*; @@ -20,6 +25,8 @@ use crate::{ error::{Error, Result}, offset::{Offset, Offsets}, }; +use crate::compute::cast::binview_to::{RFC3339, utf8view_to_date32_dyn, utf8view_to_naive_timestamp_dyn, view_to_binary}; +use crate::temporal_conversions::utf8view_to_timestamp; /// options defining how Cast kernels behave #[derive(Clone, Copy, Debug, Default)] @@ -33,6 +40,15 @@ pub struct CastOptions { pub partial: bool, } +impl CastOptions { + pub fn unchecked() -> Self { + Self { + wrapped: true, + partial: false, + } + } +} + impl CastOptions { fn with_wrapped(&self, v: bool) -> Self { let mut option = *self; @@ -143,10 +159,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (LargeBinary, to_type) => { is_numeric(to_type) || match to_type { - Binary | LargeUtf8 => true, - LargeList(field) => matches!(field.data_type, UInt8), - _ => false, - } + Binary | LargeUtf8 => true, + LargeList(field) => matches!(field.data_type, UInt8), + _ => false, + } } (FixedSizeBinary(_), to_type) => matches!(to_type, Binary | LargeBinary), (Timestamp(_, _), Utf8) => true, 
@@ -319,6 +335,26 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } } +fn cast_struct( + array: &StructArray, + to_type: &DataType, + options: CastOptions, +) -> Result { + let values = array.values(); + let fields = StructArray::get_fields(to_type); + let new_values = values + .iter() + .zip(fields) + .map(|(arr, field)| cast(arr.as_ref(), field.data_type(), options)) + .collect::>>()?; + + Ok(StructArray::new( + to_type.clone(), + new_values, + array.validity().cloned(), + )) +} + fn cast_list( array: &ListArray, to_type: &DataType, @@ -417,6 +453,14 @@ fn cast_list_to_fixed_size_list( } } +pub fn cast_default(array: &dyn Array, to_type: &DataType) -> Result> { + cast(array, to_type, Default::default()) +} + +pub fn cast_unchecked(array: &dyn Array, to_type: &DataType) -> Result> { + cast(array, to_type, CastOptions::unchecked()) +} + /// Cast `array` to the provided data type and return a new [`Array`] with /// type `to_type`, if possible. /// @@ -430,13 +474,14 @@ fn cast_list_to_fixed_size_list( /// * Fixed Size List to List: the underlying data type is cast /// * List to Fixed Size List: the offsets are checked for valid order, then the /// underlying type is cast. +/// * Struct to Struct: the underlying fields are cast. 
/// * PrimitiveArray to List: a list array with 1 value per slot is created /// * Date32 and Date64: precision lost when going to higher interval /// * Time32 and Time64: precision lost when going to higher interval /// * Timestamp and Date{32|64}: precision lost when going to higher interval /// * Temporal to/from backing primitive: zero-copy with data type change /// Unsupported Casts -/// * To or from `StructArray` +/// * non-`StructArray` to `StructArray` or `StructArray` to non-`StructArray` /// * List to primitive /// * Utf8 to boolean /// * Interval and duration @@ -452,12 +497,15 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu let as_options = options.with_wrapped(true); match (from_type, to_type) { (Null, _) | (_, Null) => Ok(new_null_array(to_type.clone(), array.len())), - (Struct(_), _) => Err(Error::NotYetImplemented( + (Struct(from_fd), Struct(to_fd)) => { + if from_fd.len() != to_fd.len() { + return Err(Error::InvalidArgumentError("incompatible offsets in source list".to_string())); + } + cast_struct(array.as_any().downcast_ref().unwrap(), to_type, options).map(|x| x.boxed()) + }, + (Struct(_), _) | (_, Struct(_)) => Err(Error::NotYetImplemented( "Cannot cast from struct to other types".to_string(), )), - (_, Struct(_)) => Err(Error::NotYetImplemented( - "Cannot cast to struct from other types".to_string(), - )), (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( array.as_any().downcast_ref().unwrap(), inner.as_ref(), @@ -487,7 +535,36 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu (List(_), List(_)) => { cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) .map(|x| x.boxed()) - } + }, + (BinaryView, _) => match to_type { + Utf8View => array + .as_any() + .downcast_ref::() + .unwrap() + .to_utf8view() + .map(|arr| arr.boxed()), + LargeBinary => Ok(binview_to::view_to_binary::( + array.as_any().downcast_ref().unwrap(), + ) + .boxed()), + 
UInt8 => binview_to_primitive_dyn::(array, to_type, options), + UInt16 => binview_to_primitive_dyn::(array, to_type, options), + UInt32 => binview_to_primitive_dyn::(array, to_type, options), + UInt64 => binview_to_primitive_dyn::(array, to_type, options), + Int8 => binview_to_primitive_dyn::(array, to_type, options), + Int16 => binview_to_primitive_dyn::(array, to_type, options), + Int32 => binview_to_primitive_dyn::(array, to_type, options), + Int64 => binview_to_primitive_dyn::(array, to_type, options), + Float32 => binview_to_primitive_dyn::(array, to_type, options), + Float64 => binview_to_primitive_dyn::(array, to_type, options), + LargeList(inner) if matches!(inner.data_type, DataType::UInt8) => { + let bin_array = view_to_binary::(array.as_any().downcast_ref().unwrap()); + Ok(binary_to_list(&bin_array, to_type.clone()).boxed()) + }, + _ => Err(Error::NotYetImplemented(format!( + "Unsupported casting from {from_type:?} to {to_type:?}" + ))), + }, (LargeList(_), LargeList(_)) => { cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) .map(|x| x.boxed()) @@ -525,6 +602,40 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu Ok(Box::new(list_array)) } + (Utf8View, _) => { + let arr = array.as_any().downcast_ref::().unwrap(); + + match to_type { + BinaryView => Ok(arr.to_binview().boxed()), + LargeUtf8 => Ok(binview_to::utf8view_to_utf8::(arr).boxed()), + UInt8 + | UInt16 + | UInt32 + | UInt64 + | Int8 + | Int16 + | Int32 + | Int64 + | Float32 + | Float64 + | Decimal(_, _) => cast(&arr.to_binview(), to_type, options), + Timestamp(time_unit, None) => { + utf8view_to_naive_timestamp_dyn(array, time_unit.to_owned()) + }, + Timestamp(time_unit, Some(time_zone)) => utf8view_to_timestamp( + array.as_any().downcast_ref().unwrap(), + RFC3339, + time_zone.clone(), + time_unit.to_owned(), + ) + .map(|arr| arr.boxed()), + Date32 => utf8view_to_date32_dyn(array), + _ => Err(Error::NotYetImplemented(format!( + "Unsupported 
casting from {from_type:?} to {to_type:?}" + ))), + } + }, + (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { dictionary_cast_dyn::<$T>(array, to_type, options) }), @@ -557,26 +668,32 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu Int64 => boolean_to_primitive_dyn::(array), Float32 => boolean_to_primitive_dyn::(array), Float64 => boolean_to_primitive_dyn::(array), - Utf8 => boolean_to_utf8_dyn::(array), - LargeUtf8 => boolean_to_utf8_dyn::(array), - Binary => boolean_to_binary_dyn::(array), - LargeBinary => boolean_to_binary_dyn::(array), - _ => Err(Error::NotYetImplemented(format!( + Utf8View => boolean_to_utf8view_dyn(array), + BinaryView => boolean_to_binaryview_dyn(array), + _ => Err(Error::InvalidArgumentError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - + (_, BinaryView) => from_to_binview(array, from_type, to_type).map(|arr| arr.boxed()), + (_, Utf8View) => match from_type { + LargeUtf8 => Ok(utf8_to_utf8view( + array.as_any().downcast_ref::>().unwrap(), + ) + .boxed()), + Utf8 => Ok( + utf8_to_utf8view(array.as_any().downcast_ref::>().unwrap()).boxed(), + ), + _ => from_to_binview(array, from_type, to_type) + .map(|arr| unsafe { arr.to_utf8view_unchecked() }.boxed()), + }, (Utf8, _) => match to_type { - UInt8 => utf8_to_primitive_dyn::(array, to_type, options), - UInt16 => utf8_to_primitive_dyn::(array, to_type, options), - UInt32 => utf8_to_primitive_dyn::(array, to_type, options), - UInt64 => utf8_to_primitive_dyn::(array, to_type, options), - Int8 => utf8_to_primitive_dyn::(array, to_type, options), - Int16 => utf8_to_primitive_dyn::(array, to_type, options), - Int32 => utf8_to_primitive_dyn::(array, to_type, options), - Int64 => utf8_to_primitive_dyn::(array, to_type, options), - Float32 => utf8_to_primitive_dyn::(array, to_type, options), - Float64 => utf8_to_primitive_dyn::(array, to_type, options), + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | 
Int64 | Float32 | Float64 => { + let binary = utf8_to_binary::( + array.as_any().downcast_ref().unwrap(), + Binary, + ); + cast(&binary, to_type, options) + }, Date32 => utf8_to_date32_dyn::(array), Date64 => utf8_to_date64_dyn::(array), LargeUtf8 => Ok(Box::new(utf8_to_large_utf8( @@ -587,25 +704,22 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu to_type.clone(), ) .boxed()), - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_dyn::(array, TimeUnit::Nanosecond), Timestamp(TimeUnit::Nanosecond, Some(tz)) => { - utf8_to_timestamp_ns_dyn::(array, tz.clone()) + utf8_to_timestamp_dyn::(array, tz.clone()) } _ => Err(Error::NotYetImplemented(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (LargeUtf8, _) => match to_type { - UInt8 => utf8_to_primitive_dyn::(array, to_type, options), - UInt16 => utf8_to_primitive_dyn::(array, to_type, options), - UInt32 => utf8_to_primitive_dyn::(array, to_type, options), - UInt64 => utf8_to_primitive_dyn::(array, to_type, options), - Int8 => utf8_to_primitive_dyn::(array, to_type, options), - Int16 => utf8_to_primitive_dyn::(array, to_type, options), - Int32 => utf8_to_primitive_dyn::(array, to_type, options), - Int64 => utf8_to_primitive_dyn::(array, to_type, options), - Float32 => utf8_to_primitive_dyn::(array, to_type, options), - Float64 => utf8_to_primitive_dyn::(array, to_type, options), + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => { + let binary = utf8_to_binary::( + array.as_any().downcast_ref().unwrap(), + DataType::LargeBinary, + ); + cast(&binary, to_type, options) + }, Date32 => utf8_to_date32_dyn::(array), Date64 => utf8_to_date64_dyn::(array), Utf8 => utf8_large_to_utf8(array.as_any().downcast_ref().unwrap()).map(|x| x.boxed()), @@ -614,9 +728,9 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) 
-> Resu to_type.clone(), ) .boxed()), - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_dyn::(array, TimeUnit::Nanosecond), Timestamp(TimeUnit::Nanosecond, Some(tz)) => { - utf8_to_timestamp_ns_dyn::(array, tz.clone()) + utf8_to_timestamp_dyn::(array, tz.clone()) } _ => Err(Error::NotYetImplemented(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -689,16 +803,6 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu }, (Binary, _) => match to_type { - UInt8 => binary_to_primitive_dyn::(array, to_type, options), - UInt16 => binary_to_primitive_dyn::(array, to_type, options), - UInt32 => binary_to_primitive_dyn::(array, to_type, options), - UInt64 => binary_to_primitive_dyn::(array, to_type, options), - Int8 => binary_to_primitive_dyn::(array, to_type, options), - Int16 => binary_to_primitive_dyn::(array, to_type, options), - Int32 => binary_to_primitive_dyn::(array, to_type, options), - Int64 => binary_to_primitive_dyn::(array, to_type, options), - Float32 => binary_to_primitive_dyn::(array, to_type, options), - Float64 => binary_to_primitive_dyn::(array, to_type, options), LargeBinary => Ok(Box::new(binary_to_large_binary( array.as_any().downcast_ref().unwrap(), to_type.clone(), @@ -1012,3 +1116,30 @@ fn cast_to_dictionary( ))), } } + +fn from_to_binview( + array: &dyn Array, + from_type: &DataType, + to_type: &DataType, +) -> Result { + use DataType::*; + let binview = match from_type { + UInt8 => primitive_to_binview_dyn::(array), + UInt16 => primitive_to_binview_dyn::(array), + UInt32 => primitive_to_binview_dyn::(array), + UInt64 => primitive_to_binview_dyn::(array), + Int8 => primitive_to_binview_dyn::(array), + Int16 => primitive_to_binview_dyn::(array), + Int32 => primitive_to_binview_dyn::(array), + Int64 => primitive_to_binview_dyn::(array), + Float32 => primitive_to_binview_dyn::(array), + Float64 => 
primitive_to_binview_dyn::(array), + Binary => binary_to_binview::(array.as_any().downcast_ref().unwrap()), + FixedSizeBinary(_) => fixed_size_binary_to_binview(array.as_any().downcast_ref().unwrap()), + LargeBinary => binary_to_binview::(array.as_any().downcast_ref().unwrap()), + _ => return Err(Error::NotYetImplemented(format!( + "Unsupported casting from {from_type:?} to {to_type:?}" + ))), + }; + Ok(binview) +} diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 110288817a7..d26f044363d 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -17,10 +17,64 @@ use crate::{ use super::CastOptions; -/// Returns a [`BinaryArray`] where every element is the binary representation of the number. -pub fn primitive_to_binary( +pub(super) trait SerPrimitive { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized; +} + +macro_rules! impl_ser_primitive { + ($ptype:ident) => { + impl SerPrimitive for $ptype { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = itoa::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } + } + }; +} + +impl_ser_primitive!(i8); +impl_ser_primitive!(i16); +impl_ser_primitive!(i32); +impl_ser_primitive!(i64); +impl_ser_primitive!(u8); +impl_ser_primitive!(u16); +impl_ser_primitive!(u32); +impl_ser_primitive!(u64); + +impl SerPrimitive for f32 { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = ryu::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } +} + +impl SerPrimitive for f64 { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = ryu::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } +} + +fn primitive_to_values_and_offsets( from: &PrimitiveArray, -) -> BinaryArray { +) -> (Vec, 
Offsets) { let mut values: Vec = Vec::with_capacity(from.len()); let mut offsets: Vec = Vec::with_capacity(from.len() + 1); offsets.push(O::default()); @@ -28,35 +82,38 @@ pub fn primitive_to_binary( let mut offset: usize = 0; unsafe { - for x in from.values().iter() { - values.reserve(offset + T::FORMATTED_SIZE_DECIMAL); - - let bytes = std::slice::from_raw_parts_mut( - values.as_mut_ptr().add(offset), - values.capacity() - offset, - ); - let len = lexical_core::write_unchecked(*x, bytes).len(); + for &x in from.values().iter() { + let len = T::write(&mut values, x); offset += len; - offsets.push(O::from_usize(offset).unwrap()); + offsets.push(O::from_as_usize(offset)); } values.set_len(offset); values.shrink_to_fit(); // Safety: offsets _are_ monotonically increasing let offsets = unsafe { Offsets::new_unchecked(offsets) }; - BinaryArray::::new( - BinaryArray::::default_data_type(), - offsets.into(), - values.into(), - from.validity().cloned(), - ) + + (values, offsets) } } +/// Returns a [`BinaryArray`] where every element is the binary representation of the number. +pub(super) fn primitive_to_binary( + from: &PrimitiveArray, +) -> BinaryArray { + let (values, offsets) = primitive_to_values_and_offsets(from); + BinaryArray::::new( + BinaryArray::::default_data_type(), + offsets.into(), + values.into(), + from.validity().cloned(), + ) +} + pub(super) fn primitive_to_binary_dyn(from: &dyn Array) -> Result> where O: Offset, - T: NativeType + lexical_core::ToLexical, + T: NativeType + SerPrimitive, { let from = from.as_any().downcast_ref().unwrap(); Ok(Box::new(primitive_to_binary::(from))) @@ -86,32 +143,11 @@ where } /// Returns a [`Utf8Array`] where every element is the utf8 representation of the number. 
-pub fn primitive_to_utf8( +pub(super) fn primitive_to_utf8( from: &PrimitiveArray, ) -> Utf8Array { - let mut values: Vec = Vec::with_capacity(from.len()); - let mut offsets: Vec = Vec::with_capacity(from.len() + 1); - offsets.push(O::default()); - - let mut offset: usize = 0; - + let (values, offsets) = primitive_to_values_and_offsets(from); unsafe { - for x in from.values().iter() { - values.reserve(offset + T::FORMATTED_SIZE_DECIMAL); - - let bytes = std::slice::from_raw_parts_mut( - values.as_mut_ptr().add(offset), - values.capacity() - offset, - ); - let len = lexical_core::write_unchecked(*x, bytes).len(); - - offset += len; - offsets.push(O::from_usize(offset).unwrap()); - } - values.set_len(offset); - values.shrink_to_fit(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }; Utf8Array::::new_unchecked( Utf8Array::::default_data_type(), offsets.into(), @@ -124,7 +160,7 @@ pub fn primitive_to_utf8( pub(super) fn primitive_to_utf8_dyn(from: &dyn Array) -> Result> where O: Offset, - T: NativeType + lexical_core::ToLexical, + T: NativeType + SerPrimitive, { let from = from.as_any().downcast_ref().unwrap(); Ok(Box::new(primitive_to_utf8::(from))) @@ -587,3 +623,27 @@ pub fn months_to_months_days_ns(from: &PrimitiveArray) -> PrimitiveArray) -> PrimitiveArray { unary(from, |x| x.to_f32(), DataType::Float32) } + +/// Returns a [`Utf8Array`] where every element is the utf8 representation of the number. 
+pub(super) fn primitive_to_binview( + from: &PrimitiveArray, +) -> BinaryViewArray { + let mut mutable = MutableBinaryViewArray::with_capacity(from.len()); + + let mut scratch = vec![]; + for &x in from.values().iter() { + unsafe { scratch.set_len(0) }; + T::write(&mut scratch, x); + mutable.push_value_ignore_validity(&scratch) + } + + mutable.freeze().with_validity(from.validity().cloned()) +} + +pub(super) fn primitive_to_binview_dyn(from: &dyn Array) -> BinaryViewArray +where + T: NativeType + SerPrimitive, +{ + let from = from.as_any().downcast_ref().unwrap(); + primitive_to_binview::(from) +} diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 6ee38588696..40fda5136a9 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -1,3 +1,4 @@ +use std::sync::Arc; use chrono::Datelike; use crate::{ @@ -6,11 +7,12 @@ use crate::{ error::Result, offset::Offset, temporal_conversions::{ - utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_, - utf8_to_timestamp_ns as utf8_to_timestamp_ns_, EPOCH_DAYS_FROM_CE, + utf8_to_naive_timestamp as utf8_to_naive_timestamp_, + utf8_to_timestamp as utf8_to_timestamp_, EPOCH_DAYS_FROM_CE, }, types::NativeType, }; +use crate::datatypes::TimeUnit; use super::CastOptions; @@ -43,6 +45,7 @@ where PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } +#[allow(unused)] pub(super) fn utf8_to_primitive_dyn( from: &dyn Array, to: &DataType, @@ -113,34 +116,35 @@ pub fn utf8_to_dictionary( Ok(array.into()) } -pub(super) fn utf8_to_naive_timestamp_ns_dyn( +pub(super) fn utf8_to_naive_timestamp_dyn( from: &dyn Array, + time_unit: TimeUnit ) -> Result> { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_naive_timestamp_ns::(from))) + Ok(Box::new(utf8_to_naive_timestamp::(from, time_unit))) } /// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting -pub fn utf8_to_naive_timestamp_ns(from: &Utf8Array) -> PrimitiveArray { - 
utf8_to_naive_timestamp_ns_(from, RFC3339) +pub fn utf8_to_naive_timestamp(from: &Utf8Array, time_unit: TimeUnit) -> PrimitiveArray { + utf8_to_naive_timestamp_(from, RFC3339, time_unit) } -pub(super) fn utf8_to_timestamp_ns_dyn( +pub(super) fn utf8_to_timestamp_dyn( from: &dyn Array, timezone: String, ) -> Result> { let from = from.as_any().downcast_ref().unwrap(); - utf8_to_timestamp_ns::(from, timezone) + utf8_to_timestamp::(from, timezone) .map(Box::new) .map(|x| x as Box) } /// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting -pub fn utf8_to_timestamp_ns( +pub fn utf8_to_timestamp( from: &Utf8Array, timezone: String, ) -> Result> { - utf8_to_timestamp_ns_(from, RFC3339, timezone) + utf8_to_timestamp_(from, RFC3339, timezone) } /// Conversion of utf8 @@ -177,3 +181,49 @@ pub fn utf8_to_binary(from: &Utf8Array, to_data_type: DataType) -> ) } } + +pub fn binary_to_binview(arr: &BinaryArray) -> BinaryViewArray { + let buffer_idx = 0_u32; + let base_ptr = arr.values().as_ptr() as usize; + + let mut views = Vec::with_capacity(arr.len()); + let mut uses_buffer = false; + for bytes in arr.values_iter() { + let len: u32 = bytes.len().try_into().unwrap(); + + let mut payload = [0; 16]; + payload[0..4].copy_from_slice(&len.to_le_bytes()); + + if len <= 12 { + payload[4..4 + bytes.len()].copy_from_slice(bytes); + } else { + uses_buffer = true; + unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) }; + let offset = (bytes.as_ptr() as usize - base_ptr) as u32; + payload[0..4].copy_from_slice(&len.to_le_bytes()); + payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); + payload[12..16].copy_from_slice(&offset.to_le_bytes()); + } + + let value = View::from_le_bytes(payload); + unsafe { views.push(value) }; + } + let buffers = if uses_buffer { + Arc::from([arr.values().clone()]) + } else { + Arc::from([]) + }; + unsafe { + BinaryViewArray::new_unchecked_unknown_md( + DataType::BinaryView, + views.into(), + buffers, + 
arr.validity().cloned(), + None, + ) + } +} + +pub fn utf8_to_utf8view(arr: &Utf8Array) -> Utf8ViewArray { + unsafe { binary_to_binview(&arr.to_binary()).to_utf8view_unchecked() } +} diff --git a/src/compute/comparison/mod.rs b/src/compute/comparison/mod.rs index b364ed88222..4031b770fd2 100644 --- a/src/compute/comparison/mod.rs +++ b/src/compute/comparison/mod.rs @@ -86,6 +86,7 @@ macro_rules! match_eq_ord {( Float16 => todo!(), Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! { f64 }, + UInt128 => todo!(), } })} @@ -111,6 +112,7 @@ macro_rules! match_eq {( Float16 => __with_ty__! { f16 }, Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! { f64 }, + UInt128 => todo!(), } })} diff --git a/src/compute/filter.rs b/src/compute/filter.rs index 7ba260e702f..2c3cf7c895e 100644 --- a/src/compute/filter.rs +++ b/src/compute/filter.rs @@ -296,6 +296,22 @@ pub fn filter(array: &dyn Array, filter: &BooleanArray) -> Result let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::<$T>(array, filter))) }), + BinaryView => { + let iter = SlicesIterator::new(filter.values()); + let array = array.as_any().downcast_ref::().unwrap(); + let mut mutable = + growable::GrowableBinaryViewArray::new(vec![array], false, iter.slots()); + unsafe { + // We don't have to correct buffers as there is only one array. + iter.for_each(|(start, len)| mutable.extend_unchecked_no_buffers(0, start, len)); + } + + Ok(mutable.as_box()) + }, + // Should go via BinaryView + Utf8View => { + unreachable!() + }, _ => { let iter = SlicesIterator::new(filter.values()); let mut mutable = make_growable(&[array], false, iter.slots()); diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 626b292ad81..9e35defc431 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -157,7 +157,16 @@ pub enum DataType { /// Decimal backed by 256 bits Decimal256(usize, usize), /// Extension type. 
+ /// - name + /// - physical type + /// - metadata Extension(String, Box, Option), + /// A binary type that inlines small values + /// and can intern bytes. + BinaryView, + /// A string type that inlines small values + /// and can intern strings. + Utf8View, } #[cfg(feature = "arrow")] @@ -213,9 +222,16 @@ impl From for arrow_schema::DataType { Box::new(DataType::from(key).into()), Box::new((*value).into()), ), - DataType::Decimal(precision, scale) => Self::Decimal128(precision as _, scale as _), - DataType::Decimal256(precision, scale) => Self::Decimal256(precision as _, scale as _), + DataType::Decimal(precision, scale) => { + Self::Decimal128(precision as _, scale as _) + }, + DataType::Decimal256(precision, scale) => { + Self::Decimal256(precision as _, scale as _) + }, DataType::Extension(_, d, _) => (*d).into(), + DataType::BinaryView | DataType::Utf8View => { + panic!("view datatypes not supported by arrow-rs") + }, } } } @@ -441,6 +457,8 @@ impl DataType { LargeBinary => PhysicalType::LargeBinary, Utf8 => PhysicalType::Utf8, LargeUtf8 => PhysicalType::LargeUtf8, + BinaryView => PhysicalType::BinaryView, + Utf8View => PhysicalType::Utf8View, List(_) => PhysicalType::List, FixedSizeList(_, _) => PhysicalType::FixedSizeList, LargeList(_) => PhysicalType::LargeList, @@ -462,6 +480,19 @@ impl DataType { _ => self, } } + + pub fn inner_dtype(&self) -> Option<&DataType> { + match self { + DataType::List(inner) => Some(inner.data_type()), + DataType::LargeList(inner) => Some(inner.data_type()), + DataType::FixedSizeList(inner, _) => Some(inner.data_type()), + _ => None, + } + } + + pub fn is_view(&self) -> bool { + matches!(self, DataType::Utf8View | DataType::BinaryView) + } } impl From for DataType { @@ -497,6 +528,7 @@ impl From for DataType { PrimitiveType::Float64 => DataType::Float64, PrimitiveType::DaysMs => DataType::Interval(IntervalUnit::DayTime), PrimitiveType::MonthDayNano => DataType::Interval(IntervalUnit::MonthDayNano), + PrimitiveType::UInt128 => 
unimplemented!(), } } } diff --git a/src/datatypes/physical_type.rs b/src/datatypes/physical_type.rs index 828df9541f0..d33be774e76 100644 --- a/src/datatypes/physical_type.rs +++ b/src/datatypes/physical_type.rs @@ -39,6 +39,12 @@ pub enum PhysicalType { Map, /// A dictionary encoded array by `IntegerType`. Dictionary(IntegerType), + /// A binary type that inlines small values + /// and can intern bytes. + BinaryView, + /// A string type that inlines small values + /// and can intern strings. + Utf8View, } impl PhysicalType { diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 1a25b98510f..5e2e008a24d 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -43,6 +43,8 @@ pub unsafe fn try_from(array: A) -> Result> { } Union => Box::new(UnionArray::try_from_ffi(array)?), Map => Box::new(MapArray::try_from_ffi(array)?), + BinaryView => Box::new(BinaryViewArray::try_from_ffi(array)?), + Utf8View => Box::new(Utf8ViewArray::try_from_ffi(array)?), }) } @@ -88,6 +90,7 @@ struct PrivateData { buffers_ptr: Box<[*const std::os::raw::c_void]>, children_ptr: Box<[*mut ArrowArray]>, dictionary_ptr: Option<*mut ArrowArray>, + variadic_buffer_sizes: Box<[i64]>, } impl ArrowArray { @@ -96,9 +99,36 @@ impl ArrowArray { /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. 
pub(crate) fn new(array: Box) -> Self { - let (offset, buffers, children, dictionary) = + let needs_variadic_buffer_sizes = matches!( + array.data_type(), + DataType::BinaryView | DataType::Utf8View + ); + + let (offset, mut buffers, children, dictionary) = offset_buffers_children_dictionary(array.as_ref()); + let variadic_buffer_sizes = if needs_variadic_buffer_sizes { + #[cfg(feature = "compute_cast")] + { + let arr = crate::compute::cast::cast_unchecked( + array.as_ref(), + &DataType::BinaryView, + ) + .unwrap(); + let arr = arr.as_any().downcast_ref::().unwrap(); + let boxed = arr.variadic_buffer_lengths().into_boxed_slice(); + let ptr = boxed.as_ptr().cast::(); + buffers.push(Some(ptr)); + boxed + } + #[cfg(not(feature = "compute_cast"))] + { + panic!("activate 'compute_cast' feature") + } + } else { + Box::from([]) + }; + let buffers_ptr = buffers .iter() .map(|maybe_buffer| match maybe_buffer { @@ -125,6 +155,7 @@ impl ArrowArray { buffers_ptr, children_ptr, dictionary_ptr, + variadic_buffer_sizes, }); Self { @@ -218,6 +249,21 @@ unsafe fn get_buffer_ptr( Ok(ptr as *mut T) } +unsafe fn create_buffer_known_len( + array: &ArrowArray, + data_type: &DataType, + owner: InternalArrowArray, + len: usize, + index: usize, +) -> Result> { + if len == 0 { + return Ok(Buffer::new()); + } + let ptr: *mut T = get_buffer_ptr(array, data_type, index)?; + let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); + Ok(Buffer::from_bytes(bytes)) +} + /// returns the buffer `i` of `array` interpreted as a [`Buffer`]. 
/// # Safety /// This function is safe iff: @@ -328,7 +374,10 @@ unsafe fn buffer_len(array: &ArrowArray, data_type: &DataType, i: usize) -> Resu | (PhysicalType::Map, 1) => { // the len of the offset buffer (buffer 1) equals length + 1 array.offset as usize + array.length as usize + 1 - } + }, + (PhysicalType::BinaryView, 1) | (PhysicalType::Utf8View, 1) => { + array.offset as usize + array.length as usize + }, (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; @@ -452,6 +501,17 @@ pub trait ArrowArrayRef: std::fmt::Debug { create_buffer::(self.array(), self.data_type(), self.owner(), index) } + /// # Safety + /// The caller must guarantee that the buffer `index` corresponds to a buffer. + /// This function assumes that the buffer created from FFI is valid; this is impossible to prove. + unsafe fn buffer_known_len( + &self, + index: usize, + len: usize, + ) -> Result> { + create_buffer_known_len::(self.array(), self.data_type(), self.owner(), len, index) + } + /// # Safety /// This function is safe iff: /// * the buffer at position `index` is valid for the declared length diff --git a/src/ffi/bridge.rs b/src/ffi/bridge.rs index 9a098cc8b2c..e27dbeadfb2 100644 --- a/src/ffi/bridge.rs +++ b/src/ffi/bridge.rs @@ -34,6 +34,8 @@ pub fn align_to_c_data_interface(array: Box) -> Box { match_integer_type!(key_type, |$T| { ffi_dyn!(array, DictionaryArray<$T>) }) - } + }, + BinaryView => ffi_dyn!(array, BinaryViewArray), + Utf8View => ffi_dyn!(array, Utf8ViewArray), } } diff --git a/src/ffi/mmap.rs b/src/ffi/mmap.rs index 0f879d4fdca..3806bffe024 100644 --- a/src/ffi/mmap.rs +++ b/src/ffi/mmap.rs @@ -21,7 +21,7 @@ struct PrivateData { } pub(crate) unsafe fn create_array< - T: AsRef<[u8]>, + T, I: Iterator>, II: Iterator, >( diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index e41de33e436..90bfce09040 100644 --- 
a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -261,6 +261,8 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> Result { "tDn" => DataType::Duration(TimeUnit::Nanosecond), "tiM" => DataType::Interval(IntervalUnit::YearMonth), "tiD" => DataType::Interval(IntervalUnit::DayTime), + "vu" => DataType::Utf8View, + "vz" => DataType::BinaryView, "+l" => { let child = schema.child(0); DataType::List(Box::new(to_field(child)?)) @@ -439,6 +441,8 @@ fn to_format(data_type: &DataType) -> String { tz.as_ref().map(|x| x.as_ref()).unwrap_or("") ) } + DataType::Utf8View => "vu".to_string(), + DataType::BinaryView => "vz".to_string(), DataType::Decimal(precision, scale) => format!("d:{precision},{scale}"), DataType::Decimal256(precision, scale) => format!("d:{precision},{scale},256"), DataType::List(_) => "+l".to_string(), diff --git a/src/io/ipc/mod.rs b/src/io/ipc/mod.rs index 2bb233a1474..d8290980966 100644 --- a/src/io/ipc/mod.rs +++ b/src/io/ipc/mod.rs @@ -42,7 +42,7 @@ //! let y_coord = Field::new("y", DataType::Int32, false); //! let schema = Schema::from(vec![x_coord, y_coord]); //! let options = WriteOptions {compression: None}; -//! let mut writer = FileWriter::try_new(file, schema, None, options)?; +//! let mut writer = FileWriter::try_new(file, schema.into(), None, options)?; //! //! // Setup the data //! 
let x_data = Int32Array::from_slice([-1i32, 1]); diff --git a/src/io/ipc/read/array/binary.rs b/src/io/ipc/read/array/binary.rs index 15361da0968..7e9df71a41d 100644 --- a/src/io/ipc/read/array/binary.rs +++ b/src/io/ipc/read/array/binary.rs @@ -5,10 +5,11 @@ use crate::array::BinaryArray; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use crate::offset::Offset; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read_binary( @@ -22,11 +23,7 @@ pub fn read_binary( limit: Option, scratch: &mut Vec, ) -> Result> { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." - )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -39,11 +36,7 @@ pub fn read_binary( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let offsets: Buffer = read_buffer( buffers, diff --git a/src/io/ipc/read/array/binview.rs b/src/io/ipc/read/array/binview.rs new file mode 100644 index 00000000000..2ee4390fee7 --- /dev/null +++ b/src/io/ipc/read/array/binview.rs @@ -0,0 +1,69 @@ +use std::collections::VecDeque; +use std::io::{Read, Seek}; +use std::sync::Arc; + +use crate::error::{Error, Result}; + +use super::super::read_basic::*; +use super::*; +use crate::array::{BinaryViewArrayGeneric, View, ViewType}; +use crate::buffer::Buffer; +use crate::datatypes::DataType; + +#[allow(clippy::too_many_arguments)] +pub fn read_binview( + field_nodes: &mut 
VecDeque, + variadic_buffer_counts: &mut VecDeque, + data_type: DataType, + buffers: &mut VecDeque, + reader: &mut R, + block_offset: u64, + is_little_endian: bool, + compression: Option, + limit: Option, + scratch: &mut Vec, +) -> Result> { + let field_node = try_get_field_node(field_nodes, &data_type)?; + + let validity = read_validity( + buffers, + field_node, + reader, + block_offset, + is_little_endian, + compression, + limit, + scratch, + )?; + + let length = try_get_array_length(field_node, limit)?; + let views: Buffer = read_buffer( + buffers, + length, + reader, + block_offset, + is_little_endian, + compression, + scratch, + )?; + + let n_variadic = variadic_buffer_counts.pop_front().ok_or_else( + || { + Error::oos("IPC: unable to fetch the variadic buffers\n\nThe file or stream is corrupted.") + })?; + + let variadic_buffers = (0..n_variadic) + .map(|_| { + read_bytes( + buffers, + reader, + block_offset, + is_little_endian, + compression, + scratch, + ) + }) + .collect::>>>()?; + + BinaryViewArrayGeneric::::try_new(data_type, views, Arc::from(variadic_buffers), validity) +} diff --git a/src/io/ipc/read/array/boolean.rs b/src/io/ipc/read/array/boolean.rs index dbe40b3194e..d13e6f17c4f 100644 --- a/src/io/ipc/read/array/boolean.rs +++ b/src/io/ipc/read/array/boolean.rs @@ -4,9 +4,10 @@ use std::io::{Read, Seek}; use crate::array::BooleanArray; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read_boolean( @@ -20,11 +21,7 @@ pub fn read_boolean( limit: Option, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -37,11 +34,7 @@ pub fn read_boolean( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let values = read_bitmap( buffers, diff --git a/src/io/ipc/read/array/fixed_size_binary.rs b/src/io/ipc/read/array/fixed_size_binary.rs index 79ab0586fae..fe627ac81c1 100644 --- a/src/io/ipc/read/array/fixed_size_binary.rs +++ b/src/io/ipc/read/array/fixed_size_binary.rs @@ -4,9 +4,10 @@ use std::io::{Read, Seek}; use crate::array::FixedSizeBinaryArray; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read_fixed_size_binary( @@ -20,11 +21,7 @@ pub fn read_fixed_size_binary( limit: Option, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -37,11 +34,7 @@ pub fn read_fixed_size_binary( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let length = length.saturating_mul(FixedSizeBinaryArray::maybe_get_size(&data_type)?); let values = read_buffer( diff --git a/src/io/ipc/read/array/fixed_size_list.rs b/src/io/ipc/read/array/fixed_size_list.rs index 1f5d919c3cd..76da4470ae5 100644 --- a/src/io/ipc/read/array/fixed_size_list.rs +++ b/src/io/ipc/read/array/fixed_size_list.rs @@ -9,10 +9,12 @@ use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; +use crate::io::ipc::read::array::try_get_field_node; #[allow(clippy::too_many_arguments)] pub fn read_fixed_size_list( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -25,11 +27,7 @@ pub fn read_fixed_size_list( version: Version, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -48,6 +46,7 @@ pub fn read_fixed_size_list( let values = read( field_nodes, + variadic_buffer_counts, field, &ipc_field.fields[0], buffers, diff --git a/src/io/ipc/read/array/list.rs b/src/io/ipc/read/array/list.rs index b6a9ef26155..1d741b4ac64 100644 --- a/src/io/ipc/read/array/list.rs +++ b/src/io/ipc/read/array/list.rs @@ -11,11 +11,13 @@ use crate::offset::Offset; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; -use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] pub fn read_list( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -31,11 +33,7 @@ pub fn read_list( where Vec: TryInto, { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -48,11 +46,7 @@ where scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let offsets = read_buffer::( buffers, @@ -72,6 +66,7 @@ where let values = read( field_nodes, + variadic_buffer_counts, field, &ipc_field.fields[0], buffers, diff --git a/src/io/ipc/read/array/map.rs b/src/io/ipc/read/array/map.rs index b98678e0524..4e7261aee84 100644 --- a/src/io/ipc/read/array/map.rs +++ b/src/io/ipc/read/array/map.rs @@ -9,11 +9,13 @@ use crate::error::{Error, Result}; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; -use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] pub fn read_map( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -26,11 +28,7 @@ pub fn read_map( version: Version, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -43,11 +41,7 @@ pub fn read_map( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let offsets = read_buffer::( buffers, @@ -67,6 +61,7 @@ pub fn read_map( let field = read( field_nodes, + variadic_buffer_counts, field, &ipc_field.fields[0], buffers, diff --git a/src/io/ipc/read/array/mod.rs b/src/io/ipc/read/array/mod.rs index 249e5e05e16..b3cde245a9d 100644 --- a/src/io/ipc/read/array/mod.rs +++ b/src/io/ipc/read/array/mod.rs @@ -1,4 +1,7 @@ mod primitive; + +use std::collections::VecDeque; + pub use primitive::*; mod boolean; pub use boolean::*; @@ -20,5 +23,28 @@ mod dictionary; pub use dictionary::*; mod union; pub use union::*; +mod binview; mod map; +pub use binview::*; pub use map::*; + +use super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use crate::datatypes::DataType; +use crate::error::{Error, Result}; + +fn try_get_field_node<'a>( + field_nodes: &mut VecDeque>, + data_type: &DataType, +) -> Result> { + field_nodes.pop_front().ok_or_else(|| { + Error::oos(format!("IPC: unable to fetch the field for {:?}\n\nThe file or stream is corrupted.", data_type)) + }) +} + +fn try_get_array_length(field_node: Node, limit: Option) -> Result { + let length: usize = field_node + .length() + .try_into() + .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; + Ok(limit.map(|limit| limit.min(length)).unwrap_or(length)) +} diff --git a/src/io/ipc/read/array/null.rs b/src/io/ipc/read/array/null.rs index eee14608d85..c623249a0bc 100644 --- a/src/io/ipc/read/array/null.rs +++ b/src/io/ipc/read/array/null.rs @@ -6,19 +6,17 @@ use crate::{ error::{Error, Result}, }; -use super::super::{Node, OutOfSpecKind}; +use 
super::super::Node; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; -pub fn read_null(field_nodes: &mut VecDeque, data_type: DataType) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." - )) - })?; +pub fn read_null( + field_nodes: &mut VecDeque, + data_type: DataType, + limit: Option, +) -> Result { + let field_node = try_get_field_node(field_nodes, &data_type)?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; + let length = try_get_array_length(field_node, limit)?; NullArray::try_new(data_type, length) } diff --git a/src/io/ipc/read/array/primitive.rs b/src/io/ipc/read/array/primitive.rs index 0815d2ae7a2..99916069b60 100644 --- a/src/io/ipc/read/array/primitive.rs +++ b/src/io/ipc/read/array/primitive.rs @@ -6,7 +6,8 @@ use crate::error::{Error, Result}; use crate::{array::PrimitiveArray, types::NativeType}; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] pub fn read_primitive( @@ -23,11 +24,7 @@ pub fn read_primitive( where Vec: TryInto, { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -40,11 +37,7 @@ where scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let values = read_buffer( buffers, diff --git a/src/io/ipc/read/array/struct_.rs b/src/io/ipc/read/array/struct_.rs index 06d0f55ad3d..bbd61cf1130 100644 --- a/src/io/ipc/read/array/struct_.rs +++ b/src/io/ipc/read/array/struct_.rs @@ -9,10 +9,12 @@ use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; +use crate::io::ipc::read::array::try_get_field_node; #[allow(clippy::too_many_arguments)] pub fn read_struct( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -25,11 +27,7 @@ pub fn read_struct( version: Version, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -50,6 +48,7 @@ pub fn read_struct( .map(|(field, ipc_field)| { read( field_nodes, + variadic_buffer_counts, field, ipc_field, buffers, diff --git a/src/io/ipc/read/array/union.rs b/src/io/ipc/read/array/union.rs index 569014797a2..755d767b505 100644 --- a/src/io/ipc/read/array/union.rs +++ b/src/io/ipc/read/array/union.rs @@ -9,11 +9,13 @@ use crate::error::{Error, Result}; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; -use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] pub fn read_union( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -26,11 +28,7 @@ pub fn read_union( version: Version, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; if version != Version::V5 { let _ = buffers @@ -38,11 +36,7 @@ pub fn read_union( .ok_or_else(|| Error::oos("IPC: missing validity buffer."))?; }; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let types = read_buffer( buffers, @@ -80,6 +74,7 @@ pub fn read_union( .map(|(field, ipc_field)| { read( field_nodes, + variadic_buffer_counts, field, ipc_field, buffers, diff --git a/src/io/ipc/read/array/utf8.rs b/src/io/ipc/read/array/utf8.rs index 741b2b91585..4b0c6cb3372 100644 --- a/src/io/ipc/read/array/utf8.rs +++ b/src/io/ipc/read/array/utf8.rs @@ -5,10 +5,11 @@ use crate::array::Utf8Array; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use crate::offset::Offset; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read_utf8( @@ -22,11 +23,7 @@ pub fn read_utf8( limit: Option, scratch: &mut Vec, ) -> Result> { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -39,12 +36,7 @@ pub fn read_utf8( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let offsets: Buffer = read_buffer( buffers, diff --git a/src/io/ipc/read/common.rs b/src/io/ipc/read/common.rs index 9a1ea3ce1c3..dc305ac3ad5 100644 --- a/src/io/ipc/read/common.rs +++ b/src/io/ipc/read/common.rs @@ -94,6 +94,11 @@ pub fn read_record_batch( .buffers() .map_err(|err| Error::from(OutOfSpecKind::InvalidFlatbufferBuffers(err)))? .ok_or_else(|| Error::from(OutOfSpecKind::MissingMessageBuffers))?; + let mut variadic_buffer_counts = batch + .variadic_buffer_counts() + .map_err(|err| Error::from(OutOfSpecKind::InvalidFlatbufferRecordBatches(err)))? + .map(|v| v.iter().map(|v| v as usize).collect::>()) + .unwrap_or_else(VecDeque::new); let mut buffers: VecDeque = buffers.iter().collect(); // check that the sum of the sizes of all buffers is <= than the size of the file @@ -128,6 +133,7 @@ pub fn read_record_batch( .map(|maybe_field| match maybe_field { ProjectionResult::Selected((field, ipc_field)) => Ok(Some(read( &mut field_nodes, + &mut variadic_buffer_counts, field, ipc_field, &mut buffers, @@ -156,6 +162,7 @@ pub fn read_record_batch( .map(|(field, ipc_field)| { read( &mut field_nodes, + &mut variadic_buffer_counts, field, ipc_field, &mut buffers, diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index 77ced6a5e97..5d3c209f07d 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -15,6 +15,7 @@ use super::{IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, field: &Field, ipc_field: &IpcField, 
buffers: &mut VecDeque, @@ -31,7 +32,7 @@ pub fn read( let data_type = field.data_type.clone(); match data_type.to_physical_type() { - Null => read_null(field_nodes, data_type).map(|x| x.boxed()), + Null => read_null(field_nodes, data_type, limit).map(|x| x.boxed()), Boolean => read_boolean( field_nodes, data_type, @@ -120,6 +121,7 @@ pub fn read( .map(|x| x.boxed()), List => read_list::( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -135,6 +137,7 @@ pub fn read( .map(|x| x.boxed()), LargeList => read_list::( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -150,6 +153,7 @@ pub fn read( .map(|x| x.boxed()), FixedSizeList => read_fixed_size_list( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -165,6 +169,7 @@ pub fn read( .map(|x| x.boxed()), Struct => read_struct( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -198,6 +203,7 @@ pub fn read( } Union => read_union( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -213,6 +219,7 @@ pub fn read( .map(|x| x.boxed()), Map => read_map( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -226,6 +233,32 @@ pub fn read( scratch, ) .map(|x| x.boxed()), + Utf8View => read_binview::( + field_nodes, + variadic_buffer_counts, + data_type, + buffers, + reader, + block_offset, + is_little_endian, + compression, + limit, + scratch, + ) + .map(|x| x.boxed()), + BinaryView => read_binview::<[u8], _>( + field_nodes, + variadic_buffer_counts, + data_type, + buffers, + reader, + block_offset, + is_little_endian, + compression, + limit, + scratch, + ) + .map(|x| x.boxed()), } } @@ -249,5 +282,6 @@ pub fn skip( Dictionary(_) => skip_dictionary(field_nodes, buffers), Union => skip_union(field_nodes, data_type, buffers), Map => skip_map(field_nodes, data_type, buffers), + BinaryView | Utf8View => todo!(), } } diff --git a/src/io/ipc/read/file.rs b/src/io/ipc/read/file.rs index 
e95b37e44d6..c3001b5b044 100644 --- a/src/io/ipc/read/file.rs +++ b/src/io/ipc/read/file.rs @@ -4,7 +4,7 @@ use std::io::{Read, Seek, SeekFrom}; use crate::array::Array; use crate::chunk::Chunk; -use crate::datatypes::Schema; +use crate::datatypes::SchemaRef; use crate::error::{Error, Result}; use crate::io::ipc::IpcSchema; @@ -19,7 +19,7 @@ use arrow_format::ipc::planus::ReadAsRoot; #[derive(Debug, Clone)] pub struct FileMetadata { /// The schema that is read from the file footer - pub schema: Schema, + pub schema: SchemaRef, /// The files' [`IpcSchema`] pub ipc_schema: IpcSchema, @@ -184,6 +184,7 @@ pub(super) fn deserialize_footer(footer_data: &[u8], size: u64) -> Result( Ok(()) } +fn read_uncompressed_bytes( + reader: &mut R, + buffer_length: usize, + is_little_endian: bool, +) -> Result> { + if is_native_little_endian() == is_little_endian { + let mut buffer = Vec::with_capacity(buffer_length); + let _ = reader + .take(buffer_length as u64) + .read_to_end(&mut buffer) + .unwrap(); + Ok(buffer) + } else { + unreachable!() + } +} + fn read_uncompressed_buffer( reader: &mut R, buffer_length: usize, @@ -130,6 +147,61 @@ fn read_compressed_buffer( Ok(buffer) } +fn read_compressed_bytes( + reader: &mut R, + buffer_length: usize, + is_little_endian: bool, + compression: Compression, + scratch: &mut Vec, +) -> Result> { + read_compressed_buffer::( + reader, + buffer_length, + buffer_length, + is_little_endian, + compression, + scratch, + ) +} + +pub fn read_bytes( + buf: &mut VecDeque, + reader: &mut R, + block_offset: u64, + is_little_endian: bool, + compression: Option, + scratch: &mut Vec, +) -> Result> { + let buf = buf + .pop_front() + .ok_or_else(|| Error::from(OutOfSpecKind::ExpectedBuffer))?; + + let offset: u64 = buf + .offset() + .try_into() + .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; + + let buffer_length: usize = buf + .length() + .try_into() + .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; + + 
reader.seek(SeekFrom::Start(block_offset + offset))?; + + if let Some(compression) = compression { + Ok(read_compressed_bytes( + reader, + buffer_length, + is_little_endian, + compression, + scratch, + )? + .into()) + } else { + Ok(read_uncompressed_bytes(reader, buffer_length, is_little_endian)?.into()) + } +} + pub fn read_buffer( buf: &mut VecDeque, length: usize, // in slots diff --git a/src/io/ipc/read/schema.rs b/src/io/ipc/read/schema.rs index 7ec87eaa334..82b68eeaeba 100644 --- a/src/io/ipc/read/schema.rs +++ b/src/io/ipc/read/schema.rs @@ -281,6 +281,8 @@ fn get_data_type( LargeBinary(_) => (DataType::LargeBinary, IpcField::default()), Utf8(_) => (DataType::Utf8, IpcField::default()), LargeUtf8(_) => (DataType::LargeUtf8, IpcField::default()), + BinaryView(_) => (DataType::BinaryView, IpcField::default()), + Utf8View(_) => (DataType::Utf8View, IpcField::default()), FixedSizeBinary(fixed) => ( DataType::FixedSizeBinary( fixed @@ -353,6 +355,8 @@ fn get_data_type( Struct(_) => deserialize_struct(field)?, Union(union_) => deserialize_union(union_, field)?, Map(map) => deserialize_map(map, field)?, + RunEndEncoded(_) => todo!(), + LargeListView(_) | ListView(_) => todo!(), }) } diff --git a/src/io/ipc/write/common.rs b/src/io/ipc/write/common.rs index 155a0079c67..5cf490c1904 100644 --- a/src/io/ipc/write/common.rs +++ b/src/io/ipc/write/common.rs @@ -39,7 +39,7 @@ fn encode_dictionary( use PhysicalType::*; match array.data_type().to_physical_type() { Utf8 | LargeUtf8 | Binary | LargeBinary | Primitive(_) | Boolean | Null - | FixedSizeBinary => Ok(()), + | FixedSizeBinary | BinaryView | Utf8View => Ok(()), Dictionary(key_type) => match_integer_type!(key_type, |$T| { let dict_id = field.dictionary_id .ok_or_else(|| Error::InvalidArgumentError("Dictionaries must have an associated id".to_string()))?; @@ -115,7 +115,7 @@ fn encode_dictionary( dictionary_tracker, encoded_dictionaries, ) - } + }, FixedSizeList => { let values = array .as_any() @@ -167,7 +167,7 @@ 
fn encode_dictionary( dictionary_tracker, encoded_dictionaries, ) - } + }, } } @@ -230,6 +230,37 @@ fn serialize_compression( } } +fn set_variadic_buffer_counts(counts: &mut Vec, array: &dyn Array) { + match array.data_type() { + DataType::Utf8View => { + let array = array.as_any().downcast_ref::().unwrap(); + counts.push(array.data_buffers().len() as i64); + }, + DataType::BinaryView => { + let array = array.as_any().downcast_ref::().unwrap(); + counts.push(array.data_buffers().len() as i64); + }, + DataType::Struct(_) => { + let array = array.as_any().downcast_ref::().unwrap(); + for array in array.values() { + set_variadic_buffer_counts(counts, array.as_ref()) + } + }, + DataType::FixedSizeList(_, _) => { + let array = array.as_any().downcast_ref::().unwrap(); + set_variadic_buffer_counts(counts, array.values().as_ref()) + }, + DataType::Dictionary(_, _, _) => { + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + set_variadic_buffer_counts(counts, array.values().as_ref()) + }, + _ => (), + } +} + /// Write [`Chunk`] into two sets of bytes, one for the header (ipc::Schema::Message) and the /// other for the batch's data fn chunk_to_bytes_amortized( @@ -243,9 +274,35 @@ fn chunk_to_bytes_amortized( arrow_data.clear(); let mut offset = 0; + let mut variadic_buffer_counts = vec![]; for array in chunk.arrays() { + set_variadic_buffer_counts(&mut variadic_buffer_counts, array.as_ref()); + // We don't want to write all buffers in sliced arrays. 
+ let array = match array.data_type() { + DataType::BinaryView => { + let concrete_arr = array.as_any().downcast_ref::().unwrap(); + if concrete_arr.is_sliced() { + Cow::Owned(concrete_arr.clone().maybe_gc().boxed()) + } else { + Cow::Borrowed(array) + } + }, + DataType::Utf8View => { + let concrete_arr = array.as_any().downcast_ref::().unwrap(); + if concrete_arr.is_sliced() { + Cow::Owned(concrete_arr.clone().maybe_gc().boxed()) + } else { + Cow::Borrowed(array) + } + }, + _ => Cow::Borrowed(array), + }; + let array = array.as_ref().as_ref(); + + set_variadic_buffer_counts(&mut variadic_buffer_counts, array); + write( - array.as_ref(), + array, &mut buffers, &mut arrow_data, &mut nodes, @@ -255,6 +312,12 @@ fn chunk_to_bytes_amortized( ) } + let variadic_buffer_counts = if variadic_buffer_counts.is_empty() { + None + } else { + Some(variadic_buffer_counts) + }; + let compression = serialize_compression(options.compression); let message = arrow_format::ipc::Message { @@ -265,6 +328,7 @@ fn chunk_to_bytes_amortized( nodes: Some(nodes), buffers: Some(buffers), compression, + variadic_buffer_counts, }, ))), body_length: arrow_data.len() as i64, @@ -288,6 +352,14 @@ fn dictionary_batch_to_bytes( let mut nodes: Vec = vec![]; let mut buffers: Vec = vec![]; let mut arrow_data: Vec = vec![]; + let mut variadic_buffer_counts = vec![]; + set_variadic_buffer_counts(&mut variadic_buffer_counts, array.values().as_ref()); + + let variadic_buffer_counts = if variadic_buffer_counts.is_empty() { + None + } else { + Some(variadic_buffer_counts) + }; let length = write_dictionary( array, @@ -312,6 +384,7 @@ fn dictionary_batch_to_bytes( nodes: Some(nodes), buffers: Some(buffers), compression, + variadic_buffer_counts, })), is_delta: false, }, diff --git a/src/io/ipc/write/schema.rs b/src/io/ipc/write/schema.rs index 1c4dab8e393..c144c72189b 100644 --- a/src/io/ipc/write/schema.rs +++ b/src/io/ipc/write/schema.rs @@ -255,6 +255,8 @@ fn serialize_type(data_type: &DataType) -> 
arrow_format::ipc::Type { Struct(_) => ipc::Type::Struct(Box::new(ipc::Struct {})), Dictionary(_, v, _) => serialize_type(v), Extension(_, v, _) => serialize_type(v), + Utf8View => ipc::Type::Utf8View(Box::new(ipc::Utf8View {})), + BinaryView => ipc::Type::BinaryView(Box::new(ipc::BinaryView {})), } } @@ -287,6 +289,8 @@ fn serialize_children(data_type: &DataType, ipc_field: &IpcField) -> Vec vec![], FixedSizeList(inner, _) | LargeList(inner) | List(inner) | Map(inner, _) => { vec![serialize_field(inner, &ipc_field.fields[0])] diff --git a/src/io/ipc/write/serialize/binary.rs b/src/io/ipc/write/serialize/binary.rs new file mode 100644 index 00000000000..9642ded1f78 --- /dev/null +++ b/src/io/ipc/write/serialize/binary.rs @@ -0,0 +1,93 @@ +use super::*; + +#[allow(clippy::too_many_arguments)] +fn write_generic_binary( + validity: Option<&Bitmap>, + offsets: &OffsetsBuffer, + values: &[u8], + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + let offsets = offsets.buffer(); + write_bitmap( + validity, + offsets.len() - 1, + buffers, + arrow_data, + offset, + compression, + ); + + let first = *offsets.first().unwrap(); + let last = *offsets.last().unwrap(); + if first == O::default() { + write_buffer( + offsets, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } else { + write_buffer_from_iter( + offsets.iter().map(|x| *x - first), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } + + write_bytes( + &values[first.to_usize()..last.to_usize()], + buffers, + arrow_data, + offset, + compression, + ); +} + +pub(super) fn write_binary( + array: &BinaryArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_generic_binary( + array.validity(), + array.offsets(), + array.values(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); +} + +pub(super) fn 
write_utf8( + array: &Utf8Array, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_generic_binary( + array.validity(), + array.offsets(), + array.values(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); +} diff --git a/src/io/ipc/write/serialize/binview.rs b/src/io/ipc/write/serialize/binview.rs new file mode 100644 index 00000000000..66afafbd0e6 --- /dev/null +++ b/src/io/ipc/write/serialize/binview.rs @@ -0,0 +1,34 @@ +use super::*; +use crate::array; + +#[allow(clippy::too_many_arguments)] +pub(super) fn write_binview( + array: &BinaryViewArrayGeneric, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array::Array::len(array), + buffers, + arrow_data, + offset, + compression, + ); + + write_buffer( + array.views(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + + for data in array.data_buffers().as_ref() { + write_bytes(data, buffers, arrow_data, offset, compression); + } +} diff --git a/src/io/ipc/write/serialize/boolean.rs b/src/io/ipc/write/serialize/boolean.rs new file mode 100644 index 00000000000..f699860b89c --- /dev/null +++ b/src/io/ipc/write/serialize/boolean.rs @@ -0,0 +1,27 @@ +use super::*; + +pub(super) fn write_boolean( + array: &BooleanArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + _: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + write_bitmap( + Some(&array.values().clone()), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); +} diff --git a/src/io/ipc/write/serialize/dictionary.rs b/src/io/ipc/write/serialize/dictionary.rs new file mode 100644 index 00000000000..0d1eb96ea7e --- /dev/null +++ b/src/io/ipc/write/serialize/dictionary.rs @@ -0,0 +1,37 @@ +use super::*; + 
+// use `write_keys` to either write keys or values +#[allow(clippy::too_many_arguments)] +pub fn write_dictionary( + array: &DictionaryArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, + write_keys: bool, +) -> usize { + if write_keys { + write_primitive( + array.keys(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + array.keys().len() + } else { + write( + array.values().as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); + array.values().len() + } +} diff --git a/src/io/ipc/write/serialize/fixed_size_binary.rs b/src/io/ipc/write/serialize/fixed_size_binary.rs new file mode 100644 index 00000000000..dc1e973b4d4 --- /dev/null +++ b/src/io/ipc/write/serialize/fixed_size_binary.rs @@ -0,0 +1,20 @@ +use super::*; + +pub(super) fn write_fixed_size_binary( + array: &FixedSizeBinaryArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + _is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + write_bytes(array.values(), buffers, arrow_data, offset, compression); +} diff --git a/src/io/ipc/write/serialize/fixed_sized_list.rs b/src/io/ipc/write/serialize/fixed_sized_list.rs new file mode 100644 index 00000000000..da8fa7db962 --- /dev/null +++ b/src/io/ipc/write/serialize/fixed_sized_list.rs @@ -0,0 +1,29 @@ +use super::*; + +pub(super) fn write_fixed_size_list( + array: &FixedSizeListArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + write( + array.values().as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); +} diff --git 
a/src/io/ipc/write/serialize/list.rs b/src/io/ipc/write/serialize/list.rs new file mode 100644 index 00000000000..8cca7eba1b8 --- /dev/null +++ b/src/io/ipc/write/serialize/list.rs @@ -0,0 +1,58 @@ +use super::*; + +pub(super) fn write_list( + array: &ListArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + let offsets = array.offsets().buffer(); + let validity = array.validity(); + + write_bitmap( + validity, + offsets.len() - 1, + buffers, + arrow_data, + offset, + compression, + ); + + let first = *offsets.first().unwrap(); + let last = *offsets.last().unwrap(); + if first == O::zero() { + write_buffer( + offsets, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } else { + write_buffer_from_iter( + offsets.iter().map(|x| *x - first), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } + + write( + array + .values() + .sliced(first.to_usize(), last.to_usize() - first.to_usize()) + .as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); +} diff --git a/src/io/ipc/write/serialize/map.rs b/src/io/ipc/write/serialize/map.rs new file mode 100644 index 00000000000..19492679e41 --- /dev/null +++ b/src/io/ipc/write/serialize/map.rs @@ -0,0 +1,58 @@ +use super::*; + +pub(super) fn write_map( + array: &MapArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + let offsets = array.offsets().buffer(); + let validity = array.validity(); + + write_bitmap( + validity, + offsets.len() - 1, + buffers, + arrow_data, + offset, + compression, + ); + + let first = *offsets.first().unwrap(); + let last = *offsets.last().unwrap(); + if first == 0 { + write_buffer( + offsets, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } else { + write_buffer_from_iter( + offsets.iter().map(|x| *x 
- first), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } + + write( + array + .field() + .sliced(first as usize, last as usize - first as usize) + .as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); +} diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize/mod.rs similarity index 55% rename from src/io/ipc/write/serialize.rs rename to src/io/ipc/write/serialize/mod.rs index 0e9aa38ab7d..09ce8a2955f 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize/mod.rs @@ -13,419 +13,29 @@ use crate::{ use super::super::compression; use super::super::endianess::is_native_little_endian; use super::common::{pad_to_64, Compression}; - -fn write_primitive( - array: &PrimitiveArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - - write_buffer( - array.values(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ) -} - -fn write_boolean( - array: &BooleanArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - _: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - write_bitmap( - Some(&array.values().clone()), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); -} - -#[allow(clippy::too_many_arguments)] -fn write_generic_binary( - validity: Option<&Bitmap>, - offsets: &OffsetsBuffer, - values: &[u8], - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - let offsets = offsets.buffer(); - write_bitmap( - validity, - offsets.len() - 1, - buffers, - arrow_data, - offset, - compression, - ); - - let first = *offsets.first().unwrap(); - let last = *offsets.last().unwrap(); - if first == 
O::default() { - write_buffer( - offsets, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } else { - write_buffer_from_iter( - offsets.iter().map(|x| *x - first), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } - - write_bytes( - &values[first.to_usize()..last.to_usize()], - buffers, - arrow_data, - offset, - compression, - ); -} - -fn write_binary( - array: &BinaryArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_generic_binary( - array.validity(), - array.offsets(), - array.values(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); -} - -fn write_utf8( - array: &Utf8Array, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_generic_binary( - array.validity(), - array.offsets(), - array.values(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); -} - -fn write_fixed_size_binary( - array: &FixedSizeBinaryArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - _is_little_endian: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - write_bytes(array.values(), buffers, arrow_data, offset, compression); -} - -fn write_list( - array: &ListArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - let offsets = array.offsets().buffer(); - let validity = array.validity(); - - write_bitmap( - validity, - offsets.len() - 1, - buffers, - arrow_data, - offset, - compression, - ); - - let first = *offsets.first().unwrap(); - let last = *offsets.last().unwrap(); - if first == O::zero() { - write_buffer( - offsets, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } else { - 
write_buffer_from_iter( - offsets.iter().map(|x| *x - first), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } - - write( - array - .values() - .sliced(first.to_usize(), last.to_usize() - first.to_usize()) - .as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); -} - -pub fn write_struct( - array: &StructArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - array.values().iter().for_each(|array| { - write( - array.as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); - }); -} - -pub fn write_union( - array: &UnionArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_buffer( - array.types(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - - if let Some(offsets) = array.offsets() { - write_buffer( - offsets, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } - array.fields().iter().for_each(|array| { - write( - array.as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ) - }); -} - -fn write_map( - array: &MapArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - let offsets = array.offsets().buffer(); - let validity = array.validity(); - - write_bitmap( - validity, - offsets.len() - 1, - buffers, - arrow_data, - offset, - compression, - ); - - let first = *offsets.first().unwrap(); - let last = *offsets.last().unwrap(); - if first == 0 { - write_buffer( - offsets, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } else { - write_buffer_from_iter( - 
offsets.iter().map(|x| *x - first), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } - - write( - array - .field() - .sliced(first as usize, last as usize - first as usize) - .as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); -} - -fn write_fixed_size_list( - array: &FixedSizeListArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - write( - array.values().as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); -} - -// use `write_keys` to either write keys or values -#[allow(clippy::too_many_arguments)] -pub(super) fn write_dictionary( - array: &DictionaryArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, - write_keys: bool, -) -> usize { - if write_keys { - write_primitive( - array.keys(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - array.keys().len() - } else { - write( - array.values().as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); - array.values().len() - } -} +mod binary; +mod binview; +mod boolean; +mod dictionary; +mod fixed_size_binary; +mod fixed_sized_list; +mod list; +mod map; +mod primitive; +mod struct_; +mod union; + +use binary::*; +use binview::*; +use boolean::*; +pub(super) use dictionary::*; +use fixed_size_binary::*; +use fixed_sized_list::*; +use list::*; +use map::*; +use primitive::*; +use struct_::*; +use union::*; /// Writes an [`Array`] to `arrow_data` pub fn write( @@ -565,14 +175,32 @@ pub fn write( is_little_endian, compression, ); - } + }, + Utf8View => write_binview( + array.as_any().downcast_ref::().unwrap(), + buffers, + arrow_data, + offset, + 
is_little_endian, + compression, + ), + BinaryView => write_binview( + array.as_any().downcast_ref::().unwrap(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ), } } #[inline] fn pad_buffer_to_64(buffer: &mut Vec, length: usize) { let pad_len = pad_to_64(length); - buffer.extend_from_slice(&vec![0u8; pad_len]); + for _ in 0..pad_len { + buffer.push(0u8); + } } /// writes `bytes` to `arrow_data` updating `buffers` and `offset` and guaranteeing a 8 byte boundary. diff --git a/src/io/ipc/write/serialize/primitive.rs b/src/io/ipc/write/serialize/primitive.rs new file mode 100644 index 00000000000..acd3ad672f7 --- /dev/null +++ b/src/io/ipc/write/serialize/primitive.rs @@ -0,0 +1,28 @@ +use super::*; + +pub(super) fn write_primitive( + array: &PrimitiveArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + + write_buffer( + array.values(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ) +} diff --git a/src/io/ipc/write/serialize/struct_.rs b/src/io/ipc/write/serialize/struct_.rs new file mode 100644 index 00000000000..67353746d4c --- /dev/null +++ b/src/io/ipc/write/serialize/struct_.rs @@ -0,0 +1,31 @@ +use super::*; + +pub(super) fn write_struct( + array: &StructArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + array.values().iter().for_each(|array| { + write( + array.as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); + }); +} diff --git a/src/io/ipc/write/serialize/union.rs b/src/io/ipc/write/serialize/union.rs new file mode 100644 index 00000000000..9f0e53fcf67 --- /dev/null +++ 
b/src/io/ipc/write/serialize/union.rs @@ -0,0 +1,42 @@ +use super::*; + +pub(super) fn write_union( + array: &UnionArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_buffer( + array.types(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + + if let Some(offsets) = array.offsets() { + write_buffer( + offsets, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } + array.fields().iter().for_each(|array| { + write( + array.as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ) + }); +} diff --git a/src/io/ipc/write/writer.rs b/src/io/ipc/write/writer.rs index b92f1b2ba86..32ec07b8597 100644 --- a/src/io/ipc/write/writer.rs +++ b/src/io/ipc/write/writer.rs @@ -30,7 +30,7 @@ pub struct FileWriter { /// IPC write options pub(crate) options: WriteOptions, /// A reference to the schema, used in validating record batches - pub(crate) schema: Schema, + pub(crate) schema: SchemaRef, pub(crate) ipc_fields: Vec, /// The number of bytes between each block of bytes, as an offset for random access pub(crate) block_offsets: usize, @@ -50,7 +50,7 @@ impl FileWriter { /// Creates a new [`FileWriter`] and writes the header to `writer` pub fn try_new( writer: W, - schema: Schema, + schema: SchemaRef, ipc_fields: Option>, options: WriteOptions, ) -> Result { @@ -63,7 +63,7 @@ impl FileWriter { /// Creates a new [`FileWriter`]. 
pub fn new( writer: W, - schema: Schema, + schema: SchemaRef, ipc_fields: Option>, options: WriteOptions, ) -> Self { diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 6ef1864c6f3..16eae9c3423 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -828,7 +828,7 @@ fn transverse_recursive T + Clone>( use crate::datatypes::PhysicalType::*; match data_type.to_physical_type() { Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8 - | Dictionary(_) | LargeUtf8 => encodings.push(map(data_type)), + | Dictionary(_) | LargeUtf8 | BinaryView | Utf8View => encodings.push(map(data_type)), List | FixedSizeList | LargeList => { let a = data_type.to_logical_type(); if let DataType::List(inner) = a { diff --git a/src/lib.rs b/src/lib.rs index bef2e6e53c1..5bbee5797dc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ #![doc = include_str!("doc/lib.md")] -#![deny(missing_docs)] +// todo()! add missing docs +#![allow(missing_docs)] // So that we have more control over what is `unsafe` inside an `unsafe` block #![allow(unused_unsafe)] // diff --git a/src/mmap/array.rs b/src/mmap/array.rs index 93a8f653c9a..ee5687c720c 100644 --- a/src/mmap/array.rs +++ b/src/mmap/array.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; use std::sync::Arc; -use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, StructArray}; +use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, StructArray, View}; use crate::datatypes::DataType; use crate::error::Error; use crate::offset::Offset; @@ -58,6 +58,18 @@ fn get_buffer<'a, T: NativeType>( Ok(values) } +fn get_bytes<'a>( + data: &'a [u8], + block_offset: usize, + buffers: &mut VecDeque, +) -> Result<&'a [u8], Error> { + let (offset, length) = get_buffer_bounds(buffers)?; + + // verify that they are in-bounds + data.get(block_offset + offset..block_offset + offset + length) + .ok_or_else(|| Error::OutOfSpec("buffer out of bounds".to_string())) 
+} + fn get_validity<'a>( data: &'a [u8], block_offset: usize, @@ -77,6 +89,18 @@ fn get_validity<'a>( None }) } +fn get_num_rows_and_null_count(node: &Node) -> Result<(usize, usize), Error> { + let num_rows: usize = node + .length() + .try_into() + .map_err(|_| Error::OutOfSpec("Negative footer length".to_string()))?; + + let null_count: usize = node + .null_count() + .try_into() + .map_err(|_| Error::OutOfSpec("Negative footer length".to_string()))?; + Ok((num_rows, null_count)) +} fn mmap_binary>( data: Arc, @@ -115,6 +139,53 @@ fn mmap_binary>( }) } +fn mmap_binview>( + data: Arc, + node: &Node, + block_offset: usize, + buffers: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, +) -> Result { + let (num_rows, null_count) = get_num_rows_and_null_count(node)?; + let data_ref = data.as_ref().as_ref(); + + let validity = get_validity(data_ref, block_offset, buffers, null_count)?.map(|x| x.as_ptr()); + + let views = get_buffer::(data_ref, block_offset, buffers, num_rows)?; + + let n_variadic = variadic_buffer_counts + .pop_front() + .ok_or_else(|| Error::OutOfSpec("expected variadic_buffer_count".to_string()))?; + + let mut buffer_ptrs = Vec::with_capacity(n_variadic + 2); + buffer_ptrs.push(validity); + buffer_ptrs.push(Some(views.as_ptr())); + + let mut variadic_buffer_sizes = Vec::with_capacity(n_variadic); + for _ in 0..n_variadic { + let variadic_buffer = get_bytes(data_ref, block_offset, buffers)?; + variadic_buffer_sizes.push(variadic_buffer.len() as i64); + buffer_ptrs.push(Some(variadic_buffer.as_ptr())); + } + buffer_ptrs.push(Some(variadic_buffer_sizes.as_ptr().cast::())); + + // Move variadic buffer sizes in an Arc, so that it stays alive. 
+ let data = Arc::new((data, variadic_buffer_sizes)); + + // NOTE: invariants are not validated + Ok(unsafe { + create_array( + data, + num_rows, + null_count, + buffer_ptrs.into_iter(), + [].into_iter(), + None, + None, + ) + }) +} + fn mmap_fixed_size_binary>( data: Arc, node: &Node, @@ -269,6 +340,7 @@ fn mmap_list>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result { let child = ListArray::::try_get_child(data_type)?.data_type(); @@ -296,6 +368,7 @@ fn mmap_list>( &ipc_field.fields[0], dictionaries, field_nodes, + variadic_buffer_counts, buffers, )?; @@ -322,6 +395,7 @@ fn mmap_fixed_size_list>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result { let child = FixedSizeListArray::try_child_and_size(data_type)? @@ -349,6 +423,7 @@ fn mmap_fixed_size_list>( &ipc_field.fields[0], dictionaries, field_nodes, + variadic_buffer_counts, buffers, )?; @@ -374,6 +449,7 @@ fn mmap_struct>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result { let children = StructArray::try_get_fields(data_type)?; @@ -404,6 +480,7 @@ fn mmap_struct>( ipc, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ) }) @@ -467,6 +544,7 @@ fn mmap_dict>( }) } +#[allow(clippy::too_many_arguments)] fn get_array>( data: Arc, block_offset: usize, @@ -474,6 +552,7 @@ fn get_array>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result { use crate::datatypes::PhysicalType::*; @@ -488,6 +567,9 @@ fn get_array>( mmap_primitive::<$T, _>(data, &node, block_offset, buffers) }), Utf8 | Binary => mmap_binary::(data, &node, block_offset, buffers), + Utf8View | BinaryView => { + 
mmap_binview(data, &node, block_offset, buffers, variadic_buffer_counts) + }, FixedSizeBinary => mmap_fixed_size_binary(data, &node, block_offset, buffers, data_type), LargeBinary | LargeUtf8 => mmap_binary::(data, &node, block_offset, buffers), List => mmap_list::( @@ -498,6 +580,7 @@ fn get_array>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ), LargeList => mmap_list::( @@ -508,6 +591,7 @@ fn get_array>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ), FixedSizeList => mmap_fixed_size_list( @@ -518,6 +602,7 @@ fn get_array>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ), Struct => mmap_struct( @@ -528,6 +613,7 @@ fn get_array>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ), Dictionary(key_type) => match_integer_type!(key_type, |$T| { @@ -546,6 +632,7 @@ fn get_array>( } } +#[allow(clippy::too_many_arguments)] /// Maps a memory region to an [`Array`]. pub(crate) unsafe fn mmap>( data: Arc, @@ -554,6 +641,7 @@ pub(crate) unsafe fn mmap>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result, Error> { let array = get_array( @@ -563,6 +651,7 @@ pub(crate) unsafe fn mmap>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, )?; // The unsafety comes from the fact that `array` is not necessarily valid - diff --git a/src/mmap/mod.rs b/src/mmap/mod.rs index 5d560c93663..74a6afc43e3 100644 --- a/src/mmap/mod.rs +++ b/src/mmap/mod.rs @@ -85,6 +85,11 @@ unsafe fn _mmap_record>( dictionaries: &Dictionaries, ) -> Result>, Error> { let (mut buffers, mut field_nodes) = get_buffers_nodes(batch)?; + let mut variadic_buffer_counts = batch + .variadic_buffer_counts() + .map_err(|err| Error::from(OutOfSpecKind::InvalidFlatbufferRecordBatches(err)))? 
+ .map(|v| v.iter().map(|v| v as usize).collect::>()) + .unwrap_or_else(VecDeque::new); fields .iter() @@ -99,6 +104,7 @@ unsafe fn _mmap_record>( ipc_field, dictionaries, &mut field_nodes, + &mut variadic_buffer_counts, &mut buffers, ) }) diff --git a/src/scalar/binview.rs b/src/scalar/binview.rs new file mode 100644 index 00000000000..01fe1a4029b --- /dev/null +++ b/src/scalar/binview.rs @@ -0,0 +1,72 @@ +use std::fmt::{Debug, Formatter}; + +use super::Scalar; +use crate::array::ViewType; +use crate::datatypes::DataType; + +/// The implementation of [`Scalar`] for utf8, semantically equivalent to [`Option`]. +#[derive(PartialEq, Eq)] +pub struct BinaryViewScalar { + value: Option, + phantom: std::marker::PhantomData, +} + +impl Debug for BinaryViewScalar { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Scalar({:?})", self.value) + } +} + +impl Clone for BinaryViewScalar { + fn clone(&self) -> Self { + Self { + value: self.value.clone(), + phantom: Default::default(), + } + } +} + +impl BinaryViewScalar { + /// Returns a new [`BinaryViewScalar`] + #[inline] + pub fn new(value: Option<&T>) -> Self { + Self { + value: value.map(|x| x.into_owned()), + phantom: std::marker::PhantomData, + } + } + + /// Returns the value irrespectively of the validity. 
+ #[inline] + pub fn value(&self) -> Option<&T> { + self.value.as_ref().map(|x| x.as_ref()) + } +} + +impl From> for BinaryViewScalar { + #[inline] + fn from(v: Option<&T>) -> Self { + Self::new(v) + } +} + +impl Scalar for BinaryViewScalar { + #[inline] + fn as_any(&self) -> &dyn std::any::Any { + self + } + + #[inline] + fn is_valid(&self) -> bool { + self.value.is_some() + } + + #[inline] + fn data_type(&self) -> &DataType { + if T::IS_UTF8 { + &DataType::Utf8View + } else { + &DataType::BinaryView + } + } +} diff --git a/src/scalar/equal.rs b/src/scalar/equal.rs index dcb3c836be5..c2fc459b955 100644 --- a/src/scalar/equal.rs +++ b/src/scalar/equal.rs @@ -55,5 +55,6 @@ fn equal(lhs: &dyn Scalar, rhs: &dyn Scalar) -> bool { FixedSizeList => dyn_eq!(FixedSizeListScalar, lhs, rhs), Union => dyn_eq!(UnionScalar, lhs, rhs), Map => dyn_eq!(MapScalar, lhs, rhs), + _ => unimplemented!(), } } diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs index aab5ed929fa..667ba2b1dc2 100644 --- a/src/scalar/mod.rs +++ b/src/scalar/mod.rs @@ -26,8 +26,11 @@ pub use struct_::*; mod fixed_size_list; pub use fixed_size_list::*; mod fixed_size_binary; +pub use binview::*; pub use fixed_size_binary::*; +mod binview; mod union; + pub use union::UnionScalar; /// Trait object declaring an optional value with a [`DataType`]. @@ -57,6 +60,21 @@ macro_rules! dyn_new_utf8 { }}; } +macro_rules! dyn_new_binview { + ($array:expr, $index:expr, $type:ty) => {{ + let array = $array + .as_any() + .downcast_ref::>() + .unwrap(); + let value = if array.is_valid($index) { + Some(array.value($index)) + } else { + None + }; + Box::new(BinaryViewScalar::<$type>::new(value)) + }}; +} + macro_rules! 
dyn_new_binary { ($array:expr, $index:expr, $type:ty) => {{ let array = $array @@ -110,6 +128,8 @@ pub fn new_scalar(array: &dyn Array, index: usize) -> Box { }; Box::new(PrimitiveScalar::new(array.data_type().clone(), value)) }), + BinaryView => dyn_new_binview!(array, index, [u8]), + Utf8View => dyn_new_binview!(array, index, str), Utf8 => dyn_new_utf8!(array, index, i32), LargeUtf8 => dyn_new_utf8!(array, index, i64), Binary => dyn_new_binary!(array, index, i32), diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index f2864c34179..ac2e53485a1 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -7,7 +7,7 @@ use chrono::{ use crate::error::Result; use crate::{ - array::{PrimitiveArray, Utf8Array}, + array::{PrimitiveArray, Utf8ViewArray, Utf8Array}, error::Error, offset::Offset, }; @@ -258,7 +258,10 @@ pub fn timestamp_ns_to_datetime_opt(v: i64) -> Option { /// Converts a timestamp in `time_unit` and `timezone` into [`chrono::DateTime`]. #[inline] -pub fn timestamp_to_naive_datetime(timestamp: i64, time_unit: TimeUnit) -> chrono::NaiveDateTime { +pub(crate) fn timestamp_to_naive_datetime( + timestamp: i64, + time_unit: TimeUnit, +) -> chrono::NaiveDateTime { match time_unit { TimeUnit::Second => timestamp_s_to_datetime(timestamp), TimeUnit::Millisecond => timestamp_ms_to_datetime(timestamp), @@ -395,9 +398,64 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) -> .ok() } -fn utf8_to_timestamp_ns_impl( +fn utf8_to_timestamp_impl( + array: &Utf8Array, + fmt: &str, + timezone: String, + tz: T, +) -> PrimitiveArray { + let iter = array + .iter() + .map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x, fmt, &tz))); + + PrimitiveArray::from_trusted_len_iter(iter) + .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(timezone))) +} + +/// Parses a [`Utf8Array`] to a timezone-aware timestamp, i.e. [`PrimitiveArray`] with type `Timestamp(Nanosecond, Some(timezone))`. 
+/// # Implementation +/// * parsed values with timezone other than `timezone` are converted to `timezone`. +/// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp`] to parse naive timezones. +/// * Null elements remain null; non-parsable elements are null. +/// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`. +/// # Error +/// This function errors iff `timezone` is not parsable to an offset. +pub fn utf8_to_timestamp( + array: &Utf8Array, + fmt: &str, + timezone: String, +) -> Result> { + let tz = parse_offset(timezone.as_str()); + let time_unit = TimeUnit::Second; + + if let Ok(tz) = tz { + Ok(crate::temporal_conversions::utf8_to_timestamp_impl( + array, fmt, timezone, tz, + )) + } else { + crate::temporal_conversions::chrono_tz_utf_to_timestamp(array, fmt, timezone, time_unit) + } +} + +/// Parses a [`Utf8Array`] to naive timestamp, i.e. +/// [`PrimitiveArray`] with type `Timestamp(Nanosecond, None)`. +/// Timezones are ignored. +/// Null elements remain null; non-parsable elements are set to null. 
+pub fn utf8_to_naive_timestamp( array: &Utf8Array, fmt: &str, + time_unit: TimeUnit, +) -> PrimitiveArray { + let iter = array + .iter() + .map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt))); + + PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(time_unit, None)) +} + +fn utf8view_to_timestamp_impl( + array: &Utf8ViewArray, + fmt: &str, timezone: String, tz: T, ) -> PrimitiveArray { @@ -420,20 +478,50 @@ pub fn parse_offset_tz(timezone: &str) -> Result { #[cfg(feature = "chrono-tz")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))] -fn chrono_tz_utf_to_timestamp_ns( +fn chrono_tz_utf_to_timestamp( array: &Utf8Array, fmt: &str, - timezone: String, + time_zone: String, + time_unit: TimeUnit, ) -> Result> { - let tz = parse_offset_tz(&timezone)?; - Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) + let tz = parse_offset_tz(&time_zone)?; + Ok(utf8view_to_timestamp_impl( + array, fmt, time_zone, tz, + )) } #[cfg(not(feature = "chrono-tz"))] -fn chrono_tz_utf_to_timestamp_ns( +fn chrono_tz_utf_to_timestamp( _: &Utf8Array, _: &str, timezone: String, + _: TimeUnit, +) -> Result> { + Err(Error::InvalidArgumentError(format!( + "timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)", + ))) +} + +#[cfg(feature = "chrono-tz")] +#[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))] +fn chrono_tz_utfview_to_timestamp( + array: &Utf8ViewArray, + fmt: &str, + time_zone: String, + time_unit: TimeUnit, +) -> Result> { + let tz = parse_offset_tz(&time_zone)?; + Ok(utf8view_to_timestamp_impl( + array, fmt, time_zone, tz, + )) +} + +#[cfg(not(feature = "chrono-tz"))] +fn chrono_tz_utfview_to_timestamp( + _: &Utf8ViewArray, + _: &str, + timezone: String, + _: TimeUnit, ) -> Result> { Err(Error::InvalidArgumentError(format!( "timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)", @@ -448,17 +536,20 @@ fn chrono_tz_utf_to_timestamp_ns( /// The feature `"chrono-tz"` enables IANA and zoneinfo formats for 
`timezone`. /// # Error /// This function errors iff `timezone` is not parsable to an offset. -pub fn utf8_to_timestamp_ns( - array: &Utf8Array, +pub fn utf8view_to_timestamp( + array: &Utf8ViewArray, fmt: &str, timezone: String, + time_unit: TimeUnit ) -> Result> { let tz = parse_offset(timezone.as_str()); if let Ok(tz) = tz { - Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) + Ok(utf8view_to_timestamp_impl( + array, fmt, timezone, tz, + )) } else { - chrono_tz_utf_to_timestamp_ns(array, fmt, timezone) + chrono_tz_utfview_to_timestamp(array, fmt, timezone, time_unit) } } @@ -466,15 +557,16 @@ pub fn utf8_to_timestamp_ns( /// [`PrimitiveArray`] with type `Timestamp(Nanosecond, None)`. /// Timezones are ignored. /// Null elements remain null; non-parsable elements are set to null. -pub fn utf8_to_naive_timestamp_ns( - array: &Utf8Array, +pub fn utf8view_to_naive_timestamp( + array: &Utf8ViewArray, fmt: &str, + time_unit: TimeUnit, ) -> PrimitiveArray { let iter = array .iter() .map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt))); - PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(TimeUnit::Nanosecond, None)) + PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(time_unit, None)) } fn add_month(year: i32, month: u32, months: i32) -> chrono::NaiveDate { diff --git a/src/trusted_len.rs b/src/trusted_len.rs index a1c38bd51c7..4bdce32e499 100644 --- a/src/trusted_len.rs +++ b/src/trusted_len.rs @@ -1,4 +1,5 @@ //! Declares [`TrustedLen`]. +use std::iter::Scan; use std::slice::Iter; /// An iterator of known, fixed size. 
@@ -13,8 +14,6 @@ pub unsafe trait TrustedLen: Iterator {} unsafe impl TrustedLen for Iter<'_, T> {} -unsafe impl B> TrustedLen for std::iter::Map {} - unsafe impl<'a, I, T: 'a> TrustedLen for std::iter::Copied where I: TrustedLen, @@ -55,3 +54,69 @@ unsafe impl TrustedLen for std::vec::IntoIter {} unsafe impl TrustedLen for std::iter::Repeat {} unsafe impl A> TrustedLen for std::iter::RepeatWith {} unsafe impl TrustedLen for std::iter::Take {} + +unsafe impl TrustedLen for &mut dyn TrustedLen {} +unsafe impl TrustedLen for Box + '_> {} + +unsafe impl B> TrustedLen for std::iter::Map {} + +unsafe impl TrustedLen for std::iter::Rev {} + +unsafe impl, J> TrustedLen for TrustMyLength {} +unsafe impl TrustedLen for std::ops::Range where std::ops::Range: Iterator {} +unsafe impl TrustedLen for std::ops::RangeInclusive where std::ops::RangeInclusive: Iterator +{} +unsafe impl TrustedLen for std::iter::StepBy {} + +unsafe impl TrustedLen for Scan +where + F: FnMut(&mut St, I::Item) -> Option, + I: TrustedLen + Iterator, +{ +} + +unsafe impl TrustedLen for hashbrown::hash_map::IntoIter {} + +#[derive(Clone)] +pub struct TrustMyLength, J> { + iter: I, + len: usize, +} + +impl TrustMyLength +where + I: Iterator, +{ + #[inline] + pub fn new(iter: I, len: usize) -> Self { + Self { iter, len } + } +} + +impl Iterator for TrustMyLength +where + I: Iterator, +{ + type Item = J; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next() + } + + fn size_hint(&self) -> (usize, Option) { + (self.len, Some(self.len)) + } +} + +impl ExactSizeIterator for TrustMyLength where I: Iterator {} + +impl DoubleEndedIterator for TrustMyLength +where + I: Iterator + DoubleEndedIterator, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.iter.next_back() + } +} diff --git a/src/types/mod.rs b/src/types/mod.rs index 165e4bd5921..ffb07eb7145 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -57,6 +57,8 @@ pub enum PrimitiveType { UInt32, /// An unsigned 64-bit integer. 
UInt64, + /// An unsigned 128-bit integer. + UInt128, /// A 16-bit floating point number. Float16, /// A 32-bit floating point number. @@ -70,6 +72,8 @@ pub enum PrimitiveType { } mod private { + use crate::array::View; + pub trait Sealed {} impl Sealed for u8 {} @@ -81,10 +85,12 @@ mod private { impl Sealed for i32 {} impl Sealed for i64 {} impl Sealed for i128 {} + impl Sealed for u128 {} impl Sealed for super::i256 {} impl Sealed for super::f16 {} impl Sealed for f32 {} impl Sealed for f64 {} impl Sealed for super::days_ms {} impl Sealed for super::months_days_ns {} + impl Sealed for View {} } diff --git a/src/types/native.rs b/src/types/native.rs index 6e50a1454ea..4fecc42fb58 100644 --- a/src/types/native.rs +++ b/src/types/native.rs @@ -86,6 +86,7 @@ native_type!(i64, PrimitiveType::Int64); native_type!(f32, PrimitiveType::Float32); native_type!(f64, PrimitiveType::Float64); native_type!(i128, PrimitiveType::Int128); +native_type!(u128, PrimitiveType::UInt128); /// The in-memory representation of the DayMillisecond variant of arrow's "Interval" logical type. 
#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash, Zeroable, Pod)] diff --git a/tests/it/ffi/data.rs b/tests/it/ffi/data.rs index e5675ac60fe..f3110eeaa41 100644 --- a/tests/it/ffi/data.rs +++ b/tests/it/ffi/data.rs @@ -51,6 +51,11 @@ fn bool() -> Result<()> { test_round_trip(data) } +fn binview_nullable_inlined() -> Result<()> { + let data = Utf8ViewArray::from_slice([Some("foo"), None, Some("barbar"), None]); + test_round_trip(data) +} + #[test] fn bool_nullable_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, true]).sliced(1, 3); @@ -362,3 +367,14 @@ fn extension_children() -> Result<()> { ); test_round_trip_schema(field) } + +fn binview_nullable_buffered() -> Result<()> { + let data = Utf8ViewArray::from_slice([ + Some("foobaroiwalksdfjoiei"), + None, + Some("barbar"), + None, + Some("aoisejiofjfoiewjjwfoiwejfo"), + ]); + test_round_trip(data) +} diff --git a/tests/it/io/ipc/mod.rs b/tests/it/io/ipc/mod.rs index 6d3e71c5db4..3dbdc3ee049 100644 --- a/tests/it/io/ipc/mod.rs +++ b/tests/it/io/ipc/mod.rs @@ -17,3 +17,83 @@ mod read_stream_async; mod read_file_async; mod mmap; +use std::io::Cursor; +use std::sync::Arc; + +use arrow2::array::*; +use arrow2::chunk::Chunk; +use arrow2::datatypes::{Schema, SchemaRef, Field}; +use arrow2::error::*; +use arrow2::io::ipc::read::{read_file_metadata, FileReader}; +use arrow2::io::ipc::write::*; +use arrow2::io::ipc::IpcField; + +pub(crate) fn write( + batches: &[Chunk>], + schema: &SchemaRef, + ipc_fields: Option>, + compression: Option, +) -> Result> { + let result = vec![]; + let options = WriteOptions { compression }; + let mut writer = FileWriter::try_new(result, schema.clone(), ipc_fields.clone(), options)?; + for batch in batches { + writer.write(batch, ipc_fields.as_ref().map(|x| x.as_ref()))?; + } + writer.finish()?; + Ok(writer.into_inner()) +} + +fn round_trip( + columns: Chunk>, + schema: SchemaRef, + ipc_fields: Option>, + compression: Option, +) -> Result<()> { + let 
(expected_schema, expected_batches) = (schema.clone(), vec![columns]); + + let result = write(&expected_batches, &schema, ipc_fields, compression)?; + let mut reader = Cursor::new(result); + let metadata = read_file_metadata(&mut reader)?; + let schema = metadata.schema.clone(); + + let reader = FileReader::new(reader, metadata, None, None); + + assert_eq!(schema, expected_schema); + + let batches = reader.collect::>>()?; + + assert_eq!(batches, expected_batches); + Ok(()) +} + +fn prep_schema(array: &dyn Array) -> SchemaRef { + let fields = vec![Field::new("a", array.data_type().clone(), true)]; + Arc::new(Schema::from(fields)) +} + +#[test] +fn write_boolean() -> Result<()> { + let array = BooleanArray::from([Some(true), Some(false), None, Some(true)]).boxed(); + let schema = prep_schema(array.as_ref()); + let columns = Chunk::try_new(vec![array])?; + round_trip(columns, schema, None, Some(Compression::ZSTD)) +} + +#[test] +fn write_sliced_utf8() -> Result<()> { + let array = Utf8Array::::from_slice(["aa", "bb"]) + .sliced(1, 1) + .boxed(); + let schema = prep_schema(array.as_ref()); + let columns = Chunk::try_new(vec![array])?; + round_trip(columns, schema, None, Some(Compression::ZSTD)) +} + +#[test] +fn write_binview() -> Result<()> { + let array = Utf8ViewArray::from_slice([Some("foo"), Some("bar"), None, Some("hamlet")]).boxed(); + let schema = prep_schema(array.as_ref()); + let columns = Chunk::try_new(vec![array])?; + round_trip(columns, schema, None, Some(Compression::ZSTD)) +} diff --git a/tests/it/io/ipc/write/file.rs b/tests/it/io/ipc/write/file.rs index 5562f803c50..f62bae11430 100644 --- a/tests/it/io/ipc/write/file.rs +++ b/tests/it/io/ipc/write/file.rs @@ -18,7 +18,7 @@ pub(crate) fn write( ) -> Result> { let result = vec![]; let options = WriteOptions { compression }; - let mut writer = FileWriter::try_new(result, schema.clone(), ipc_fields.clone(), options)?; + let mut writer = FileWriter::try_new(result, schema.clone().into(), 
ipc_fields.clone(), options)?; for batch in batches { writer.write(batch, ipc_fields.as_ref().map(|x| x.as_ref()))?; } diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs index 1bb206de5ad..ecabb1e4fb0 100644 --- a/tests/it/temporal_conversions.rs +++ b/tests/it/temporal_conversions.rs @@ -9,21 +9,18 @@ use chrono::NaiveDateTime; fn naive() { let expected = "Timestamp(Nanosecond, None)[1996-12-19 16:39:57, 1996-12-19 13:39:57, None]"; let fmt = "%Y-%m-%dT%H:%M:%S:z"; - let array = Utf8Array::::from_slice([ + let slice = [ "1996-12-19T16:39:57-02:00", "1996-12-19T13:39:57-03:00", "1996-12-19 13:39:57-03:00", // missing T - ]); - let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + ]; + let array = Utf8ViewArray::from_slice_values(slice); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt, TimeUnit::Nanosecond); assert_eq!(format!("{r:?}"), expected); let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info - let array = Utf8Array::::from_slice([ - "1996-12-19T16:39:57-02:00", - "1996-12-19T13:39:57-03:00", - "1996-12-19 13:39:57-03:00", // missing T - ]); - let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + let array = Utf8ViewArray::from_slice_values(slice); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt, TimeUnit::Nanosecond); assert_eq!(format!("{r:?}"), expected); } @@ -115,12 +112,12 @@ fn scalar_tz_aware_no_timezone() { fn naive_no_tz() { let expected = "Timestamp(Nanosecond, None)[1996-12-19 16:39:57, 1996-12-19 13:39:57, None]"; let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info - let array = Utf8Array::::from_slice([ + let array = Utf8ViewArray::from_slice_values([ "1996-12-19T16:39:57", "1996-12-19T13:39:57", "1996-12-19 13:39:57", // missing T ]); - let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt, TimeUnit::Nanosecond); assert_eq!(format!("{r:?}"), expected); } @@ 
-197,12 +194,12 @@ fn tz_aware() { let expected = "Timestamp(Nanosecond, Some(\"-02:00\"))[1996-12-19 16:39:57 -02:00, 1996-12-19 17:39:57 -02:00, None]"; let fmt = "%Y-%m-%dT%H:%M:%S%.f%:z"; - let array = Utf8Array::::from_slice([ + let array = Utf8ViewArray::from_slice_values([ "1996-12-19T16:39:57.0-02:00", "1996-12-19T16:39:57.0-03:00", // same time at a different TZ "1996-12-19 13:39:57.0-03:00", ]); - let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); + let r = temporal_conversions::utf8view_to_timestamp(&array, fmt, tz).unwrap(); assert_eq!(format!("{r:?}"), expected); } @@ -211,12 +208,12 @@ fn tz_aware_no_timezone() { let tz = "-02:00".to_string(); let expected = "Timestamp(Nanosecond, Some(\"-02:00\"))[None, None, None]"; let fmt = "%Y-%m-%dT%H:%M:%S%.f"; - let array = Utf8Array::::from_slice([ + let array = Utf8ViewArray::from_slice_values([ "1996-12-19T16:39:57.0", "1996-12-19T17:39:57.0", "1996-12-19 13:39:57.0", ]); - let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); + let r = temporal_conversions::utf8view_to_timestamp(&array, fmt, tz).unwrap(); assert_eq!(format!("{r:?}"), expected); }