From 54797de5e4860cebc4eb73ad1890457cd1a658eb Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 2 Feb 2022 21:00:18 +0100 Subject: [PATCH] `PrimitiveArray -> MutableArray` (#794) --- Cargo.toml | 1 + src/array/mod.rs | 2 +- src/array/primitive/mod.rs | 36 ++++++++++ src/bitmap/immutable.rs | 17 ++++- src/buffer/bytes.rs | 95 ++++++++++++++++---------- src/buffer/foreign.rs | 55 +++++++++++++++ src/buffer/immutable.rs | 28 +++++++- src/buffer/mod.rs | 1 + src/ffi/ffi.rs | 4 +- tests/it/array/primitive/mod.rs | 1 + tests/it/array/primitive/to_mutable.rs | 53 ++++++++++++++ 11 files changed, 252 insertions(+), 41 deletions(-) create mode 100644 src/buffer/foreign.rs create mode 100644 tests/it/array/primitive/to_mutable.rs diff --git a/Cargo.toml b/Cargo.toml index e13c15dc143..42171337c0f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ name = "arrow2" bench = false [dependencies] +either = "1.6" num-traits = "0.2" bytemuck = { version = "1", features = ["derive"] } chrono = { version = "0.4", default_features = false, features = ["std"] } diff --git a/src/array/mod.rs b/src/array/mod.rs index 1ee56e252b7..7f27fe650cd 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -29,7 +29,7 @@ use crate::{ pub(self) mod physical_binary; /// A trait representing an immutable Arrow array. Arrow arrays are trait objects -/// that are infalibly downcasted to concrete types according to the [`Array::data_type`]. +/// that are infallibly downcasted to concrete types according to the [`Array::data_type`]. pub trait Array: Send + Sync { /// Convert to trait object. fn as_any(&self) -> &dyn Any; diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 6da02cc5313..1c2563e3e02 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -7,6 +7,7 @@ use crate::{ }; use super::Array; +use either::Either; mod display; mod ffi; @@ -184,6 +185,41 @@ impl PrimitiveArray { validity: self.validity, } } + /// Try to convert this `PrimitiveArray` to a `MutablePrimitiveArray` + pub fn into_mut(self) -> Either> { + use Either::*; + + if let Some(bitmap) = self.validity { + match bitmap.into_mut() { + Left(bitmap) => Left(PrimitiveArray::from_data( + self.data_type, + self.values, + Some(bitmap), + )), + Right(mutable_bitmap) => match self.values.get_vec() { + Left(buffer) => Left(PrimitiveArray::from_data( + self.data_type, + buffer, + Some(mutable_bitmap.into()), + )), + Right(values) => Right(MutablePrimitiveArray::from_data( + self.data_type, + values, + Some(mutable_bitmap), + )), + }, + } + } else { + match self.values.get_vec() { + Left(buffer) => Left(PrimitiveArray::from_data(self.data_type, buffer, None)), + Right(values) => Right(MutablePrimitiveArray::from_data( + self.data_type, + values, + None, + )), + } + } + } } impl Array for PrimitiveArray { diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index eea39d78480..25d7f244968 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -1,3 +1,4 @@ +use either::Either; use std::iter::FromIterator; use std::sync::Arc; @@ -107,7 +108,7 @@ impl Bitmap { self.null_count } - /// Slices `self`, offseting by `offset` and truncating up to `length` bits. + /// Slices `self`, offsetting by `offset` and truncating up to `length` bits. /// # Panic /// Panics iff `self.offset + offset + length >= self.bytes.len() * 8`, i.e. if the offset and `length` /// exceeds the allocated capacity of `self`. @@ -175,6 +176,20 @@ impl Bitmap { pub(crate) fn offset(&self) -> usize { self.offset } + + /// Try to convert this `Bitmap` to a `MutableBitmap` + pub fn into_mut(mut self) -> Either { + match ( + self.offset, + Arc::get_mut(&mut self.bytes).and_then(|b| b.get_vec()), + ) { + (0, Some(v)) => { + let data = std::mem::take(v); + Either::Right(MutableBitmap::from_vec(data, self.length)) + } + _ => Either::Left(self), + } + } } impl> From

for Bitmap { diff --git a/src/buffer/bytes.rs b/src/buffer/bytes.rs index 66d8839992d..109302a4816 100644 --- a/src/buffer/bytes.rs +++ b/src/buffer/bytes.rs @@ -1,17 +1,17 @@ //! This module contains an implementation of a contiguous immutable memory region that knows //! how to de-allocate itself, [`Bytes`]. -use std::slice; use std::{fmt::Debug, fmt::Formatter}; use std::{ptr::NonNull, sync::Arc}; +use super::foreign::MaybeForeign; use crate::ffi; use crate::types::NativeType; /// Mode of deallocating memory regions pub enum Deallocation { /// Native deallocation, using Rust deallocator with Arrow-specific memory aligment - Native(usize), + Native, // Foreign interface, via a callback Foreign(Arc), } @@ -19,8 +19,8 @@ pub enum Deallocation { impl Debug for Deallocation { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match self { - Deallocation::Native(capacity) => { - write!(f, "Deallocation::Native {{ capacity: {} }}", capacity) + Deallocation::Native => { + write!(f, "Deallocation::Native") } Deallocation::Foreign(_) => { write!(f, "Deallocation::Foreign {{ capacity: unknown }}") @@ -36,12 +36,8 @@ impl Debug for Deallocation { /// When the region is allocated by a foreign allocator, [Deallocation::Foreign], this calls the /// foreign deallocator to deallocate the region when it is no longer needed. pub struct Bytes { - /// The raw pointer to be begining of the region - ptr: NonNull, - - /// The number of bytes visible to this region. This is always smaller than its capacity (when avaliable). - len: usize, - + /// inner data + data: MaybeForeign, /// how to deallocate this region deallocation: Deallocation, } @@ -59,13 +55,25 @@ impl Bytes { /// /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. + /// + /// # Panics + /// + /// This function panics if the give deallocation not is `Deallocation::Foreign` #[inline] - pub unsafe fn new(ptr: std::ptr::NonNull, len: usize, deallocation: Deallocation) -> Self { - Self { - ptr, - len, - deallocation, - } + pub unsafe fn from_ffi( + ptr: std::ptr::NonNull, + len: usize, + deallocation: Deallocation, + ) -> Self { + assert!(matches!(deallocation, Deallocation::Foreign(_))); + // This line is technically outside the assumptions of `Vec::from_raw_parts`, since + // `ptr` was not allocated by `Vec`. However, one of the invariants of this struct + // is that we do not expose this region as a `Vec`; we only use `Vec` on it to provide + // immutable access to the region (via `Vec::deref` to `&[T]`). + let data = Vec::from_raw_parts(ptr.as_ptr(), len, len); + let data = MaybeForeign::new(data); + + Self { data, deallocation } } #[inline] @@ -75,24 +83,37 @@ impl Bytes { #[inline] pub fn len(&self) -> usize { - self.len + self.data.len() } #[inline] pub fn ptr(&self) -> NonNull { - self.ptr + debug_assert!(!self.data.as_ptr().is_null()); + unsafe { NonNull::new_unchecked(self.data.as_ptr() as *mut T) } + } + + /// Returns a mutable reference to the internal [`Vec`] if it is natively allocated. + /// Returns `None` if allocated by a foreign interface. + pub fn get_vec(&mut self) -> Option<&mut Vec> { + match &self.deallocation { + Deallocation::Foreign(_) => None, + // Safety: + // The allocation is native so we can share the vec + Deallocation::Native => Some(unsafe { self.data.mut_vec() }), + } } } impl Drop for Bytes { - #[inline] fn drop(&mut self) { - match &self.deallocation { - Deallocation::Native(capacity) => unsafe { - let _ = Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity); - }, - // foreign interface knows how to deallocate itself. - Deallocation::Foreign(_) => (), + match self.deallocation { + // a foreign interface knows how to deallocate itself + Deallocation::Foreign(_) => {} + Deallocation::Native => { + // Safety: + // the allocation is native, so we can safely drop + unsafe { self.data.drop_local() } + } } } } @@ -101,7 +122,7 @@ impl std::ops::Deref for Bytes { type Target = [T]; fn deref(&self) -> &[T] { - unsafe { slice::from_raw_parts(self.ptr.as_ptr(), self.len) } + &self.data } } @@ -113,7 +134,12 @@ impl PartialEq for Bytes { impl Debug for Bytes { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "Bytes {{ ptr: {:?}, len: {}, data: ", self.ptr, self.len,)?; + write!( + f, + "Bytes {{ ptr: {:?}, len: {}, data: ", + self.data.as_ptr(), + self.len(), + )?; f.debug_list().entries(self.iter()).finish()?; @@ -123,15 +149,12 @@ impl Debug for Bytes { impl From> for Bytes { #[inline] - fn from(mut data: Vec) -> Self { - let ptr = NonNull::new(data.as_mut_ptr()).unwrap(); - let len = data.len(); - let capacity = data.capacity(); - - let result = unsafe { Bytes::new(ptr, len, Deallocation::Native(capacity)) }; - // so that the memory region is not deallocated. - std::mem::forget(data); - result + fn from(data: Vec) -> Self { + let data = MaybeForeign::new(data); + Self { + data, + deallocation: Deallocation::Native, + } } } diff --git a/src/buffer/foreign.rs b/src/buffer/foreign.rs new file mode 100644 index 00000000000..ca5cb7f0225 --- /dev/null +++ b/src/buffer/foreign.rs @@ -0,0 +1,55 @@ +// this code is in its own module so that inner types are not accessible +// as that might break invariants assumptions +use crate::types::NativeType; +use std::mem::ManuallyDrop; +use std::ops::{Deref, DerefMut}; + +/// Holds a `Vec` that may hold a pointer that is not allocated by `Vec`. It is therefore not +/// safe to deallocate the inner type naively +/// +/// This struct exists to avoid holding an `enum` of a `Vec` or a foreign pointer, whose `deref` +/// is known to be least 50% more expensive than the deref of a `Vec`. +/// +/// # Safety +/// +/// it is unsafe to take and drop the inner value of `MaybeForeign` +/// Doing so is only allowed if the `Vec` was created from the native `Vec` allocator +pub(super) struct MaybeForeign { + inner: ManuallyDrop>, +} + +impl MaybeForeign { + #[inline] + pub(super) fn new(data: Vec) -> Self { + Self { + inner: ManuallyDrop::new(data), + } + } + + /// # Safety + /// This function may only be called if the inner `Vec` was allocated + /// by `Vec` allocator `A`. + #[inline] + pub(super) unsafe fn drop_local(&mut self) { + let data = std::mem::take(&mut self.inner); + let _data = ManuallyDrop::into_inner(data); + } + + /// # Safety + /// This function may only be called if the inner `Vec` was allocated + /// in Rust and the default `Vec` allocator `A`. + /// Otherwise, users may reallocate `Vec`, which is unsound + #[inline] + pub(super) unsafe fn mut_vec(&mut self) -> &mut Vec { + self.inner.deref_mut() + } +} + +impl Deref for MaybeForeign { + type Target = [T]; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.inner + } +} diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index a890699afd3..ec67febbe9f 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -1,3 +1,4 @@ +use either::Either; use std::{iter::FromIterator, sync::Arc, usize}; use crate::{trusted_len::TrustedLen, types::NativeType}; @@ -84,7 +85,13 @@ impl Buffer { /// Returns the byte slice stored in this buffer #[inline] pub fn as_slice(&self) -> &[T] { - &self.data[self.offset..self.offset + self.length] + // Safety: + // invariant of this struct `offset + length <= data.len()` + debug_assert!(self.offset + self.length <= self.data.len()); + unsafe { + self.data + .get_unchecked(self.offset..self.offset + self.length) + } } /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`. @@ -123,6 +130,25 @@ impl Buffer { pub fn offset(&self) -> usize { self.offset } + + /// Try to get the inner data as a mutable [`Vec`]. + /// This succeeds iff: + /// * This data was allocated by Rust (i.e. it does not come from the C data interface) + /// * This region is not being shared any other struct. + /// * This buffer has no offset + pub fn get_vec(mut self) -> Either> { + if self.offset != 0 { + Either::Left(self) + } else { + match Arc::get_mut(&mut self.data).and_then(|b| b.get_vec()) { + Some(v) => { + let data = std::mem::take(v); + Either::Right(data) + } + None => Either::Left(self), + } + } + } } impl Buffer { diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index dc055f8ce04..5841a147fd4 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -4,5 +4,6 @@ mod immutable; pub(crate) mod bytes; +mod foreign; pub use immutable::Buffer; diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index 73c999a36ac..528a679dd5b 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -205,7 +205,7 @@ unsafe fn create_buffer( let len = buffer_len(array, data_type, index)?; let offset = buffer_offset(array, data_type, index); let bytes = ptr - .map(|ptr| Bytes::new(ptr, len, deallocation)) + .map(|ptr| Bytes::from_ffi(ptr, len, deallocation)) .ok_or_else(|| { ArrowError::OutOfSpec(format!("The buffer at position {} is null", index)) })?; @@ -240,7 +240,7 @@ unsafe fn create_bitmap( let bytes_len = bytes_for(offset + len); let ptr = NonNull::new(ptr as *mut u8); let bytes = ptr - .map(|ptr| Bytes::new(ptr, bytes_len, deallocation)) + .map(|ptr| Bytes::from_ffi(ptr, bytes_len, deallocation)) .ok_or_else(|| { ArrowError::OutOfSpec(format!( "The buffer {} is a null pointer and cannot be interpreted as a bitmap", diff --git a/tests/it/array/primitive/mod.rs b/tests/it/array/primitive/mod.rs index 1ca5114927a..341cc0615de 100644 --- a/tests/it/array/primitive/mod.rs +++ b/tests/it/array/primitive/mod.rs @@ -9,6 +9,7 @@ use arrow2::{ }; mod mutable; +mod to_mutable; #[test] fn basics() { diff --git a/tests/it/array/primitive/to_mutable.rs b/tests/it/array/primitive/to_mutable.rs new file mode 100644 index 00000000000..eadd5fb853d --- /dev/null +++ b/tests/it/array/primitive/to_mutable.rs @@ -0,0 +1,53 @@ +use arrow2::array::PrimitiveArray; +use arrow2::bitmap::Bitmap; +use arrow2::datatypes::DataType; +use either::Either; + +#[test] +fn array_to_mutable() { + let data = vec![1, 2, 3]; + let arr = PrimitiveArray::from_data(DataType::Int32, data.into(), None); + + // to mutable push and freeze again + let mut mut_arr = arr.into_mut().unwrap_right(); + mut_arr.push(Some(5)); + let immut: PrimitiveArray = mut_arr.into(); + assert_eq!(immut.values().as_slice(), [1, 2, 3, 5]); + + // let's cause a realloc and see if miri is ok + let mut mut_arr = immut.into_mut().unwrap_right(); + mut_arr.extend_constant(256, Some(9)); + let immut: PrimitiveArray = mut_arr.into(); + assert_eq!(immut.values().len(), 256 + 4); +} + +#[test] +fn array_to_mutable_not_owned() { + let data = vec![1, 2, 3]; + let arr = PrimitiveArray::from_data(DataType::Int32, data.into(), None); + let arr2 = arr.clone(); + + // to the `to_mutable` should fail and we should get back the original array + match arr2.into_mut() { + Either::Left(arr2) => { + assert_eq!(arr, arr2); + } + _ => panic!(), + } +} + +#[test] +#[allow(clippy::redundant_clone)] +fn array_to_mutable_validity() { + let data = vec![1, 2, 3]; + + // both have a single reference should be ok + let bitmap = Bitmap::from_iter([true, false, true]); + let arr = PrimitiveArray::from_data(DataType::Int32, data.clone().into(), Some(bitmap)); + assert!(matches!(arr.into_mut(), Either::Right(_))); + + // now we clone the bitmap increasing the ref count + let bitmap = Bitmap::from_iter([true, false, true]); + let arr = PrimitiveArray::from_data(DataType::Int32, data.into(), Some(bitmap.clone())); + assert!(matches!(arr.into_mut(), Either::Left(_))); +}