From 02205316c914ad863d22548d0f6d760019f49a24 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Fri, 24 Jun 2022 04:21:32 +0000 Subject: [PATCH] Simplified Bytes --- src/buffer/bytes.rs | 159 +++++++++++++++--------------------------- src/buffer/foreign.rs | 55 --------------- src/buffer/mod.rs | 1 - src/ffi/array.rs | 24 +++---- 4 files changed, 68 insertions(+), 171 deletions(-) delete mode 100644 src/buffer/foreign.rs diff --git a/src/buffer/bytes.rs b/src/buffer/bytes.rs index de9b6b73b5a..3d573da5fe5 100644 --- a/src/buffer/bytes.rs +++ b/src/buffer/bytes.rs @@ -1,161 +1,118 @@ -//! This module contains an implementation of a contiguous immutable memory region that knows -//! how to de-allocate itself, [`Bytes`]. - -use std::{fmt::Debug, fmt::Formatter}; +use std::mem::ManuallyDrop; +use std::ops::{Deref, DerefMut}; +use std::panic::RefUnwindSafe; use std::{ptr::NonNull, sync::Arc}; -use super::foreign::MaybeForeign; -use crate::ffi; -use crate::types::NativeType; - /// Mode of deallocating memory regions -pub enum Deallocation { - /// Native deallocation, using Rust deallocator with Arrow-specific memory aligment +enum Allocation { + /// Native allocation Native, - // Foreign interface, via a callback - Foreign(Arc), + // A foreign allocator and its ref count + Foreign(Arc), } -impl Debug for Deallocation { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - match self { - Deallocation::Native => { - write!(f, "Deallocation::Native") - } - Deallocation::Foreign(_) => { - write!(f, "Deallocation::Foreign {{ capacity: unknown }}") - } - } - } -} - -/// A continuous, fixed-size, immutable memory region that knows how to de-allocate itself. +/// A continuous memory region that may be allocated externally. /// -/// In the most common case, this buffer is allocated using Rust's native allocator. -/// However, it may also be allocated by a foreign allocator, [Deallocation::Foreign]. -pub struct Bytes { - /// inner data - data: MaybeForeign, - /// how to deallocate this region - deallocation: Deallocation, +/// In the most common case, this is created from [`Vec`]. +/// However, this region can also be allocated by a foreign allocator. +pub struct Bytes { + /// An implementation using an `enum` of a `Vec` or a foreign pointer is not used + /// because `deref` is at least 50% more expensive than the deref of a `Vec`. + data: ManuallyDrop>, + /// the region was allocated + allocation: Allocation, } -impl Bytes { - /// Takes ownership of an allocated memory region, - /// - /// # Arguments - /// - /// * `ptr` - Pointer to raw parts - /// * `len` - Length of raw parts in **bytes** - /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** - /// +impl Bytes { + /// Takes ownership of an allocated memory region `[ptr, ptr+len[`, /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` - /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. - /// - /// # Panics - /// - /// This function panics if the give deallocation not is `Deallocation::Foreign` + /// This function is safe iff: + /// * the region is properly allocated in that a slice can be safely built from it. + /// * the region is immutable. + /// # Implementation + /// This function leaks iff `owner` does not deallocate the region when dropped. #[inline] - pub unsafe fn from_ffi( + pub unsafe fn from_owned( ptr: std::ptr::NonNull, len: usize, - deallocation: Deallocation, + owner: Arc, ) -> Self { - assert!(matches!(deallocation, Deallocation::Foreign(_))); // This line is technically outside the assumptions of `Vec::from_raw_parts`, since // `ptr` was not allocated by `Vec`. However, one of the invariants of this struct // is that we do not expose this region as a `Vec`; we only use `Vec` on it to provide // immutable access to the region (via `Vec::deref` to `&[T]`). + // MIRI does not complain, which seems to agree with the line of thought. let data = Vec::from_raw_parts(ptr.as_ptr(), len, len); - let data = MaybeForeign::new(data); - - Self { data, deallocation } - } + let data = ManuallyDrop::new(data); - #[inline] - fn as_slice(&self) -> &[T] { - self + Self { + data, + allocation: Allocation::Foreign(owner), + } } + /// The length of the region #[inline] pub fn len(&self) -> usize { self.data.len() } + /// The pointer to the region #[inline] pub fn ptr(&self) -> NonNull { - debug_assert!(!self.data.as_ptr().is_null()); unsafe { NonNull::new_unchecked(self.data.as_ptr() as *mut T) } } - /// Returns a mutable reference to the internal [`Vec`] if it is natively allocated. - /// Returns `None` if allocated by a foreign interface. + /// Returns a `Some` mutable reference of [`Vec`] iff this was initialized + /// from a [`Vec`] and `None` otherwise. + #[inline] pub fn get_vec(&mut self) -> Option<&mut Vec> { - match &self.deallocation { - Deallocation::Foreign(_) => None, - // Safety: - // The allocation is native so we can share the vec - Deallocation::Native => Some(unsafe { self.data.mut_vec() }), + match &self.allocation { + Allocation::Foreign(_) => None, + Allocation::Native => Some(self.data.deref_mut()), } } } -impl Drop for Bytes { +impl Drop for Bytes { + #[inline] fn drop(&mut self) { - match self.deallocation { - // a foreign interface knows how to deallocate itself - Deallocation::Foreign(_) => {} - Deallocation::Native => { - // Safety: - // the allocation is native, so we can safely drop - unsafe { self.data.drop_local() } + match self.allocation { + Allocation::Foreign(_) => { + // The ref count of the foreign is reduced by one + // we can't deallocate `Vec` here since the region was allocated by + // a foreign allocator + } + Allocation::Native => { + let data = std::mem::take(&mut self.data); + let _ = ManuallyDrop::into_inner(data); } } } } -impl std::ops::Deref for Bytes { +impl std::ops::Deref for Bytes { type Target = [T]; + #[inline] fn deref(&self) -> &[T] { &self.data } } -impl PartialEq for Bytes { +impl PartialEq for Bytes { + #[inline] fn eq(&self, other: &Bytes) -> bool { - self.as_slice() == other.as_slice() + self.deref() == other.deref() } } -impl Debug for Bytes { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!( - f, - "Bytes {{ ptr: {:?}, len: {}, data: ", - self.data.as_ptr(), - self.len(), - )?; - - f.debug_list().entries(self.iter()).finish()?; - - write!(f, " }}") - } -} - -impl From> for Bytes { +impl From> for Bytes { #[inline] fn from(data: Vec) -> Self { - let data = MaybeForeign::new(data); Self { - data, - deallocation: Deallocation::Native, + data: ManuallyDrop::new(data), + allocation: Allocation::Native, } } } - -// This is sound because `Bytes` is an immutable container -unsafe impl Send for Bytes {} -unsafe impl Sync for Bytes {} diff --git a/src/buffer/foreign.rs b/src/buffer/foreign.rs deleted file mode 100644 index ca5cb7f0225..00000000000 --- a/src/buffer/foreign.rs +++ /dev/null @@ -1,55 +0,0 @@ -// this code is in its own module so that inner types are not accessible -// as that might break invariants assumptions -use crate::types::NativeType; -use std::mem::ManuallyDrop; -use std::ops::{Deref, DerefMut}; - -/// Holds a `Vec` that may hold a pointer that is not allocated by `Vec`. It is therefore not -/// safe to deallocate the inner type naively -/// -/// This struct exists to avoid holding an `enum` of a `Vec` or a foreign pointer, whose `deref` -/// is known to be least 50% more expensive than the deref of a `Vec`. -/// -/// # Safety -/// -/// it is unsafe to take and drop the inner value of `MaybeForeign` -/// Doing so is only allowed if the `Vec` was created from the native `Vec` allocator -pub(super) struct MaybeForeign { - inner: ManuallyDrop>, -} - -impl MaybeForeign { - #[inline] - pub(super) fn new(data: Vec) -> Self { - Self { - inner: ManuallyDrop::new(data), - } - } - - /// # Safety - /// This function may only be called if the inner `Vec` was allocated - /// by `Vec` allocator `A`. - #[inline] - pub(super) unsafe fn drop_local(&mut self) { - let data = std::mem::take(&mut self.inner); - let _data = ManuallyDrop::into_inner(data); - } - - /// # Safety - /// This function may only be called if the inner `Vec` was allocated - /// in Rust and the default `Vec` allocator `A`. - /// Otherwise, users may reallocate `Vec`, which is unsound - #[inline] - pub(super) unsafe fn mut_vec(&mut self) -> &mut Vec { - self.inner.deref_mut() - } -} - -impl Deref for MaybeForeign { - type Target = [T]; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.inner - } -} diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index 2ae6ff09909..0868908e820 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -3,6 +3,5 @@ mod immutable; pub(crate) mod bytes; -mod foreign; pub use immutable::Buffer; diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 08ae9b68bf2..700e83e2e05 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -4,10 +4,7 @@ use std::{ptr::NonNull, sync::Arc}; use crate::{ array::*, bitmap::{utils::bytes_for, Bitmap}, - buffer::{ - bytes::{Bytes, Deallocation}, - Buffer, - }, + buffer::{bytes::Bytes, Buffer}, datatypes::{DataType, PhysicalType}, error::{Error, Result}, ffi::schema::get_child, @@ -181,7 +178,7 @@ impl ArrowArray { unsafe fn create_buffer( array: &ArrowArray, data_type: &DataType, - deallocation: Deallocation, + owner: Arc, index: usize, ) -> Result> { if array.buffers.is_null() { @@ -197,7 +194,7 @@ unsafe fn create_buffer( let len = buffer_len(array, data_type, index)?; let offset = buffer_offset(array, data_type, index); let bytes = ptr - .map(|ptr| Bytes::from_ffi(ptr, len, deallocation)) + .map(|ptr| Bytes::from_owned(ptr, len, owner)) .ok_or_else(|| Error::OutOfSpec(format!("The buffer at position {} is null", index)))?; Ok(Buffer::from_bytes(bytes).slice(offset, len - offset)) @@ -212,7 +209,7 @@ unsafe fn create_buffer( /// This function assumes that `ceil(self.length * bits, 8)` is the size of the buffer unsafe fn create_bitmap( array: &ArrowArray, - deallocation: Deallocation, + owner: Arc, index: usize, ) -> Result { if array.buffers.is_null() { @@ -228,7 +225,7 @@ unsafe fn create_bitmap( let bytes_len = bytes_for(offset + len); let ptr = NonNull::new(ptr as *mut u8); let bytes = ptr - .map(|ptr| Bytes::from_ffi(ptr, bytes_len, deallocation)) + .map(|ptr| Bytes::from_owned(ptr, bytes_len, owner)) .ok_or_else(|| { Error::OutOfSpec(format!( "The buffer {} is a null pointer and cannot be interpreted as a bitmap", @@ -344,8 +341,8 @@ fn create_dictionary( } pub trait ArrowArrayRef: std::fmt::Debug { - fn deallocation(&self) -> Deallocation { - Deallocation::Foreign(self.parent().clone()) + fn owner(&self) -> Arc { + self.parent().clone() } /// returns the null bit buffer. @@ -358,7 +355,7 @@ pub trait ArrowArrayRef: std::fmt::Debug { if self.array().null_count() == 0 { Ok(None) } else { - create_bitmap(self.array(), self.deallocation(), 0).map(Some) + create_bitmap(self.array(), self.owner(), 0).map(Some) } } @@ -366,15 +363,14 @@ pub trait ArrowArrayRef: std::fmt::Debug { /// The caller must guarantee that the buffer `index` corresponds to a bitmap. /// This function assumes that the bitmap created from FFI is valid; this is impossible to prove. unsafe fn buffer(&self, index: usize) -> Result> { - create_buffer::(self.array(), self.data_type(), self.deallocation(), index) + create_buffer::(self.array(), self.data_type(), self.owner(), index) } /// # Safety /// The caller must guarantee that the buffer `index` corresponds to a bitmap. /// This function assumes that the bitmap created from FFI is valid; this is impossible to prove. unsafe fn bitmap(&self, index: usize) -> Result { - // +1 to ignore null bitmap - create_bitmap(self.array(), self.deallocation(), index) + create_bitmap(self.array(), self.owner(), index) } /// # Safety