From 1cd7453a4f750dedc1b352bb02062499326f1e8b Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Mon, 11 Jul 2022 07:09:36 -0700 Subject: [PATCH] Moved Bytes to own crate (#1141) --- Cargo.toml | 1 + src/array/binary/ffi.rs | 2 +- src/array/boolean/ffi.rs | 2 +- src/array/dictionary/ffi.rs | 2 +- src/array/ffi.rs | 4 +- src/array/fixed_size_binary/ffi.rs | 2 +- src/array/fixed_size_list/ffi.rs | 2 +- src/array/list/ffi.rs | 2 +- src/array/map/ffi.rs | 2 +- src/array/null.rs | 2 +- src/array/primitive/ffi.rs | 2 +- src/array/struct_/ffi.rs | 2 +- src/array/union/ffi.rs | 2 +- src/array/utf8/ffi.rs | 2 +- src/bitmap/immutable.rs | 10 +-- src/buffer/bytes.rs | 118 ----------------------------- src/buffer/immutable.rs | 17 +++-- src/buffer/mod.rs | 4 +- src/ffi/array.rs | 8 +- 19 files changed, 39 insertions(+), 147 deletions(-) delete mode 100644 src/buffer/bytes.rs diff --git a/Cargo.toml b/Cargo.toml index b9d6749aca8..1aada784f46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ name = "arrow2" bench = false [dependencies] +foreign_vec = "0.1.0" either = "1.6" num-traits = "0.2" dyn-clone = "1" diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index c8a72615bce..2c6792237ad 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -9,7 +9,7 @@ use crate::error::Result; use super::BinaryArray; unsafe impl ToFfi for BinaryArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.offsets.as_ptr().cast::()), diff --git a/src/array/boolean/ffi.rs b/src/array/boolean/ffi.rs index da7482c11e2..c2081c49a5d 100644 --- a/src/array/boolean/ffi.rs +++ b/src/array/boolean/ffi.rs @@ -9,7 +9,7 @@ use crate::error::Result; use super::BooleanArray; unsafe impl ToFfi for BooleanArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.values.as_ptr()), diff --git a/src/array/dictionary/ffi.rs b/src/array/dictionary/ffi.rs index 1e06f81e6ed..fb84665276e 100644 --- a/src/array/dictionary/ffi.rs +++ b/src/array/dictionary/ffi.rs @@ -7,7 +7,7 @@ use crate::{ use super::{DictionaryArray, DictionaryKey}; unsafe impl ToFfi for DictionaryArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { self.keys.buffers() } diff --git a/src/array/ffi.rs b/src/array/ffi.rs index c879b7fa037..141cab327e4 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -9,7 +9,7 @@ use crate::error::Result; /// Implementing this trait incorrect will lead to UB pub(crate) unsafe trait ToFfi { /// The pointers to the buffers. - fn buffers(&self) -> Vec>>; + fn buffers(&self) -> Vec>; /// The children fn children(&self) -> Vec> { @@ -47,7 +47,7 @@ macro_rules! ffi_dyn { type BuffersChildren = ( usize, - Vec>>, + Vec>, Vec>, Option>, ); diff --git a/src/array/fixed_size_binary/ffi.rs b/src/array/fixed_size_binary/ffi.rs index 1bb07d322bf..f1775e74d51 100644 --- a/src/array/fixed_size_binary/ffi.rs +++ b/src/array/fixed_size_binary/ffi.rs @@ -8,7 +8,7 @@ use crate::{ use super::FixedSizeBinaryArray; unsafe impl ToFfi for FixedSizeBinaryArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.values.as_ptr().cast::()), diff --git a/src/array/fixed_size_list/ffi.rs b/src/array/fixed_size_list/ffi.rs index 47367b180a5..128496c2952 100644 --- a/src/array/fixed_size_list/ffi.rs +++ b/src/array/fixed_size_list/ffi.rs @@ -9,7 +9,7 @@ use crate::{ }; unsafe impl ToFfi for FixedSizeListArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![self.validity.as_ref().map(|x| x.as_ptr())] } diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 717ca063835..c6f1b2985dd 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -4,7 +4,7 @@ use super::super::{ffi::ToFfi, Array, Offset}; use super::ListArray; unsafe impl ToFfi for ListArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.offsets.as_ptr().cast::()), diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index 538da4cbc3d..bbf3846999a 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -4,7 +4,7 @@ use super::super::{ffi::ToFfi, Array}; use super::MapArray; unsafe impl ToFfi for MapArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.offsets.as_ptr().cast::()), diff --git a/src/array/null.rs b/src/array/null.rs index 6a00fcb3530..041b59ada1d 100644 --- a/src/array/null.rs +++ b/src/array/null.rs @@ -127,7 +127,7 @@ impl std::fmt::Debug for NullArray { } unsafe impl ToFfi for NullArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { // `None` is technically not required by the specification, but older C++ implementations require it, so leaving // it here for backward compatibility vec![None] diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index f1a73692279..de5d6a70584 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -10,7 +10,7 @@ use crate::error::Result; use super::PrimitiveArray; unsafe impl ToFfi for PrimitiveArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.values.as_ptr().cast::()), diff --git a/src/array/struct_/ffi.rs b/src/array/struct_/ffi.rs index 61b36ca1b82..b29a342d889 100644 --- a/src/array/struct_/ffi.rs +++ b/src/array/struct_/ffi.rs @@ -3,7 +3,7 @@ use super::StructArray; use crate::{error::Result, ffi}; unsafe impl ToFfi for StructArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![self.validity.as_ref().map(|x| x.as_ptr())] } diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index 94bcdea42c8..fd12f15b23c 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -4,7 +4,7 @@ use super::super::{ffi::ToFfi, Array}; use super::UnionArray; unsafe impl ToFfi for UnionArray { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { if let Some(offsets) = &self.offsets { vec![ Some(self.types.as_ptr().cast::()), diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index e8caa53b65e..abaed6b28e7 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -8,7 +8,7 @@ use crate::{ use super::Utf8Array; unsafe impl ToFfi for Utf8Array { - fn buffers(&self) -> Vec>> { + fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.offsets.as_ptr().cast::()), diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 5384ae5ec13..f4a7cc7df34 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -1,8 +1,8 @@ +use std::{iter::FromIterator, ops::Deref, sync::Arc}; + use either::Either; -use std::iter::FromIterator; -use std::sync::Arc; -use crate::{buffer::bytes::Bytes, error::Error, trusted_len::TrustedLen}; +use crate::{buffer::Bytes, error::Error, trusted_len::TrustedLen}; use super::{ chunk_iter_to_vec, @@ -216,8 +216,8 @@ impl Bitmap { /// Returns a pointer to the start of this [`Bitmap`] (ignores `offsets`) /// This pointer is allocated iff `self.len() > 0`. - pub(crate) fn as_ptr(&self) -> std::ptr::NonNull { - self.bytes.ptr() + pub(crate) fn as_ptr(&self) -> *const u8 { + self.bytes.deref().as_ptr() } /// Returns a pointer to the start of this [`Bitmap`] (ignores `offsets`) diff --git a/src/buffer/bytes.rs b/src/buffer/bytes.rs deleted file mode 100644 index 67ad7bed132..00000000000 --- a/src/buffer/bytes.rs +++ /dev/null @@ -1,118 +0,0 @@ -use std::mem::ManuallyDrop; -use std::ops::{Deref, DerefMut}; -use std::panic::RefUnwindSafe; -use std::ptr::NonNull; - -/// Mode of deallocating memory regions -enum Allocation { - /// Native allocation - Native, - // A foreign allocator and its ref count - Foreign(Box), -} - -/// A continuous memory region that may be allocated externally. -/// -/// In the most common case, this is created from [`Vec`]. -/// However, this region can also be allocated by a foreign allocator. -pub struct Bytes { - /// An implementation using an `enum` of a `Vec` or a foreign pointer is not used - /// because `deref` is at least 50% more expensive than the deref of a `Vec`. - data: ManuallyDrop>, - /// the region was allocated - allocation: Allocation, -} - -impl Bytes { - /// Takes ownership of an allocated memory region `[ptr, ptr+len[`, - /// # Safety - /// This function is safe iff: - /// * the region is properly allocated in that a slice can be safely built from it. - /// * the region is immutable. - /// # Implementation - /// This function leaks iff `owner` does not deallocate the region when dropped. - #[inline] - pub unsafe fn from_owned( - ptr: std::ptr::NonNull, - len: usize, - owner: Box, - ) -> Self { - // This line is technically outside the assumptions of `Vec::from_raw_parts`, since - // `ptr` was not allocated by `Vec`. However, one of the invariants of this struct - // is that we do not expose this region as a `Vec`; we only use `Vec` on it to provide - // immutable access to the region (via `Vec::deref` to `&[T]`). - // MIRI does not complain, which seems to agree with the line of thought. - let data = Vec::from_raw_parts(ptr.as_ptr(), len, len); - let data = ManuallyDrop::new(data); - - Self { - data, - allocation: Allocation::Foreign(owner), - } - } - - /// The length of the region - #[inline] - pub fn len(&self) -> usize { - self.data.len() - } - - /// The pointer to the region - #[inline] - pub fn ptr(&self) -> NonNull { - unsafe { NonNull::new_unchecked(self.data.as_ptr() as *mut T) } - } - - /// Returns a `Some` mutable reference of [`Vec`] iff this was initialized - /// from a [`Vec`] and `None` otherwise. - #[inline] - pub fn get_vec(&mut self) -> Option<&mut Vec> { - match &self.allocation { - Allocation::Foreign(_) => None, - Allocation::Native => Some(self.data.deref_mut()), - } - } -} - -impl Drop for Bytes { - #[inline] - fn drop(&mut self) { - match self.allocation { - Allocation::Foreign(_) => { - // The ref count of the foreign is reduced by one - // we can't deallocate `Vec` here since the region was allocated by - // a foreign allocator - } - Allocation::Native => { - let data = std::mem::take(&mut self.data); - let _ = ManuallyDrop::into_inner(data); - } - } - } -} - -impl std::ops::Deref for Bytes { - type Target = [T]; - - #[inline] - fn deref(&self) -> &[T] { - &self.data - } -} - -impl PartialEq for Bytes { - #[inline] - fn eq(&self, other: &Bytes) -> bool { - self.deref() == other.deref() - } -} - -impl From> for Bytes { - #[inline] - fn from(data: Vec) -> Self { - Self { - data: ManuallyDrop::new(data), - allocation: Allocation::Native, - } - } -} diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 8bbe0115d2e..1d7e77add63 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -1,8 +1,8 @@ -use std::{iter::FromIterator, sync::Arc, usize}; +use std::{iter::FromIterator, ops::Deref, sync::Arc, usize}; use crate::types::NativeType; -use super::bytes::Bytes; +use super::Bytes; /// [`Buffer`] is a contiguous memory region of plain old data types /// that can be shared across thread boundaries. @@ -33,7 +33,7 @@ use super::bytes::Bytes; /// // but cloning forbids getting mut since `slice` and `buffer` now share data /// assert_eq!(buffer.get_mut(), None); /// ``` -#[derive(Clone, PartialEq)] +#[derive(Clone)] pub struct Buffer { /// the internal byte buffer. data: Arc>, @@ -46,6 +46,13 @@ pub struct Buffer { length: usize, } +impl PartialEq for Buffer { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.deref() == other.deref() + } +} + impl std::fmt::Debug for Buffer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { std::fmt::Debug::fmt(&**self, f) @@ -127,8 +134,8 @@ impl Buffer { /// Returns a pointer to the start of this buffer. #[inline] - pub(crate) fn as_ptr(&self) -> std::ptr::NonNull { - self.data.ptr() + pub(crate) fn as_ptr(&self) -> *const T { + self.data.deref().as_ptr() } /// Returns the offset of this buffer. diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index 0868908e820..10f62488707 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -2,6 +2,8 @@ mod immutable; -pub(crate) mod bytes; +use crate::ffi::InternalArrowArray; + +pub(crate) type Bytes = foreign_vec::ForeignVec, T>; pub use immutable::Buffer; diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 6bce8162557..fd45ce9652c 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use crate::{ array::*, bitmap::{utils::bytes_for, Bitmap}, - buffer::{bytes::Bytes, Buffer}, + buffer::{Buffer, Bytes}, datatypes::{DataType, PhysicalType}, error::{Error, Result}, ffi::schema::get_child, @@ -101,7 +101,7 @@ impl ArrowArray { let buffers_ptr = buffers .iter() .map(|maybe_buffer| match maybe_buffer { - Some(b) => b.as_ptr() as *const std::os::raw::c_void, + Some(b) => *b as *const std::os::raw::c_void, None => std::ptr::null(), }) .collect::>(); @@ -195,7 +195,7 @@ unsafe fn create_buffer( let len = buffer_len(array, data_type, index)?; let offset = buffer_offset(array, data_type, index); let bytes = ptr - .map(|ptr| Bytes::from_owned(ptr, len, owner)) + .map(|ptr| Bytes::from_foreign(ptr.as_ptr(), len, owner)) .ok_or_else(|| Error::OutOfSpec(format!("The buffer at position {} is null", index)))?; Ok(Buffer::from_bytes(bytes).slice(offset, len - offset)) @@ -226,7 +226,7 @@ unsafe fn create_bitmap( let bytes_len = bytes_for(offset + len); let ptr = NonNull::new(ptr as *mut u8); let bytes = ptr - .map(|ptr| Bytes::from_owned(ptr, bytes_len, owner)) + .map(|ptr| Bytes::from_foreign(ptr.as_ptr(), bytes_len, owner)) .ok_or_else(|| { Error::OutOfSpec(format!( "The buffer {} is a null pointer and cannot be interpreted as a bitmap",