From c48b594cae4e0655f58b5b7e4aa92b69817d6ad2 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 30 Oct 2021 18:32:46 +0000 Subject: [PATCH 1/4] Fixing ffi --- src/array/binary/ffi.rs | 38 ++++++++++++++++++++++---- src/array/binary/mod.rs | 4 --- src/array/boolean/ffi.rs | 32 ++++++++++++++++++++-- src/array/boolean/mod.rs | 3 -- src/array/dictionary/ffi.rs | 13 +++++++-- src/array/dictionary/mod.rs | 4 --- src/array/ffi.rs | 22 +++++++++++---- src/array/fixed_size_binary/ffi.rs | 44 ++++++++++++++++++++++++++++++ src/array/fixed_size_binary/mod.rs | 19 ++----------- src/array/fixed_size_list/ffi.rs | 27 ++++++++++++++++++ src/array/fixed_size_list/mod.rs | 20 ++------------ src/array/list/ffi.rs | 40 +++++++++++++++++++++++---- src/array/list/mod.rs | 7 ++--- src/array/map/ffi.rs | 31 +++++++++++++++++---- src/array/map/mod.rs | 7 ++--- src/array/mod.rs | 6 ++-- src/array/null.rs | 19 ++++++------- src/array/primitive/ffi.rs | 35 +++++++++++++++++++++--- src/array/primitive/mod.rs | 6 +--- src/array/struct_.rs | 5 ---- src/array/union/ffi.rs | 18 +++++++----- src/array/utf8/ffi.rs | 37 ++++++++++++++++++++++--- src/array/utf8/mod.rs | 4 --- src/bitmap/bitmap_ops.rs | 12 ++++++++ src/bitmap/immutable.rs | 6 ++++ src/buffer/immutable.rs | 8 ++++-- src/ffi/ffi.rs | 5 ++-- src/ffi/mod.rs | 3 ++ 28 files changed, 344 insertions(+), 131 deletions(-) create mode 100644 src/array/fixed_size_binary/ffi.rs create mode 100644 src/array/fixed_size_list/ffi.rs diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 4ca8d981ebd..2aa39d3412a 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -1,5 +1,6 @@ use crate::{ array::{FromFfi, Offset, ToFfi}, + bitmap::align, datatypes::DataType, ffi, }; @@ -12,14 +13,41 @@ unsafe impl ToFfi for BinaryArray { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), + Some(self.offsets.as_ptr().cast::()), + Some(self.values.as_ptr().cast::()), ] } - #[inline] - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + let offset = self.offsets.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.offsets.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + offsets: self.offsets.clone(), + values: self.values.clone(), + } } } diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 036e3f74122..ed446561dde 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -23,7 +23,6 @@ pub struct BinaryArray { offsets: Buffer, values: Buffer, validity: Option, - offset: usize, } // constructors @@ -71,7 +70,6 @@ impl BinaryArray { offsets, values, validity, - offset: 0, } } @@ -113,7 +111,6 @@ impl BinaryArray { offsets, values, validity, - offset: 0, } } @@ -146,7 +143,6 @@ impl BinaryArray { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } diff --git a/src/array/boolean/ffi.rs b/src/array/boolean/ffi.rs index e4264d93829..a396a466eeb 100644 --- a/src/array/boolean/ffi.rs +++ b/src/array/boolean/ffi.rs @@ -1,5 +1,6 @@ use crate::{ array::{FromFfi, ToFfi}, + bitmap::align, datatypes::DataType, ffi, }; @@ -16,8 +17,35 @@ unsafe impl ToFfi for BooleanArray { ] } - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + let offset = self.values.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.values.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + values: self.values.clone(), + } } } diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 065eb199775..06935af24ee 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -20,7 +20,6 @@ pub struct BooleanArray { data_type: DataType, values: Bitmap, validity: Option, - offset: usize, } impl BooleanArray { @@ -50,7 +49,6 @@ impl BooleanArray { data_type, values, validity, - offset: 0, } } @@ -83,7 +81,6 @@ impl BooleanArray { data_type: self.data_type.clone(), values: self.values.clone().slice_unchecked(offset, length), validity, - offset: self.offset + offset, } } diff --git a/src/array/dictionary/ffi.rs b/src/array/dictionary/ffi.rs index c6f8f8fa201..111eb281e7a 100644 --- a/src/array/dictionary/ffi.rs +++ b/src/array/dictionary/ffi.rs @@ -11,9 +11,16 @@ unsafe impl ToFfi for DictionaryArray { self.keys.buffers() } - #[inline] - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + self.keys.offset() + } + + fn to_ffi_aligned(&self) -> Self { + Self { + data_type: self.data_type.clone(), + keys: self.keys.to_ffi_aligned(), + values: self.values, + } } } diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index 3b33cb3e17b..012d937d6d9 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -37,7 +37,6 @@ pub struct DictionaryArray { data_type: DataType, keys: PrimitiveArray, values: Arc, - offset: usize, } impl DictionaryArray { @@ -69,7 +68,6 @@ impl DictionaryArray { data_type, keys, values, - offset: 0, } } @@ -81,7 +79,6 @@ impl DictionaryArray { data_type: self.data_type.clone(), keys: self.keys.clone().slice(offset, length), values: self.values.clone(), - offset: self.offset + offset, } } @@ -93,7 +90,6 @@ impl DictionaryArray { data_type: self.data_type.clone(), keys: self.keys.clone().slice_unchecked(offset, length), values: self.values.clone(), - offset: self.offset + offset, } } diff --git a/src/array/ffi.rs b/src/array/ffi.rs index 176b0966ff5..a7f0dd5a280 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -9,17 +9,20 @@ use crate::error::Result; /// [C data interface](https://arrow.apache.org/docs/format/CDataInterface.html) (FFI). /// Safety: /// Implementing this trait incorrect will lead to UB -pub unsafe trait ToFfi { +pub(crate) unsafe trait ToFfi { /// The pointers to the buffers. fn buffers(&self) -> Vec>>; - /// The offset - fn offset(&self) -> usize; - /// The children fn children(&self) -> Vec> { vec![] } + + /// The offset + fn offset(&self) -> Option; + + /// return a partial clone of self with an offset. + fn to_ffi_aligned(&self) -> Self; } /// Trait describing how a struct imports into itself from the @@ -35,17 +38,23 @@ pub trait FromFfi: Sized { macro_rules! ffi_dyn { ($array:expr, $ty:ty) => {{ let array = $array.as_any().downcast_ref::<$ty>().unwrap(); - (array.buffers(), array.children(), None) + ( + array.offset().unwrap(), + array.buffers(), + array.children(), + None, + ) }}; } type BuffersChildren = ( + usize, Vec>>, Vec>, Option>, ); -pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { +pub fn offset_buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { use PhysicalType::*; match array.data_type().to_physical_type() { Null => ffi_dyn!(array, NullArray), @@ -68,6 +77,7 @@ pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { with_match_physical_dictionary_key_type!(key_type, |$T| { let array = array.as_any().downcast_ref::>().unwrap(); ( + array.offset().unwrap(), array.buffers(), array.children(), Some(array.values().clone()), diff --git a/src/array/fixed_size_binary/ffi.rs b/src/array/fixed_size_binary/ffi.rs new file mode 100644 index 00000000000..ce9fbc98fb5 --- /dev/null +++ b/src/array/fixed_size_binary/ffi.rs @@ -0,0 +1,44 @@ +use crate::{array::ToFfi, bitmap::align}; + +use super::FixedSizeBinaryArray; + +unsafe impl ToFfi for FixedSizeBinaryArray { + fn buffers(&self) -> Vec>> { + vec![ + self.validity.as_ref().map(|x| x.as_ptr()), + Some(self.values.as_ptr().cast::()), + ] + } + + fn offset(&self) -> Option { + let offset = self.values.offset() / self.size as usize; + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.values.offset() / self.size; + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + size: self.size, + data_type: self.data_type.clone(), + validity, + values: self.values.clone(), + } + } +} diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index 3091aae3b62..faf3ec7ff72 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -1,7 +1,8 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; -use super::{display_fmt, display_helper, ffi::ToFfi, Array}; +use super::{display_fmt, display_helper, Array}; +mod ffi; mod iterator; mod mutable; pub use mutable::*; @@ -14,7 +15,6 @@ pub struct FixedSizeBinaryArray { data_type: DataType, values: Buffer, validity: Option, - offset: usize, } impl FixedSizeBinaryArray { @@ -47,7 +47,6 @@ impl FixedSizeBinaryArray { data_type, values, validity, - offset: 0, } } @@ -83,7 +82,6 @@ impl FixedSizeBinaryArray { size: self.size, values, validity, - offset: self.offset + offset, } } @@ -182,19 +180,6 @@ impl std::fmt::Display for FixedSizeBinaryArray { } } -unsafe impl ToFfi for FixedSizeBinaryArray { - fn buffers(&self) -> Vec>> { - vec![ - self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), - ] - } - - fn offset(&self) -> usize { - self.offset - } -} - impl FixedSizeBinaryArray { /// Creates a [`FixedSizeBinaryArray`] from an fallible iterator of optional `[u8]`. pub fn try_from_iter, I: IntoIterator>>( diff --git a/src/array/fixed_size_list/ffi.rs b/src/array/fixed_size_list/ffi.rs new file mode 100644 index 00000000000..760879c766b --- /dev/null +++ b/src/array/fixed_size_list/ffi.rs @@ -0,0 +1,27 @@ +use std::sync::Arc; + +use super::super::{ffi::ToFfi, Array}; +use super::FixedSizeListArray; + +unsafe impl ToFfi for FixedSizeListArray { + fn buffers(&self) -> Vec>> { + vec![self.validity.as_ref().map(|x| x.as_ptr())] + } + + fn children(&self) -> Vec> { + vec![self.values().clone()] + } + + fn offset(&self) -> Option { + Some( + self.validity + .as_ref() + .map(|bitmap| bitmap.offset()) + .unwrap_or_default(), + ) + } + + fn to_ffi_aligned(&self) -> Self { + self.clone() + } +} diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 5b78569eed4..6c29c2a6b58 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -5,8 +5,9 @@ use crate::{ datatypes::{DataType, Field}, }; -use super::{display_fmt, ffi::ToFfi, new_empty_array, new_null_array, Array}; +use super::{display_fmt, new_empty_array, new_null_array, Array}; +mod ffi; mod iterator; pub use iterator::*; mod mutable; @@ -20,7 +21,6 @@ pub struct FixedSizeListArray { data_type: DataType, values: Arc, validity: Option, - offset: usize, } impl FixedSizeListArray { @@ -60,7 +60,6 @@ impl FixedSizeListArray { data_type, values, validity, - offset: 0, } } @@ -97,7 +96,6 @@ impl FixedSizeListArray { size: self.size, values, validity, - offset: self.offset + offset, } } @@ -194,17 +192,3 @@ impl std::fmt::Display for FixedSizeListArray { display_fmt(self.iter(), "FixedSizeListArray", f, true) } } - -unsafe impl ToFfi for FixedSizeListArray { - fn buffers(&self) -> Vec>> { - vec![self.validity.as_ref().map(|x| x.as_ptr())] - } - - fn offset(&self) -> usize { - self.offset - } - - fn children(&self) -> Vec> { - vec![self.values().clone()] - } -} diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 14235ce1ddd..3168242228a 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use crate::{array::FromFfi, error::Result, ffi}; +use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; use super::super::{ffi::ToFfi, specification::Offset, Array}; use super::ListArray; @@ -9,17 +9,45 @@ unsafe impl ToFfi for ListArray { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), + Some(self.offsets.as_ptr().cast::()), ] } - fn offset(&self) -> usize { - self.offset - } - fn children(&self) -> Vec> { vec![self.values.clone()] } + + fn offset(&self) -> Option { + let offset = self.offsets.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.offsets.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + offsets: self.offsets.clone(), + values: self.values.clone(), + } + } } impl FromFfi for ListArray { diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 4a92c352fa3..8920f89e4c5 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -25,7 +25,6 @@ pub struct ListArray { offsets: Buffer, values: Arc, validity: Option, - offset: usize, } impl ListArray { @@ -78,7 +77,6 @@ impl ListArray { offsets, values, validity, - offset: 0, } } @@ -107,7 +105,6 @@ impl ListArray { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } @@ -144,8 +141,8 @@ impl ListArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.as_ptr().add(i); - let offset_1 = *self.offsets.as_ptr().add(i + 1); + let offset = *self.offsets.get_unchecked(i); + let offset_1 = *self.offsets.get_unchecked(i + 1); let length = (offset_1 - offset).to_usize(); self.values.slice_unchecked(offset.to_usize(), length) diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index 50b9039c351..f87f473d5f8 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use crate::{array::FromFfi, error::Result, ffi}; +use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; use super::super::{ffi::ToFfi, Array}; use super::MapArray; @@ -9,17 +9,36 @@ unsafe impl ToFfi for MapArray { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), + Some(self.offsets.as_ptr().cast::()), ] } - fn offset(&self) -> usize { - self.offset - } - fn children(&self) -> Vec> { vec![self.field.clone()] } + + fn offset(&self) -> Option { + todo!() + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.offsets.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + offsets: self.offsets.clone(), + field: self.field.clone(), + } + } } impl FromFfi for MapArray { diff --git a/src/array/map/mod.rs b/src/array/map/mod.rs index 0efc83e7a4e..616bc969c12 100644 --- a/src/array/map/mod.rs +++ b/src/array/map/mod.rs @@ -22,7 +22,6 @@ pub struct MapArray { field: Arc, // invariant: offsets.len() - 1 == Bitmap::len() validity: Option, - offset: usize, } impl MapArray { @@ -81,7 +80,6 @@ impl MapArray { data_type, field, offsets, - offset: 0, validity, } } @@ -111,7 +109,6 @@ impl MapArray { offsets, field: self.field.clone(), validity, - offset: self.offset + offset, } } } @@ -148,8 +145,8 @@ impl MapArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.as_ptr().add(i); - let offset_1 = *self.offsets.as_ptr().add(i + 1); + let offset = *self.offsets.get_unchecked(i); + let offset_1 = *self.offsets.get_unchecked(i + 1); let length = (offset_1 - offset).to_usize(); self.field.slice_unchecked(offset.to_usize(), length) diff --git a/src/array/mod.rs b/src/array/mod.rs index 52469577ee7..fa8caf7002c 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -393,9 +393,9 @@ pub use struct_::StructArray; pub use union::UnionArray; pub use utf8::{MutableUtf8Array, Utf8Array, Utf8ValuesIter}; -pub(crate) use self::ffi::buffers_children_dictionary; -pub use self::ffi::FromFfi; -pub use self::ffi::ToFfi; +pub(crate) use self::ffi::offset_buffers_children_dictionary; +pub(crate) use self::ffi::FromFfi; +pub(crate) use self::ffi::ToFfi; /// A trait describing the ability of a struct to create itself from a iterator. /// This is similar to [`Extend`], but accepted the creation to error. diff --git a/src/array/null.rs b/src/array/null.rs index 1906cc140f4..cb194e3901e 100644 --- a/src/array/null.rs +++ b/src/array/null.rs @@ -7,7 +7,6 @@ use super::{ffi::ToFfi, Array}; pub struct NullArray { data_type: DataType, length: usize, - offset: usize, } impl NullArray { @@ -23,19 +22,14 @@ impl NullArray { /// Returns a new [`NullArray`]. pub fn from_data(data_type: DataType, length: usize) -> Self { - Self { - data_type, - length, - offset: 0, - } + Self { data_type, length } } /// Returns a slice of the [`NullArray`]. - pub fn slice(&self, offset: usize, length: usize) -> Self { + pub fn slice(&self, _offset: usize, length: usize) -> Self { Self { data_type: self.data_type.clone(), length, - offset: self.offset + offset, } } } @@ -82,8 +76,11 @@ unsafe impl ToFfi for NullArray { vec![] } - #[inline] - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + Some(0) + } + + fn to_ffi_aligned(&self) -> Self { + self.clone() } } diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index 3418e2516fc..5eef467d06a 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -1,5 +1,6 @@ use crate::{ array::{FromFfi, ToFfi}, + bitmap::align, ffi, types::NativeType, }; @@ -12,13 +13,39 @@ unsafe impl ToFfi for PrimitiveArray { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), + Some(self.values.as_ptr().cast::()), ] } - #[inline] - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + let offset = self.values.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.values.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + values: self.values.clone(), + } } } diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index f75ffd9a2b0..37ab0817998 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -35,7 +35,6 @@ pub struct PrimitiveArray { data_type: DataType, values: Buffer, validity: Option, - offset: usize, } impl PrimitiveArray { @@ -75,7 +74,6 @@ impl PrimitiveArray { data_type, values, validity, - offset: 0, } } @@ -108,7 +106,6 @@ impl PrimitiveArray { data_type: self.data_type.clone(), values: self.values.clone().slice_unchecked(offset, length), validity, - offset: self.offset + offset, } } @@ -150,7 +147,7 @@ impl PrimitiveArray { /// Caller must be sure that `i < self.len()` #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> T { - *self.values().as_ptr().add(i) + *self.values.get_unchecked(i) } /// Returns a new [`PrimitiveArray`] with a different logical type. @@ -171,7 +168,6 @@ impl PrimitiveArray { data_type, values: self.values, validity: self.validity, - offset: self.offset, } } } diff --git a/src/array/struct_.rs b/src/array/struct_.rs index 8dc327a5719..b8b24ffeb00 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -223,11 +223,6 @@ unsafe impl ToFfi for StructArray { vec![self.validity.as_ref().map(|x| x.as_ptr())] } - fn offset(&self) -> usize { - // we do not support offsets in structs. Instead, if an FFI we slice the incoming arrays - 0 - } - fn children(&self) -> Vec> { self.values.clone() } diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index 640a7d6f0ab..6fc8ee99f2e 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -10,21 +10,25 @@ unsafe impl ToFfi for UnionArray { if let Some(offsets) = &self.offsets { vec![ None, - std::ptr::NonNull::new(self.types.as_ptr() as *mut u8), - std::ptr::NonNull::new(offsets.as_ptr() as *mut u8), + Some(self.types.as_ptr().cast::()), + Some(offsets.as_ptr().cast::()), ] } else { - vec![None, std::ptr::NonNull::new(self.types.as_ptr() as *mut u8)] + vec![None, Some(self.types.as_ptr().cast::())] } } - fn offset(&self) -> usize { - self.offset - } - fn children(&self) -> Vec> { self.fields.clone() } + + fn offset(&self) -> Option { + todo!() + } + + fn to_ffi_aligned(&self) -> Self { + todo!() + } } impl FromFfi for UnionArray { diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 4cc716b8e4d..c2c03fa4573 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -1,5 +1,6 @@ use crate::{ array::{FromFfi, Offset, ToFfi}, + bitmap::align, error::Result, ffi, }; @@ -10,13 +11,41 @@ unsafe impl ToFfi for Utf8Array { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), + Some(self.offsets.as_ptr().cast::()), + Some(self.values.as_ptr().cast::()), ] } - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + let offset = self.offsets.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.offsets.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + offsets: self.offsets.clone(), + values: self.values.clone(), + } } } diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 1d6b75faec3..ef895092dc9 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -36,7 +36,6 @@ pub struct Utf8Array { offsets: Buffer, values: Buffer, validity: Option, - offset: usize, } impl Utf8Array { @@ -86,7 +85,6 @@ impl Utf8Array { offsets, values, validity, - offset: 0, } } @@ -128,7 +126,6 @@ impl Utf8Array { offsets, values, validity, - offset: 0, } } @@ -161,7 +158,6 @@ impl Utf8Array { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } diff --git a/src/bitmap/bitmap_ops.rs b/src/bitmap/bitmap_ops.rs index 8a95b5f10c7..c7fdfad061f 100644 --- a/src/bitmap/bitmap_ops.rs +++ b/src/bitmap/bitmap_ops.rs @@ -132,6 +132,18 @@ where } } +// create a new bitmap semantically equal to ``bitmap`` but with an offset equal to ``offset`` +pub(crate) fn align(bitmap: &Bitmap, new_offset: usize) -> Bitmap { + let (slice, offset, length) = bitmap.as_slice(); + + let bitmap: Bitmap = std::iter::repeat(false) + .take(new_offset) + .chain(bitmap.iter()) + .collect(); + + bitmap.slice(new_offset, length) +} + #[inline] fn and(lhs: &Bitmap, rhs: &Bitmap) -> Bitmap { binary(lhs, rhs, |x, y| x & y) diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index c2f5e91fd9d..8a98c2caa97 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -168,6 +168,12 @@ impl Bitmap { pub(crate) fn as_ptr(&self) -> std::ptr::NonNull { self.bytes.ptr() } + + /// Returns a pointer to the start of this [`Bitmap`] (ignores `offsets`) + /// This pointer is allocated iff `self.len() > 0`. + pub(crate) fn offset(&self) -> usize { + self.offset + } } impl> From

for Bitmap { diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index eab542e67e3..0f13e3c3e66 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -117,8 +117,12 @@ impl Buffer { /// Returns a pointer to the start of this buffer. #[inline] - pub fn as_ptr(&self) -> *const T { - unsafe { self.data.ptr().as_ptr().add(self.offset) } + pub(crate) fn as_ptr(&self) -> std::ptr::NonNull { + self.data.ptr() + } + + pub(crate) fn offset(&self) -> usize { + self.offset } } diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index c4b839b332f..a89bea46191 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -18,7 +18,7 @@ use std::{ptr::NonNull, sync::Arc}; use crate::{ - array::{buffers_children_dictionary, Array}, + array::{offset_buffers_children_dictionary, Array}, bitmap::{utils::bytes_for, Bitmap}, buffer::{ bytes::{Bytes, Deallocation}, @@ -96,7 +96,8 @@ impl Ffi_ArrowArray { /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. pub(crate) fn new(array: Arc) -> Self { - let (buffers, children, dictionary) = buffers_children_dictionary(array.as_ref()); + let (offset, buffers, children, dictionary) = + offset_buffers_children_dictionary(array.as_ref()); let buffers_ptr = buffers .iter() diff --git a/src/ffi/mod.rs b/src/ffi/mod.rs index 88c8d238cc8..802939382d5 100644 --- a/src/ffi/mod.rs +++ b/src/ffi/mod.rs @@ -2,6 +2,7 @@ //! contains FFI bindings to import and export [`Array`](crate::array::Array) via //! Arrow's [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) mod array; +mod bridge; #[allow(clippy::module_inception)] mod ffi; mod schema; @@ -24,6 +25,8 @@ use self::schema::to_field; /// # Safety /// The pointer `ptr` must be allocated and valid pub unsafe fn export_array_to_c(array: Arc, ptr: *mut Ffi_ArrowArray) { + let array = bridge::align_to_c_data_interface(array); + *ptr = Ffi_ArrowArray::new(array); } From ff930e43c22864fb39548b56ac48135ac74d3e6c Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 1 Nov 2021 06:37:20 +0000 Subject: [PATCH 2/4] Testing. --- .../tests/test_sql.py | 81 +++++++++++++++++-- src/array/dictionary/ffi.rs | 2 +- src/array/ffi.rs | 2 +- src/array/struct_.rs | 13 +++ src/bitmap/bitmap_ops.rs | 4 +- src/buffer/immutable.rs | 1 + src/ffi/bridge.rs | 41 ++++++++++ src/ffi/ffi.rs | 26 ++++-- tests/it/ffi.rs | 25 +++++- 9 files changed, 175 insertions(+), 20 deletions(-) create mode 100644 src/ffi/bridge.rs diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 8aef60832cf..27da0e63515 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -39,15 +39,52 @@ def tearDown(self): # No leak of C++ memory self.assertEqual(self.old_allocated_cpp, pyarrow.total_allocated_bytes()) - def test_string_roundtrip(self): - """ - Python -> Rust -> Python - """ + def test_primitive(self): + a = pyarrow.array([0, None, 2, 3, 4]) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_primitive_sliced(self): + a = pyarrow.array([0, None, 2, 3, 4]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_boolean(self): + a = pyarrow.array([True, None, False, True, False]) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_boolean_sliced(self): + a = pyarrow.array([True, None, False, True, False]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_string(self): a = pyarrow.array(["a", None, "ccc"]) b = arrow_pyarrow_integration_testing.round_trip_array(a) c = pyarrow.array(["a", None, "ccc"]) self.assertEqual(b, c) + def test_string_sliced(self): + a = pyarrow.array(["a", None, "ccc"]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + def test_decimal_roundtrip(self): """ Python -> Rust -> Python @@ -75,10 +112,17 @@ def test_list_array(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type - def test_struct_array(self): - """ - Python -> Rust -> Python - """ + def test_list_sliced(self): + a = pyarrow.array( + [[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64()) + ).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_struct(self): fields = [ ("f1", pyarrow.int32()), ("f2", pyarrow.string()), @@ -99,6 +143,27 @@ def test_struct_array(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type + # see https://issues.apache.org/jira/browse/ARROW-14383 + def _test_struct_sliced(self): + fields = [ + ("f1", pyarrow.int32()), + ("f2", pyarrow.string()), + ] + a = pyarrow.array( + [ + {"f1": 1, "f2": "a"}, + None, + {"f1": 3, "f2": None}, + {"f1": None, "f2": "d"}, + {"f1": None, "f2": None}, + ], + pyarrow.struct(fields), + ).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + def test_list_list_array(self): """ Python -> Rust -> Python diff --git a/src/array/dictionary/ffi.rs b/src/array/dictionary/ffi.rs index 111eb281e7a..6c97539f402 100644 --- a/src/array/dictionary/ffi.rs +++ b/src/array/dictionary/ffi.rs @@ -19,7 +19,7 @@ unsafe impl ToFfi for DictionaryArray { Self { data_type: self.data_type.clone(), keys: self.keys.to_ffi_aligned(), - values: self.values, + values: self.values.clone(), } } } diff --git a/src/array/ffi.rs b/src/array/ffi.rs index a7f0dd5a280..a104403ecac 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -7,7 +7,7 @@ use crate::error::Result; /// Trait describing how a struct presents itself to the /// [C data interface](https://arrow.apache.org/docs/format/CDataInterface.html) (FFI). -/// Safety: +/// # Safety /// Implementing this trait incorrect will lead to UB pub(crate) unsafe trait ToFfi { /// The pointers to the buffers. diff --git a/src/array/struct_.rs b/src/array/struct_.rs index b8b24ffeb00..00cb1042ad2 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -226,6 +226,19 @@ unsafe impl ToFfi for StructArray { fn children(&self) -> Vec> { self.values.clone() } + + fn offset(&self) -> Option { + Some( + self.validity + .as_ref() + .map(|bitmap| bitmap.offset()) + .unwrap_or_default(), + ) + } + + fn to_ffi_aligned(&self) -> Self { + self.clone() + } } impl FromFfi for StructArray { diff --git a/src/bitmap/bitmap_ops.rs b/src/bitmap/bitmap_ops.rs index c7fdfad061f..065ba0dcbd4 100644 --- a/src/bitmap/bitmap_ops.rs +++ b/src/bitmap/bitmap_ops.rs @@ -132,9 +132,9 @@ where } } -// create a new bitmap semantically equal to ``bitmap`` but with an offset equal to ``offset`` +// create a new [`Bitmap`] semantically equal to ``bitmap`` but with an offset equal to ``offset`` pub(crate) fn align(bitmap: &Bitmap, new_offset: usize) -> Bitmap { - let (slice, offset, length) = bitmap.as_slice(); + let length = bitmap.len(); let bitmap: Bitmap = std::iter::repeat(false) .take(new_offset) diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 0f13e3c3e66..540734370b6 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -121,6 +121,7 @@ impl Buffer { self.data.ptr() } + /// Returns a offset of this buffer pub(crate) fn offset(&self) -> usize { self.offset } diff --git a/src/ffi/bridge.rs b/src/ffi/bridge.rs new file mode 100644 index 00000000000..53b9167386f --- /dev/null +++ b/src/ffi/bridge.rs @@ -0,0 +1,41 @@ +use std::sync::Arc; + +use crate::array::*; + +macro_rules! ffi_dyn { + ($array:expr, $ty:ty) => {{ + let a = $array.as_any().downcast_ref::<$ty>().unwrap(); + if a.offset().is_some() { + $array + } else { + Arc::new(a.to_ffi_aligned()) + } + }}; +} + +pub fn align_to_c_data_interface(array: Arc) -> Arc { + use crate::datatypes::PhysicalType::*; + match array.data_type().to_physical_type() { + Null => ffi_dyn!(array, NullArray), + Boolean => ffi_dyn!(array, BooleanArray), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + ffi_dyn!(array, PrimitiveArray<$T>) + }), + Binary => ffi_dyn!(array, BinaryArray), + LargeBinary => ffi_dyn!(array, BinaryArray), + FixedSizeBinary => ffi_dyn!(array, FixedSizeBinaryArray), + Utf8 => ffi_dyn!(array, Utf8Array::), + LargeUtf8 => ffi_dyn!(array, Utf8Array::), + List => ffi_dyn!(array, ListArray::), + LargeList => ffi_dyn!(array, ListArray::), + FixedSizeList => ffi_dyn!(array, FixedSizeListArray), + Struct => ffi_dyn!(array, StructArray), + Union => ffi_dyn!(array, UnionArray), + Map => ffi_dyn!(array, MapArray), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { + ffi_dyn!(array, DictionaryArray<$T>) + }) + } + } +} diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index a89bea46191..586edb302d1 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -102,7 +102,6 @@ impl Ffi_ArrowArray { let buffers_ptr = buffers .iter() .map(|maybe_buffer| match maybe_buffer { - // note that `raw_data` takes into account the buffer's offset Some(b) => b.as_ptr() as *const std::os::raw::c_void, None => std::ptr::null(), }) @@ -131,7 +130,7 @@ impl Ffi_ArrowArray { Self { length, null_count, - offset: 0i64, + offset: offset as i64, n_buffers, n_children, buffers: private_data.buffers_ptr.as_mut_ptr(), @@ -190,13 +189,14 @@ unsafe fn create_buffer( let buffers = array.buffers as *mut *const u8; let len = buffer_len(array, data_type, index)?; + let offset = array.offset as usize; assert!(index < array.n_buffers as usize); let ptr = *buffers.add(index); let ptr = NonNull::new(ptr as *mut T); let bytes = ptr - .map(|ptr| Bytes::new(ptr, len, deallocation)) + .map(|ptr| Bytes::new(ptr, offset + len, deallocation)) .ok_or_else(|| ArrowError::Ffi(format!("The buffer at position {} is null", index))); bytes.map(Buffer::from_bytes) @@ -218,12 +218,13 @@ unsafe fn create_bitmap( return Err(ArrowError::Ffi("The array buffers are null".to_string())); } let len = array.length as usize; + let offset = array.offset as usize; let buffers = array.buffers as *mut *const u8; assert!(index < array.n_buffers as usize); let ptr = *buffers.add(index); - let bytes_len = bytes_for(len); + let bytes_len = bytes_for(offset + len); let ptr = NonNull::new(ptr as *mut u8); let bytes = ptr .map(|ptr| Bytes::new(ptr, bytes_len, deallocation)) @@ -234,7 +235,22 @@ unsafe fn create_bitmap( )) })?; - Ok(Bitmap::from_bytes(bytes, len)) + Ok(Bitmap::from_bytes(bytes, offset + len)) +} + +fn buffer_offset(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> usize { + use PhysicalType::*; + match (data_type.to_physical_type(), i) { + (Utf8, 1) + | (LargeUtf8, 1) + | (Binary, 1) + | (LargeBinary, 1) + | (List, 1) + | (LargeList, 1) + | (Map, 1) => array.offset as usize, + (LargeUtf8, 2) | (LargeBinary, 2) | (Utf8, 2) | (Binary, 2) => 0, + _ => array.offset as usize, + } } /// Returns the length, in slots, of the buffer `i` (indexed according to the C data interface) diff --git a/tests/it/ffi.rs b/tests/it/ffi.rs index facc143988c..c7912d6cd81 100644 --- a/tests/it/ffi.rs +++ b/tests/it/ffi.rs @@ -1,13 +1,12 @@ use arrow2::array::*; +use arrow2::bitmap::Bitmap; use arrow2::datatypes::{DataType, Field, TimeUnit}; use arrow2::{error::Result, ffi}; use std::collections::BTreeMap; use std::sync::Arc; -fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { - let array: Arc = Arc::new(expected.clone()); +fn _test_round_trip(array: Arc, expected: Box) -> Result<()> { let field = Field::new("a", array.data_type().clone(), true); - let expected = Box::new(expected) as Box; let array_ptr = Box::new(ffi::Ffi_ArrowArray::empty()); let schema_ptr = Box::new(ffi::Ffi_ArrowSchema::empty()); @@ -32,6 +31,15 @@ fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { Ok(()) } +fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { + let array: Arc = Arc::new(expected.clone()); + let expected = Box::new(expected) as Box; + _test_round_trip(array.clone(), clone(expected.as_ref()))?; + + // sliced + _test_round_trip(array.slice(1, 2).into(), expected.slice(1, 2)) +} + fn test_round_trip_schema(field: Field) -> Result<()> { // create a `ArrowArray` from the data. let schema_ptr = Box::new(ffi::Ffi_ArrowSchema::empty()); @@ -138,6 +146,17 @@ fn list_list() -> Result<()> { test_round_trip(array) } +#[test] +fn struct_() -> Result<()> { + let data_type = DataType::Struct(vec![Field::new("a", DataType::Int32, true)]); + let values = vec![Arc::new(Int32Array::from([Some(1), None, Some(3)])) as Arc]; + let validity = Bitmap::from([true, false, true]); + + let array = StructArray::from_data(data_type, values, validity.into()); + + test_round_trip(array) +} + #[test] fn dict() -> Result<()> { let data = vec![Some("a"), Some("a"), None, Some("b")]; From d420bdafa44ca489c56811bf96290ba20fa1fba7 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 1 Nov 2021 17:39:51 +0100 Subject: [PATCH 3/4] Save --- src/ffi/ffi.rs | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index 586edb302d1..97e5fa54cc1 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -183,22 +183,23 @@ unsafe fn create_buffer( deallocation: Deallocation, index: usize, ) -> Result> { + println!("{:#?}", array); if array.buffers.is_null() { return Err(ArrowError::Ffi("The array buffers are null".to_string())); } - let buffers = array.buffers as *mut *const u8; - let len = buffer_len(array, data_type, index)?; - let offset = array.offset as usize; + let buffers = array.buffers as *mut *const u8; assert!(index < array.n_buffers as usize); let ptr = *buffers.add(index); - let ptr = NonNull::new(ptr as *mut T); + + let len = buffer_len(array, data_type, index)?; let bytes = ptr - .map(|ptr| Bytes::new(ptr, offset + len, deallocation)) + .map(|ptr| Bytes::new(ptr, len, deallocation)) .ok_or_else(|| ArrowError::Ffi(format!("The buffer at position {} is null", index))); + println!("{:?}", bytes); bytes.map(Buffer::from_bytes) } @@ -241,13 +242,6 @@ unsafe fn create_bitmap( fn buffer_offset(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> usize { use PhysicalType::*; match (data_type.to_physical_type(), i) { - (Utf8, 1) - | (LargeUtf8, 1) - | (Binary, 1) - | (LargeBinary, 1) - | (List, 1) - | (LargeList, 1) - | (Map, 1) => array.offset as usize, (LargeUtf8, 2) | (LargeBinary, 2) | (Utf8, 2) | (Binary, 2) => 0, _ => array.offset as usize, } @@ -267,30 +261,32 @@ fn buffer_len(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> Result< | (PhysicalType::LargeList, 1) | (PhysicalType::Map, 1) => { // the len of the offset buffer (buffer 1) equals length + 1 - array.length as usize + 1 + array.offset as usize + array.length as usize + 1 } (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; + let offset = buffer_offset(array, data_type, 1); // first buffer is the null buffer => add(1) let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) }; // interpret as i32 let offset_buffer = offset_buffer as *const i32; // get last offset - (unsafe { *offset_buffer.add(len - 1) }) as usize + (unsafe { *offset_buffer.add(offset + len - 1) }) as usize } (PhysicalType::LargeUtf8, 2) | (PhysicalType::LargeBinary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; + let offset = buffer_offset(array, data_type, 1); // first buffer is the null buffer => add(1) let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) }; // interpret as i64 let offset_buffer = offset_buffer as *const i64; // get last offset - (unsafe { *offset_buffer.add(len - 1) }) as usize + (unsafe { *offset_buffer.add(offset + len - 1) }) as usize } // buffer len of primitive types - _ => array.length as usize, + _ => array.offset as usize + array.length as usize, }) } From e51c9182fb0e9812eb215f9eed0fe6feabd9e181 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 1 Nov 2021 16:58:13 +0000 Subject: [PATCH 4/4] Fixed --- src/array/binary/ffi.rs | 23 +++-------------------- src/array/boolean/ffi.rs | 13 ++----------- src/array/dictionary/ffi.rs | 11 ++--------- src/array/list/ffi.rs | 12 +++--------- src/array/map/ffi.rs | 21 ++++++++++++--------- src/array/primitive/ffi.rs | 10 ++-------- src/array/struct_.rs | 13 ++++--------- src/array/union/ffi.rs | 4 ++-- src/array/utf8/ffi.rs | 12 +++--------- src/ffi/ffi.rs | 20 ++++++++++---------- 10 files changed, 43 insertions(+), 96 deletions(-) diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 2aa39d3412a..2e4e94f3858 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -1,7 +1,6 @@ use crate::{ array::{FromFfi, Offset, ToFfi}, bitmap::align, - datatypes::DataType, ffi, }; @@ -54,29 +53,13 @@ unsafe impl ToFfi for BinaryArray { impl FromFfi for BinaryArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let expected = if O::is_large() { - DataType::LargeBinary - } else { - DataType::Binary - }; - assert_eq!(data_type, expected); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let values = unsafe { array.buffer::(1) }?; - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } - Ok(Self::from_data_unchecked( - Self::default_data_type(), - offsets, - values, - validity, + data_type, offsets, values, validity, )) } } diff --git a/src/array/boolean/ffi.rs b/src/array/boolean/ffi.rs index a396a466eeb..c042d538aab 100644 --- a/src/array/boolean/ffi.rs +++ b/src/array/boolean/ffi.rs @@ -1,7 +1,6 @@ use crate::{ array::{FromFfi, ToFfi}, bitmap::align, - datatypes::DataType, ffi, }; @@ -52,16 +51,8 @@ unsafe impl ToFfi for BooleanArray { impl FromFfi for BooleanArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - assert_eq!(data_type, DataType::Boolean); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.bitmap(0) }?; - - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } + let validity = unsafe { array.validity() }?; + let values = unsafe { array.bitmap(0) }?; Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/dictionary/ffi.rs b/src/array/dictionary/ffi.rs index 6c97539f402..a32d88b3c87 100644 --- a/src/array/dictionary/ffi.rs +++ b/src/array/dictionary/ffi.rs @@ -27,17 +27,10 @@ unsafe impl ToFfi for DictionaryArray { impl FromFfi for DictionaryArray { unsafe fn try_from_ffi(array: A) -> Result { // keys: similar to PrimitiveArray, but the datatype is the inner one - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let values = unsafe { array.buffer::(0) }?; - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } let keys = PrimitiveArray::::from_data(K::DATA_TYPE, values, validity); - // values let values = array.dictionary()?.unwrap(); let values = ffi::try_from(values)?.into(); diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 3168242228a..1f511730ac0 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -53,17 +53,11 @@ unsafe impl ToFfi for ListArray { impl FromFfi for ListArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; - let child = array.child(0)?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; + let child = unsafe { array.child(0)? }; let values = ffi::try_from(child)?.into(); - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index f87f473d5f8..12a56f5251d 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -18,7 +18,16 @@ unsafe impl ToFfi for MapArray { } fn offset(&self) -> Option { - todo!() + let offset = self.offsets.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } } fn to_ffi_aligned(&self) -> Self { @@ -44,17 +53,11 @@ unsafe impl ToFfi for MapArray { impl FromFfi for MapArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let child = array.child(0)?; let values = ffi::try_from(child)?.into(); - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index 5eef467d06a..0e8b3e74a10 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -52,15 +52,9 @@ unsafe impl ToFfi for PrimitiveArray { impl FromFfi for PrimitiveArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let values = unsafe { array.buffer::(0) }?; - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/struct_.rs b/src/array/struct_.rs index 00cb1042ad2..6e767ae712f 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -243,12 +243,10 @@ unsafe impl ToFfi for StructArray { impl FromFfi for StructArray { unsafe fn try_from_ffi(array: A) -> Result { - let field = array.field(); - let fields = Self::get_fields(field.data_type()).to_vec(); + let data_type = array.field().data_type().clone(); + let fields = Self::get_fields(&data_type); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; + let validity = unsafe { array.validity() }?; let values = (0..fields.len()) .map(|index| { let child = array.child(index)?; @@ -256,9 +254,6 @@ impl FromFfi for StructArray { }) .collect::>>>()?; - if offset > 0 { - validity = validity.map(|x| x.slice(offset, length)) - } - Ok(Self::from_data(DataType::Struct(fields), values, validity)) + Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index 6fc8ee99f2e..fbab5dd1596 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -23,11 +23,11 @@ unsafe impl ToFfi for UnionArray { } fn offset(&self) -> Option { - todo!() + Some(self.types.offset()) } fn to_ffi_aligned(&self) -> Self { - todo!() + self.clone() } } diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index c2c03fa4573..59b1a60ce95 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -51,17 +51,11 @@ unsafe impl ToFfi for Utf8Array { impl FromFfi for Utf8Array { unsafe fn try_from_ffi(array: A) -> Result { - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let data_type = array.field().data_type().clone(); + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let values = unsafe { array.buffer::(1)? }; - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } - let data_type = Self::default_data_type(); Ok(Self::from_data_unchecked( data_type, offsets, values, validity, )) diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index 97e5fa54cc1..33ffadc7245 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -183,7 +183,6 @@ unsafe fn create_buffer( deallocation: Deallocation, index: usize, ) -> Result> { - println!("{:#?}", array); if array.buffers.is_null() { return Err(ArrowError::Ffi("The array buffers are null".to_string())); } @@ -195,12 +194,12 @@ unsafe fn create_buffer( let ptr = NonNull::new(ptr as *mut T); let len = buffer_len(array, data_type, index)?; + let offset = buffer_offset(array, data_type, index); let bytes = ptr .map(|ptr| Bytes::new(ptr, len, deallocation)) - .ok_or_else(|| ArrowError::Ffi(format!("The buffer at position {} is null", index))); + .ok_or_else(|| ArrowError::Ffi(format!("The buffer at position {} is null", index)))?; - println!("{:?}", bytes); - bytes.map(Buffer::from_bytes) + Ok(Buffer::from_bytes(bytes).slice(offset, len - offset)) } /// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer). @@ -236,7 +235,7 @@ unsafe fn create_bitmap( )) })?; - Ok(Bitmap::from_bytes(bytes, offset + len)) + Ok(Bitmap::from_bytes(bytes, offset + len).slice(offset, len)) } fn buffer_offset(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> usize { @@ -266,24 +265,23 @@ fn buffer_len(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> Result< (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; - let offset = buffer_offset(array, data_type, 1); // first buffer is the null buffer => add(1) let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) }; // interpret as i32 let offset_buffer = offset_buffer as *const i32; // get last offset - (unsafe { *offset_buffer.add(offset + len - 1) }) as usize + + (unsafe { *offset_buffer.add(len - 1) }) as usize } (PhysicalType::LargeUtf8, 2) | (PhysicalType::LargeBinary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; - let offset = buffer_offset(array, data_type, 1); // first buffer is the null buffer => add(1) let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) }; // interpret as i64 let offset_buffer = offset_buffer as *const i64; // get last offset - (unsafe { *offset_buffer.add(offset + len - 1) }) as usize + (unsafe { *offset_buffer.add(len - 1) }) as usize } // buffer len of primitive types _ => array.offset as usize + array.length as usize, @@ -363,7 +361,9 @@ pub trait ArrowArrayRef { create_bitmap(self.array(), self.deallocation(), index + 1) } - fn child(&self, index: usize) -> Result { + /// # Safety + /// The caller must guarantee that the child `index` is valid per c data interface. + unsafe fn child(&self, index: usize) -> Result { create_child(self.array(), self.field(), self.parent().clone(), index) }