diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 8aef60832cf..27da0e63515 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -39,15 +39,52 @@ def tearDown(self): # No leak of C++ memory self.assertEqual(self.old_allocated_cpp, pyarrow.total_allocated_bytes()) - def test_string_roundtrip(self): - """ - Python -> Rust -> Python - """ + def test_primitive(self): + a = pyarrow.array([0, None, 2, 3, 4]) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_primitive_sliced(self): + a = pyarrow.array([0, None, 2, 3, 4]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_boolean(self): + a = pyarrow.array([True, None, False, True, False]) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_boolean_sliced(self): + a = pyarrow.array([True, None, False, True, False]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_string(self): a = pyarrow.array(["a", None, "ccc"]) b = arrow_pyarrow_integration_testing.round_trip_array(a) c = pyarrow.array(["a", None, "ccc"]) self.assertEqual(b, c) + def test_string_sliced(self): + a = pyarrow.array(["a", None, "ccc"]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + def test_decimal_roundtrip(self): """ Python -> Rust -> Python @@ -75,10 +112,17 @@ def test_list_array(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type - def test_struct_array(self): - """ - Python -> Rust -> Python - """ + def test_list_sliced(self): + a = pyarrow.array( + [[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64()) + ).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_struct(self): fields = [ ("f1", pyarrow.int32()), ("f2", pyarrow.string()), @@ -99,6 +143,27 @@ def test_struct_array(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type + # see https://issues.apache.org/jira/browse/ARROW-14383 + def _test_struct_sliced(self): + fields = [ + ("f1", pyarrow.int32()), + ("f2", pyarrow.string()), + ] + a = pyarrow.array( + [ + {"f1": 1, "f2": "a"}, + None, + {"f1": 3, "f2": None}, + {"f1": None, "f2": "d"}, + {"f1": None, "f2": None}, + ], + pyarrow.struct(fields), + ).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + def test_list_list_array(self): """ Python -> Rust -> Python diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 4ca8d981ebd..2e4e94f3858 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -1,6 +1,6 @@ use crate::{ array::{FromFfi, Offset, ToFfi}, - datatypes::DataType, + bitmap::align, ffi, }; @@ -12,43 +12,54 @@ unsafe impl ToFfi for BinaryArray { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), + Some(self.offsets.as_ptr().cast::()), + Some(self.values.as_ptr().cast::()), ] } - #[inline] - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + let offset = self.offsets.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.offsets.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + offsets: self.offsets.clone(), + values: self.values.clone(), + } } } impl FromFfi for BinaryArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let expected = if O::is_large() { - DataType::LargeBinary - } else { - DataType::Binary - }; - assert_eq!(data_type, expected); - - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; - let values = unsafe { array.buffer::(1) }?; - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; + let values = unsafe { array.buffer::(1) }?; Ok(Self::from_data_unchecked( - Self::default_data_type(), - offsets, - values, - validity, + data_type, offsets, values, validity, )) } } diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 036e3f74122..ed446561dde 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -23,7 +23,6 @@ pub struct BinaryArray { offsets: Buffer, values: Buffer, validity: Option, - offset: usize, } // constructors @@ -71,7 +70,6 @@ impl BinaryArray { offsets, values, validity, - offset: 0, } } @@ -113,7 +111,6 @@ impl BinaryArray { offsets, values, validity, - offset: 0, } } @@ -146,7 +143,6 @@ impl BinaryArray { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } diff --git a/src/array/boolean/ffi.rs b/src/array/boolean/ffi.rs index e4264d93829..c042d538aab 100644 --- a/src/array/boolean/ffi.rs +++ b/src/array/boolean/ffi.rs @@ -1,6 +1,6 @@ use crate::{ array::{FromFfi, ToFfi}, - datatypes::DataType, + bitmap::align, ffi, }; @@ -16,24 +16,43 @@ unsafe impl ToFfi for BooleanArray { ] } - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + let offset = self.values.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.values.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + values: self.values.clone(), + } } } impl FromFfi for BooleanArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - assert_eq!(data_type, DataType::Boolean); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.bitmap(0) }?; - - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } + let validity = unsafe { array.validity() }?; + let values = unsafe { array.bitmap(0) }?; Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 065eb199775..06935af24ee 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -20,7 +20,6 @@ pub struct BooleanArray { data_type: DataType, values: Bitmap, validity: Option, - offset: usize, } impl BooleanArray { @@ -50,7 +49,6 @@ impl BooleanArray { data_type, values, validity, - offset: 0, } } @@ -83,7 +81,6 @@ impl BooleanArray { data_type: self.data_type.clone(), values: self.values.clone().slice_unchecked(offset, length), validity, - offset: self.offset + offset, } } diff --git a/src/array/dictionary/ffi.rs b/src/array/dictionary/ffi.rs index c6f8f8fa201..a32d88b3c87 100644 --- a/src/array/dictionary/ffi.rs +++ b/src/array/dictionary/ffi.rs @@ -11,26 +11,26 @@ unsafe impl ToFfi for DictionaryArray { self.keys.buffers() } - #[inline] - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + self.keys.offset() + } + + fn to_ffi_aligned(&self) -> Self { + Self { + data_type: self.data_type.clone(), + keys: self.keys.to_ffi_aligned(), + values: self.values.clone(), + } } } impl FromFfi for DictionaryArray { unsafe fn try_from_ffi(array: A) -> Result { // keys: similar to PrimitiveArray, but the datatype is the inner one - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let values = unsafe { array.buffer::(0) }?; - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } let keys = PrimitiveArray::::from_data(K::DATA_TYPE, values, validity); - // values let values = array.dictionary()?.unwrap(); let values = ffi::try_from(values)?.into(); diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index 3b33cb3e17b..012d937d6d9 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -37,7 +37,6 @@ pub struct DictionaryArray { data_type: DataType, keys: PrimitiveArray, values: Arc, - offset: usize, } impl DictionaryArray { @@ -69,7 +68,6 @@ impl DictionaryArray { data_type, keys, values, - offset: 0, } } @@ -81,7 +79,6 @@ impl DictionaryArray { data_type: self.data_type.clone(), keys: self.keys.clone().slice(offset, length), values: self.values.clone(), - offset: self.offset + offset, } } @@ -93,7 +90,6 @@ impl DictionaryArray { data_type: self.data_type.clone(), keys: self.keys.clone().slice_unchecked(offset, length), values: self.values.clone(), - offset: self.offset + offset, } } diff --git a/src/array/ffi.rs b/src/array/ffi.rs index 176b0966ff5..a104403ecac 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -7,19 +7,22 @@ use crate::error::Result; /// Trait describing how a struct presents itself to the /// [C data interface](https://arrow.apache.org/docs/format/CDataInterface.html) (FFI). -/// Safety: +/// # Safety /// Implementing this trait incorrect will lead to UB -pub unsafe trait ToFfi { +pub(crate) unsafe trait ToFfi { /// The pointers to the buffers. fn buffers(&self) -> Vec>>; - /// The offset - fn offset(&self) -> usize; - /// The children fn children(&self) -> Vec> { vec![] } + + /// The offset + fn offset(&self) -> Option; + + /// return a partial clone of self with an offset. + fn to_ffi_aligned(&self) -> Self; } /// Trait describing how a struct imports into itself from the @@ -35,17 +38,23 @@ pub trait FromFfi: Sized { macro_rules! ffi_dyn { ($array:expr, $ty:ty) => {{ let array = $array.as_any().downcast_ref::<$ty>().unwrap(); - (array.buffers(), array.children(), None) + ( + array.offset().unwrap(), + array.buffers(), + array.children(), + None, + ) }}; } type BuffersChildren = ( + usize, Vec>>, Vec>, Option>, ); -pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { +pub fn offset_buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { use PhysicalType::*; match array.data_type().to_physical_type() { Null => ffi_dyn!(array, NullArray), @@ -68,6 +77,7 @@ pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { with_match_physical_dictionary_key_type!(key_type, |$T| { let array = array.as_any().downcast_ref::>().unwrap(); ( + array.offset().unwrap(), array.buffers(), array.children(), Some(array.values().clone()), diff --git a/src/array/fixed_size_binary/ffi.rs b/src/array/fixed_size_binary/ffi.rs new file mode 100644 index 00000000000..ce9fbc98fb5 --- /dev/null +++ b/src/array/fixed_size_binary/ffi.rs @@ -0,0 +1,44 @@ +use crate::{array::ToFfi, bitmap::align}; + +use super::FixedSizeBinaryArray; + +unsafe impl ToFfi for FixedSizeBinaryArray { + fn buffers(&self) -> Vec>> { + vec![ + self.validity.as_ref().map(|x| x.as_ptr()), + Some(self.values.as_ptr().cast::()), + ] + } + + fn offset(&self) -> Option { + let offset = self.values.offset() / self.size as usize; + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.values.offset() / self.size; + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + size: self.size, + data_type: self.data_type.clone(), + validity, + values: self.values.clone(), + } + } +} diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index 3091aae3b62..faf3ec7ff72 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -1,7 +1,8 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; -use super::{display_fmt, display_helper, ffi::ToFfi, Array}; +use super::{display_fmt, display_helper, Array}; +mod ffi; mod iterator; mod mutable; pub use mutable::*; @@ -14,7 +15,6 @@ pub struct FixedSizeBinaryArray { data_type: DataType, values: Buffer, validity: Option, - offset: usize, } impl FixedSizeBinaryArray { @@ -47,7 +47,6 @@ impl FixedSizeBinaryArray { data_type, values, validity, - offset: 0, } } @@ -83,7 +82,6 @@ impl FixedSizeBinaryArray { size: self.size, values, validity, - offset: self.offset + offset, } } @@ -182,19 +180,6 @@ impl std::fmt::Display for FixedSizeBinaryArray { } } -unsafe impl ToFfi for FixedSizeBinaryArray { - fn buffers(&self) -> Vec>> { - vec![ - self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), - ] - } - - fn offset(&self) -> usize { - self.offset - } -} - impl FixedSizeBinaryArray { /// Creates a [`FixedSizeBinaryArray`] from an fallible iterator of optional `[u8]`. pub fn try_from_iter, I: IntoIterator>>( diff --git a/src/array/fixed_size_list/ffi.rs b/src/array/fixed_size_list/ffi.rs new file mode 100644 index 00000000000..760879c766b --- /dev/null +++ b/src/array/fixed_size_list/ffi.rs @@ -0,0 +1,27 @@ +use std::sync::Arc; + +use super::super::{ffi::ToFfi, Array}; +use super::FixedSizeListArray; + +unsafe impl ToFfi for FixedSizeListArray { + fn buffers(&self) -> Vec>> { + vec![self.validity.as_ref().map(|x| x.as_ptr())] + } + + fn children(&self) -> Vec> { + vec![self.values().clone()] + } + + fn offset(&self) -> Option { + Some( + self.validity + .as_ref() + .map(|bitmap| bitmap.offset()) + .unwrap_or_default(), + ) + } + + fn to_ffi_aligned(&self) -> Self { + self.clone() + } +} diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 5b78569eed4..6c29c2a6b58 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -5,8 +5,9 @@ use crate::{ datatypes::{DataType, Field}, }; -use super::{display_fmt, ffi::ToFfi, new_empty_array, new_null_array, Array}; +use super::{display_fmt, new_empty_array, new_null_array, Array}; +mod ffi; mod iterator; pub use iterator::*; mod mutable; @@ -20,7 +21,6 @@ pub struct FixedSizeListArray { data_type: DataType, values: Arc, validity: Option, - offset: usize, } impl FixedSizeListArray { @@ -60,7 +60,6 @@ impl FixedSizeListArray { data_type, values, validity, - offset: 0, } } @@ -97,7 +96,6 @@ impl FixedSizeListArray { size: self.size, values, validity, - offset: self.offset + offset, } } @@ -194,17 +192,3 @@ impl std::fmt::Display for FixedSizeListArray { display_fmt(self.iter(), "FixedSizeListArray", f, true) } } - -unsafe impl ToFfi for FixedSizeListArray { - fn buffers(&self) -> Vec>> { - vec![self.validity.as_ref().map(|x| x.as_ptr())] - } - - fn offset(&self) -> usize { - self.offset - } - - fn children(&self) -> Vec> { - vec![self.values().clone()] - } -} diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 14235ce1ddd..1f511730ac0 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use crate::{array::FromFfi, error::Result, ffi}; +use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; use super::super::{ffi::ToFfi, specification::Offset, Array}; use super::ListArray; @@ -9,33 +9,55 @@ unsafe impl ToFfi for ListArray { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), + Some(self.offsets.as_ptr().cast::()), ] } - fn offset(&self) -> usize { - self.offset - } - fn children(&self) -> Vec> { vec![self.values.clone()] } + + fn offset(&self) -> Option { + let offset = self.offsets.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.offsets.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + offsets: self.offsets.clone(), + values: self.values.clone(), + } + } } impl FromFfi for ListArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; - let child = array.child(0)?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; + let child = unsafe { array.child(0)? }; let values = ffi::try_from(child)?.into(); - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 4a92c352fa3..8920f89e4c5 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -25,7 +25,6 @@ pub struct ListArray { offsets: Buffer, values: Arc, validity: Option, - offset: usize, } impl ListArray { @@ -78,7 +77,6 @@ impl ListArray { offsets, values, validity, - offset: 0, } } @@ -107,7 +105,6 @@ impl ListArray { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } @@ -144,8 +141,8 @@ impl ListArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.as_ptr().add(i); - let offset_1 = *self.offsets.as_ptr().add(i + 1); + let offset = *self.offsets.get_unchecked(i); + let offset_1 = *self.offsets.get_unchecked(i + 1); let length = (offset_1 - offset).to_usize(); self.values.slice_unchecked(offset.to_usize(), length) diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index 50b9039c351..12a56f5251d 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use crate::{array::FromFfi, error::Result, ffi}; +use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; use super::super::{ffi::ToFfi, Array}; use super::MapArray; @@ -9,33 +9,55 @@ unsafe impl ToFfi for MapArray { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), + Some(self.offsets.as_ptr().cast::()), ] } - fn offset(&self) -> usize { - self.offset - } - fn children(&self) -> Vec> { vec![self.field.clone()] } + + fn offset(&self) -> Option { + let offset = self.offsets.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.offsets.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + offsets: self.offsets.clone(), + field: self.field.clone(), + } + } } impl FromFfi for MapArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let child = array.child(0)?; let values = ffi::try_from(child)?.into(); - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/map/mod.rs b/src/array/map/mod.rs index 0efc83e7a4e..616bc969c12 100644 --- a/src/array/map/mod.rs +++ b/src/array/map/mod.rs @@ -22,7 +22,6 @@ pub struct MapArray { field: Arc, // invariant: offsets.len() - 1 == Bitmap::len() validity: Option, - offset: usize, } impl MapArray { @@ -81,7 +80,6 @@ impl MapArray { data_type, field, offsets, - offset: 0, validity, } } @@ -111,7 +109,6 @@ impl MapArray { offsets, field: self.field.clone(), validity, - offset: self.offset + offset, } } } @@ -148,8 +145,8 @@ impl MapArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.as_ptr().add(i); - let offset_1 = *self.offsets.as_ptr().add(i + 1); + let offset = *self.offsets.get_unchecked(i); + let offset_1 = *self.offsets.get_unchecked(i + 1); let length = (offset_1 - offset).to_usize(); self.field.slice_unchecked(offset.to_usize(), length) diff --git a/src/array/mod.rs b/src/array/mod.rs index 52469577ee7..fa8caf7002c 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -393,9 +393,9 @@ pub use struct_::StructArray; pub use union::UnionArray; pub use utf8::{MutableUtf8Array, Utf8Array, Utf8ValuesIter}; -pub(crate) use self::ffi::buffers_children_dictionary; -pub use self::ffi::FromFfi; -pub use self::ffi::ToFfi; +pub(crate) use self::ffi::offset_buffers_children_dictionary; +pub(crate) use self::ffi::FromFfi; +pub(crate) use self::ffi::ToFfi; /// A trait describing the ability of a struct to create itself from a iterator. /// This is similar to [`Extend`], but accepted the creation to error. diff --git a/src/array/null.rs b/src/array/null.rs index 1906cc140f4..cb194e3901e 100644 --- a/src/array/null.rs +++ b/src/array/null.rs @@ -7,7 +7,6 @@ use super::{ffi::ToFfi, Array}; pub struct NullArray { data_type: DataType, length: usize, - offset: usize, } impl NullArray { @@ -23,19 +22,14 @@ impl NullArray { /// Returns a new [`NullArray`]. pub fn from_data(data_type: DataType, length: usize) -> Self { - Self { - data_type, - length, - offset: 0, - } + Self { data_type, length } } /// Returns a slice of the [`NullArray`]. - pub fn slice(&self, offset: usize, length: usize) -> Self { + pub fn slice(&self, _offset: usize, length: usize) -> Self { Self { data_type: self.data_type.clone(), length, - offset: self.offset + offset, } } } @@ -82,8 +76,11 @@ unsafe impl ToFfi for NullArray { vec![] } - #[inline] - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + Some(0) + } + + fn to_ffi_aligned(&self) -> Self { + self.clone() } } diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index 3418e2516fc..0e8b3e74a10 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -1,5 +1,6 @@ use crate::{ array::{FromFfi, ToFfi}, + bitmap::align, ffi, types::NativeType, }; @@ -12,28 +13,48 @@ unsafe impl ToFfi for PrimitiveArray { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), + Some(self.values.as_ptr().cast::()), ] } - #[inline] - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + let offset = self.values.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.values.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + values: self.values.clone(), + } } } impl FromFfi for PrimitiveArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.buffer::(0) }?; - - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } + let validity = unsafe { array.validity() }?; + let values = unsafe { array.buffer::(0) }?; + Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index f75ffd9a2b0..37ab0817998 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -35,7 +35,6 @@ pub struct PrimitiveArray { data_type: DataType, values: Buffer, validity: Option, - offset: usize, } impl PrimitiveArray { @@ -75,7 +74,6 @@ impl PrimitiveArray { data_type, values, validity, - offset: 0, } } @@ -108,7 +106,6 @@ impl PrimitiveArray { data_type: self.data_type.clone(), values: self.values.clone().slice_unchecked(offset, length), validity, - offset: self.offset + offset, } } @@ -150,7 +147,7 @@ impl PrimitiveArray { /// Caller must be sure that `i < self.len()` #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> T { - *self.values().as_ptr().add(i) + *self.values.get_unchecked(i) } /// Returns a new [`PrimitiveArray`] with a different logical type. @@ -171,7 +168,6 @@ impl PrimitiveArray { data_type, values: self.values, validity: self.validity, - offset: self.offset, } } } diff --git a/src/array/struct_.rs b/src/array/struct_.rs index 8dc327a5719..6e767ae712f 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -223,24 +223,30 @@ unsafe impl ToFfi for StructArray { vec![self.validity.as_ref().map(|x| x.as_ptr())] } - fn offset(&self) -> usize { - // we do not support offsets in structs. Instead, if an FFI we slice the incoming arrays - 0 - } - fn children(&self) -> Vec> { self.values.clone() } + + fn offset(&self) -> Option { + Some( + self.validity + .as_ref() + .map(|bitmap| bitmap.offset()) + .unwrap_or_default(), + ) + } + + fn to_ffi_aligned(&self) -> Self { + self.clone() + } } impl FromFfi for StructArray { unsafe fn try_from_ffi(array: A) -> Result { - let field = array.field(); - let fields = Self::get_fields(field.data_type()).to_vec(); + let data_type = array.field().data_type().clone(); + let fields = Self::get_fields(&data_type); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; + let validity = unsafe { array.validity() }?; let values = (0..fields.len()) .map(|index| { let child = array.child(index)?; @@ -248,9 +254,6 @@ impl FromFfi for StructArray { }) .collect::>>>()?; - if offset > 0 { - validity = validity.map(|x| x.slice(offset, length)) - } - Ok(Self::from_data(DataType::Struct(fields), values, validity)) + Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index 640a7d6f0ab..fbab5dd1596 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -10,21 +10,25 @@ unsafe impl ToFfi for UnionArray { if let Some(offsets) = &self.offsets { vec![ None, - std::ptr::NonNull::new(self.types.as_ptr() as *mut u8), - std::ptr::NonNull::new(offsets.as_ptr() as *mut u8), + Some(self.types.as_ptr().cast::()), + Some(offsets.as_ptr().cast::()), ] } else { - vec![None, std::ptr::NonNull::new(self.types.as_ptr() as *mut u8)] + vec![None, Some(self.types.as_ptr().cast::())] } } - fn offset(&self) -> usize { - self.offset - } - fn children(&self) -> Vec> { self.fields.clone() } + + fn offset(&self) -> Option { + Some(self.types.offset()) + } + + fn to_ffi_aligned(&self) -> Self { + self.clone() + } } impl FromFfi for UnionArray { diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 4cc716b8e4d..59b1a60ce95 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -1,5 +1,6 @@ use crate::{ array::{FromFfi, Offset, ToFfi}, + bitmap::align, error::Result, ffi, }; @@ -10,29 +11,51 @@ unsafe impl ToFfi for Utf8Array { fn buffers(&self) -> Vec>> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), + Some(self.offsets.as_ptr().cast::()), + Some(self.values.as_ptr().cast::()), ] } - fn offset(&self) -> usize { - self.offset + fn offset(&self) -> Option { + let offset = self.offsets.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.offsets.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + offsets: self.offsets.clone(), + values: self.values.clone(), + } } } impl FromFfi for Utf8Array { unsafe fn try_from_ffi(array: A) -> Result { - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let data_type = array.field().data_type().clone(); + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let values = unsafe { array.buffer::(1)? }; - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } - let data_type = Self::default_data_type(); Ok(Self::from_data_unchecked( data_type, offsets, values, validity, )) diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 1d6b75faec3..ef895092dc9 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -36,7 +36,6 @@ pub struct Utf8Array { offsets: Buffer, values: Buffer, validity: Option, - offset: usize, } impl Utf8Array { @@ -86,7 +85,6 @@ impl Utf8Array { offsets, values, validity, - offset: 0, } } @@ -128,7 +126,6 @@ impl Utf8Array { offsets, values, validity, - offset: 0, } } @@ -161,7 +158,6 @@ impl Utf8Array { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } diff --git a/src/bitmap/bitmap_ops.rs b/src/bitmap/bitmap_ops.rs index 8a95b5f10c7..065ba0dcbd4 100644 --- a/src/bitmap/bitmap_ops.rs +++ b/src/bitmap/bitmap_ops.rs @@ -132,6 +132,18 @@ where } } +// create a new [`Bitmap`] semantically equal to ``bitmap`` but with an offset equal to ``offset`` +pub(crate) fn align(bitmap: &Bitmap, new_offset: usize) -> Bitmap { + let length = bitmap.len(); + + let bitmap: Bitmap = std::iter::repeat(false) + .take(new_offset) + .chain(bitmap.iter()) + .collect(); + + bitmap.slice(new_offset, length) +} + #[inline] fn and(lhs: &Bitmap, rhs: &Bitmap) -> Bitmap { binary(lhs, rhs, |x, y| x & y) diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index c2f5e91fd9d..8a98c2caa97 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -168,6 +168,12 @@ impl Bitmap { pub(crate) fn as_ptr(&self) -> std::ptr::NonNull { self.bytes.ptr() } + + /// Returns a pointer to the start of this [`Bitmap`] (ignores `offsets`) + /// This pointer is allocated iff `self.len() > 0`. + pub(crate) fn offset(&self) -> usize { + self.offset + } } impl> From

for Bitmap { diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index eab542e67e3..540734370b6 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -117,8 +117,13 @@ impl Buffer { /// Returns a pointer to the start of this buffer. #[inline] - pub fn as_ptr(&self) -> *const T { - unsafe { self.data.ptr().as_ptr().add(self.offset) } + pub(crate) fn as_ptr(&self) -> std::ptr::NonNull { + self.data.ptr() + } + + /// Returns a offset of this buffer + pub(crate) fn offset(&self) -> usize { + self.offset } } diff --git a/src/ffi/bridge.rs b/src/ffi/bridge.rs new file mode 100644 index 00000000000..53b9167386f --- /dev/null +++ b/src/ffi/bridge.rs @@ -0,0 +1,41 @@ +use std::sync::Arc; + +use crate::array::*; + +macro_rules! ffi_dyn { + ($array:expr, $ty:ty) => {{ + let a = $array.as_any().downcast_ref::<$ty>().unwrap(); + if a.offset().is_some() { + $array + } else { + Arc::new(a.to_ffi_aligned()) + } + }}; +} + +pub fn align_to_c_data_interface(array: Arc) -> Arc { + use crate::datatypes::PhysicalType::*; + match array.data_type().to_physical_type() { + Null => ffi_dyn!(array, NullArray), + Boolean => ffi_dyn!(array, BooleanArray), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + ffi_dyn!(array, PrimitiveArray<$T>) + }), + Binary => ffi_dyn!(array, BinaryArray), + LargeBinary => ffi_dyn!(array, BinaryArray), + FixedSizeBinary => ffi_dyn!(array, FixedSizeBinaryArray), + Utf8 => ffi_dyn!(array, Utf8Array::), + LargeUtf8 => ffi_dyn!(array, Utf8Array::), + List => ffi_dyn!(array, ListArray::), + LargeList => ffi_dyn!(array, ListArray::), + FixedSizeList => ffi_dyn!(array, FixedSizeListArray), + Struct => ffi_dyn!(array, StructArray), + Union => ffi_dyn!(array, UnionArray), + Map => ffi_dyn!(array, MapArray), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { + ffi_dyn!(array, DictionaryArray<$T>) + }) + } + } +} diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index c4b839b332f..33ffadc7245 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -18,7 +18,7 @@ use std::{ptr::NonNull, sync::Arc}; use crate::{ - array::{buffers_children_dictionary, Array}, + array::{offset_buffers_children_dictionary, Array}, bitmap::{utils::bytes_for, Bitmap}, buffer::{ bytes::{Bytes, Deallocation}, @@ -96,12 +96,12 @@ impl Ffi_ArrowArray { /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. pub(crate) fn new(array: Arc) -> Self { - let (buffers, children, dictionary) = buffers_children_dictionary(array.as_ref()); + let (offset, buffers, children, dictionary) = + offset_buffers_children_dictionary(array.as_ref()); let buffers_ptr = buffers .iter() .map(|maybe_buffer| match maybe_buffer { - // note that `raw_data` takes into account the buffer's offset Some(b) => b.as_ptr() as *const std::os::raw::c_void, None => std::ptr::null(), }) @@ -130,7 +130,7 @@ impl Ffi_ArrowArray { Self { length, null_count, - offset: 0i64, + offset: offset as i64, n_buffers, n_children, buffers: private_data.buffers_ptr.as_mut_ptr(), @@ -186,19 +186,20 @@ unsafe fn create_buffer( if array.buffers.is_null() { return Err(ArrowError::Ffi("The array buffers are null".to_string())); } - let buffers = array.buffers as *mut *const u8; - let len = buffer_len(array, data_type, index)?; + let buffers = array.buffers as *mut *const u8; assert!(index < array.n_buffers as usize); let ptr = *buffers.add(index); - let ptr = NonNull::new(ptr as *mut T); + + let len = buffer_len(array, data_type, index)?; + let offset = buffer_offset(array, data_type, index); let bytes = ptr .map(|ptr| Bytes::new(ptr, len, deallocation)) - .ok_or_else(|| ArrowError::Ffi(format!("The buffer at position {} is null", index))); + .ok_or_else(|| ArrowError::Ffi(format!("The buffer at position {} is null", index)))?; - bytes.map(Buffer::from_bytes) + Ok(Buffer::from_bytes(bytes).slice(offset, len - offset)) } /// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer). @@ -217,12 +218,13 @@ unsafe fn create_bitmap( return Err(ArrowError::Ffi("The array buffers are null".to_string())); } let len = array.length as usize; + let offset = array.offset as usize; let buffers = array.buffers as *mut *const u8; assert!(index < array.n_buffers as usize); let ptr = *buffers.add(index); - let bytes_len = bytes_for(len); + let bytes_len = bytes_for(offset + len); let ptr = NonNull::new(ptr as *mut u8); let bytes = ptr .map(|ptr| Bytes::new(ptr, bytes_len, deallocation)) @@ -233,7 +235,15 @@ unsafe fn create_bitmap( )) })?; - Ok(Bitmap::from_bytes(bytes, len)) + Ok(Bitmap::from_bytes(bytes, offset + len).slice(offset, len)) +} + +fn buffer_offset(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> usize { + use PhysicalType::*; + match (data_type.to_physical_type(), i) { + (LargeUtf8, 2) | (LargeBinary, 2) | (Utf8, 2) | (Binary, 2) => 0, + _ => array.offset as usize, + } } /// Returns the length, in slots, of the buffer `i` (indexed according to the C data interface) @@ -250,7 +260,7 @@ fn buffer_len(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> Result< | (PhysicalType::LargeList, 1) | (PhysicalType::Map, 1) => { // the len of the offset buffer (buffer 1) equals length + 1 - array.length as usize + 1 + array.offset as usize + array.length as usize + 1 } (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) @@ -260,6 +270,7 @@ fn buffer_len(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> Result< // interpret as i32 let offset_buffer = offset_buffer as *const i32; // get last offset + (unsafe { *offset_buffer.add(len - 1) }) as usize } (PhysicalType::LargeUtf8, 2) | (PhysicalType::LargeBinary, 2) => { @@ -273,7 +284,7 @@ fn buffer_len(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> Result< (unsafe { *offset_buffer.add(len - 1) }) as usize } // buffer len of primitive types - _ => array.length as usize, + _ => array.offset as usize + array.length as usize, }) } @@ -350,7 +361,9 @@ pub trait ArrowArrayRef { create_bitmap(self.array(), self.deallocation(), index + 1) } - fn child(&self, index: usize) -> Result { + /// # Safety + /// The caller must guarantee that the child `index` is valid per c data interface. + unsafe fn child(&self, index: usize) -> Result { create_child(self.array(), self.field(), self.parent().clone(), index) } diff --git a/src/ffi/mod.rs b/src/ffi/mod.rs index 88c8d238cc8..802939382d5 100644 --- a/src/ffi/mod.rs +++ b/src/ffi/mod.rs @@ -2,6 +2,7 @@ //! contains FFI bindings to import and export [`Array`](crate::array::Array) via //! Arrow's [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) mod array; +mod bridge; #[allow(clippy::module_inception)] mod ffi; mod schema; @@ -24,6 +25,8 @@ use self::schema::to_field; /// # Safety /// The pointer `ptr` must be allocated and valid pub unsafe fn export_array_to_c(array: Arc, ptr: *mut Ffi_ArrowArray) { + let array = bridge::align_to_c_data_interface(array); + *ptr = Ffi_ArrowArray::new(array); } diff --git a/tests/it/ffi.rs b/tests/it/ffi.rs index facc143988c..c7912d6cd81 100644 --- a/tests/it/ffi.rs +++ b/tests/it/ffi.rs @@ -1,13 +1,12 @@ use arrow2::array::*; +use arrow2::bitmap::Bitmap; use arrow2::datatypes::{DataType, Field, TimeUnit}; use arrow2::{error::Result, ffi}; use std::collections::BTreeMap; use std::sync::Arc; -fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { - let array: Arc = Arc::new(expected.clone()); +fn _test_round_trip(array: Arc, expected: Box) -> Result<()> { let field = Field::new("a", array.data_type().clone(), true); - let expected = Box::new(expected) as Box; let array_ptr = Box::new(ffi::Ffi_ArrowArray::empty()); let schema_ptr = Box::new(ffi::Ffi_ArrowSchema::empty()); @@ -32,6 +31,15 @@ fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { Ok(()) } +fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { + let array: Arc = Arc::new(expected.clone()); + let expected = Box::new(expected) as Box; + _test_round_trip(array.clone(), clone(expected.as_ref()))?; + + // sliced + _test_round_trip(array.slice(1, 2).into(), expected.slice(1, 2)) +} + fn test_round_trip_schema(field: Field) -> Result<()> { // create a `ArrowArray` from the data. let schema_ptr = Box::new(ffi::Ffi_ArrowSchema::empty()); @@ -138,6 +146,17 @@ fn list_list() -> Result<()> { test_round_trip(array) } +#[test] +fn struct_() -> Result<()> { + let data_type = DataType::Struct(vec![Field::new("a", DataType::Int32, true)]); + let values = vec![Arc::new(Int32Array::from([Some(1), None, Some(3)])) as Arc]; + let validity = Bitmap::from([true, false, true]); + + let array = StructArray::from_data(data_type, values, validity.into()); + + test_round_trip(array) +} + #[test] fn dict() -> Result<()> { let data = vec![Some("a"), Some("a"), None, Some("b")];