From e6b6c8356911333bf08acd2a4234ebc25599e479 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Sat, 23 Oct 2021 21:20:41 +0200 Subject: [PATCH] Fixed ffi of sliced arrays (#540) --- .../tests/test_sql.py | 81 +++++++++++++++++-- src/array/README.md | 11 --- src/array/binary/ffi.rs | 28 +++---- src/array/binary/mod.rs | 4 - src/array/boolean/ffi.rs | 22 +++-- src/array/boolean/mod.rs | 9 ++- src/array/dictionary/ffi.rs | 15 +--- src/array/dictionary/mod.rs | 4 - src/array/ffi.rs | 3 - src/array/fixed_size_binary/ffi.rs | 23 ++++++ src/array/fixed_size_binary/mod.rs | 19 +---- src/array/fixed_size_list/ffi.rs | 16 ++++ src/array/fixed_size_list/mod.rs | 20 +---- src/array/list/ffi.rs | 30 +++---- src/array/list/mod.rs | 3 - src/array/map/ffi.rs | 27 +++---- src/array/map/mod.rs | 3 - src/array/null.rs | 15 +--- src/array/primitive/ffi.rs | 22 ++--- src/array/primitive/mod.rs | 4 - src/array/specification.rs | 5 +- src/array/struct_.rs | 12 +-- src/array/union/ffi.rs | 12 +-- src/array/union/mod.rs | 5 -- src/array/utf8/ffi.rs | 26 +++--- src/array/utf8/mod.rs | 4 - src/bitmap/immutable.rs | 15 ++-- src/bitmap/mutable.rs | 4 +- src/buffer/immutable.rs | 6 +- src/buffer/mutable.rs | 2 +- src/ffi/ffi.rs | 47 +++++++---- src/io/ipc/read/read_basic.rs | 2 +- tests/it/ffi.rs | 25 +++++- 33 files changed, 270 insertions(+), 254 deletions(-) create mode 100644 src/array/fixed_size_binary/ffi.rs create mode 100644 src/array/fixed_size_list/ffi.rs diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 8aef60832cf..27da0e63515 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -39,15 +39,52 @@ def tearDown(self): # No leak of C++ memory self.assertEqual(self.old_allocated_cpp, pyarrow.total_allocated_bytes()) - def test_string_roundtrip(self): - """ - Python -> Rust -> Python - """ + def test_primitive(self): + a = pyarrow.array([0, None, 2, 3, 4]) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_primitive_sliced(self): + a = pyarrow.array([0, None, 2, 3, 4]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_boolean(self): + a = pyarrow.array([True, None, False, True, False]) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_boolean_sliced(self): + a = pyarrow.array([True, None, False, True, False]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_string(self): a = pyarrow.array(["a", None, "ccc"]) b = arrow_pyarrow_integration_testing.round_trip_array(a) c = pyarrow.array(["a", None, "ccc"]) self.assertEqual(b, c) + def test_string_sliced(self): + a = pyarrow.array(["a", None, "ccc"]).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + def test_decimal_roundtrip(self): """ Python -> Rust -> Python @@ -75,10 +112,17 @@ def test_list_array(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type - def test_struct_array(self): - """ - Python -> Rust -> Python - """ + def test_list_sliced(self): + a = pyarrow.array( + [[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64()) + ).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + def test_struct(self): fields = [ ("f1", pyarrow.int32()), ("f2", pyarrow.string()), @@ -99,6 +143,27 @@ def test_struct_array(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type + # see https://issues.apache.org/jira/browse/ARROW-14383 + def _test_struct_sliced(self): + fields = [ + ("f1", pyarrow.int32()), + ("f2", pyarrow.string()), + ] + a = pyarrow.array( + [ + {"f1": 1, "f2": "a"}, + None, + {"f1": 3, "f2": None}, + {"f1": None, "f2": "d"}, + {"f1": None, "f2": None}, + ], + pyarrow.struct(fields), + ).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + def test_list_list_array(self): """ Python -> Rust -> Python diff --git a/src/array/README.md b/src/array/README.md index a814839527c..9282cf611c6 100644 --- a/src/array/README.md +++ b/src/array/README.md @@ -41,17 +41,6 @@ This document describes the overall design of this module. * `from_trusted_len_values_iter` from an iterator of trusted len of values * `try_from_trusted_len_iter` from an fallible iterator of trusted len of optional values -### Slot offsets - -* An array MUST have a `offset: usize` measuring the number of slots that the array is currently offsetted by if the specification requires. - -* An array MUST implement `fn slice(&self, offset: usize, length: usize) -> Self` that returns an offseted and/or truncated clone of the array. This function MUST increase the array's offset if it exists. - -* Conversely, `offset` MUST only be changed by `slice`. - -The rational of the above is that it enable us to be fully interoperable with the offset logic supported by the C data interface, while at the same time easily perform array slices -within Rust's type safety mechanism. - ### Mutable Arrays * An array MAY have a mutable counterpart. E.g. `MutablePrimitiveArray` is the mutable counterpart of `PrimitiveArray`. diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 4ca8d981ebd..5f1ff1a3030 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -10,17 +10,22 @@ use super::BinaryArray; unsafe impl ToFfi for BinaryArray { fn buffers(&self) -> Vec>> { + let offset = self + .validity + .as_ref() + .map(|x| x.offset()) + .unwrap_or_default(); + + let offsets = std::ptr::NonNull::new(unsafe { + self.offsets.as_ptr().offset(-(offset as isize)) as *mut u8 + }); + vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), + offsets, std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), ] } - - #[inline] - fn offset(&self) -> usize { - self.offset - } } impl FromFfi for BinaryArray { @@ -33,17 +38,10 @@ impl FromFfi for BinaryArray { }; assert_eq!(data_type, expected); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let values = unsafe { array.buffer::(1) }?; - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } - Ok(Self::from_data_unchecked( Self::default_data_type(), offsets, diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 036e3f74122..ed446561dde 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -23,7 +23,6 @@ pub struct BinaryArray { offsets: Buffer, values: Buffer, validity: Option, - offset: usize, } // constructors @@ -71,7 +70,6 @@ impl BinaryArray { offsets, values, validity, - offset: 0, } } @@ -113,7 +111,6 @@ impl BinaryArray { offsets, values, validity, - offset: 0, } } @@ -146,7 +143,6 @@ impl BinaryArray { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } diff --git a/src/array/boolean/ffi.rs b/src/array/boolean/ffi.rs index e4264d93829..a69c81a461b 100644 --- a/src/array/boolean/ffi.rs +++ b/src/array/boolean/ffi.rs @@ -10,30 +10,28 @@ use super::BooleanArray; unsafe impl ToFfi for BooleanArray { fn buffers(&self) -> Vec>> { + let offset = self + .validity + .as_ref() + .map(|x| x.offset()) + .unwrap_or_default(); + + assert!(self.values.offset() >= offset); vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.values.as_ptr()), ] } - - fn offset(&self) -> usize { - self.offset - } } impl FromFfi for BooleanArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); assert_eq!(data_type, DataType::Boolean); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.bitmap(0) }?; - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } + let validity = unsafe { array.validity() }?; + let values = unsafe { array.bitmap(0) }?; + Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 065eb199775..15618f18f40 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -20,7 +20,6 @@ pub struct BooleanArray { data_type: DataType, values: Bitmap, validity: Option, - offset: usize, } impl BooleanArray { @@ -46,11 +45,13 @@ impl BooleanArray { if data_type.to_physical_type() != PhysicalType::Boolean { panic!("BooleanArray can only be initialized with DataType::Boolean") } + if matches!(&validity, Some(bitmap) if bitmap.offset() != values.offset()) { + panic!("validity's bitmap offset must equal values offset") + } Self { data_type, values, validity, - offset: 0, } } @@ -83,7 +84,6 @@ impl BooleanArray { data_type: self.data_type.clone(), values: self.values.clone().slice_unchecked(offset, length), validity, - offset: self.offset + offset, } } @@ -94,6 +94,9 @@ impl BooleanArray { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity should be as least as large as the array") } + if matches!(&validity, Some(bitmap) if bitmap.offset() != self.values.offset()) { + panic!("validity's bitmap offset must equal values offset") + } let mut arr = self.clone(); arr.validity = validity; arr diff --git a/src/array/dictionary/ffi.rs b/src/array/dictionary/ffi.rs index c6f8f8fa201..5a288afb6eb 100644 --- a/src/array/dictionary/ffi.rs +++ b/src/array/dictionary/ffi.rs @@ -10,25 +10,14 @@ unsafe impl ToFfi for DictionaryArray { fn buffers(&self) -> Vec>> { self.keys.buffers() } - - #[inline] - fn offset(&self) -> usize { - self.offset - } } impl FromFfi for DictionaryArray { unsafe fn try_from_ffi(array: A) -> Result { // keys: similar to PrimitiveArray, but the datatype is the inner one - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let values = unsafe { array.buffer::(0) }?; - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } let keys = PrimitiveArray::::from_data(K::DATA_TYPE, values, validity); // values let values = array.dictionary()?.unwrap(); diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index 3b33cb3e17b..012d937d6d9 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -37,7 +37,6 @@ pub struct DictionaryArray { data_type: DataType, keys: PrimitiveArray, values: Arc, - offset: usize, } impl DictionaryArray { @@ -69,7 +68,6 @@ impl DictionaryArray { data_type, keys, values, - offset: 0, } } @@ -81,7 +79,6 @@ impl DictionaryArray { data_type: self.data_type.clone(), keys: self.keys.clone().slice(offset, length), values: self.values.clone(), - offset: self.offset + offset, } } @@ -93,7 +90,6 @@ impl DictionaryArray { data_type: self.data_type.clone(), keys: self.keys.clone().slice_unchecked(offset, length), values: self.values.clone(), - offset: self.offset + offset, } } diff --git a/src/array/ffi.rs b/src/array/ffi.rs index 176b0966ff5..9ed0bfca080 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -13,9 +13,6 @@ pub unsafe trait ToFfi { /// The pointers to the buffers. fn buffers(&self) -> Vec>>; - /// The offset - fn offset(&self) -> usize; - /// The children fn children(&self) -> Vec> { vec![] diff --git a/src/array/fixed_size_binary/ffi.rs b/src/array/fixed_size_binary/ffi.rs new file mode 100644 index 00000000000..ff8a89d763d --- /dev/null +++ b/src/array/fixed_size_binary/ffi.rs @@ -0,0 +1,23 @@ +use crate::array::ffi::ToFfi; + +use super::FixedSizeBinaryArray; + +unsafe impl ToFfi for FixedSizeBinaryArray { + fn buffers(&self) -> Vec>> { + let offset = self + .validity + .as_ref() + .map(|x| x.offset()) + .unwrap_or_default(); + let offset = offset * self.size(); + + // note that this may point to _before_ the pointer. This is fine because the C data interface + // requires users to only access past the offset + let values = + std::ptr::NonNull::new( + unsafe { self.values.as_ptr().offset(-(offset as isize)) } as *mut u8 + ); + + vec![self.validity.as_ref().map(|x| x.as_ptr()), values] + } +} diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index 3091aae3b62..faf3ec7ff72 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -1,7 +1,8 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; -use super::{display_fmt, display_helper, ffi::ToFfi, Array}; +use super::{display_fmt, display_helper, Array}; +mod ffi; mod iterator; mod mutable; pub use mutable::*; @@ -14,7 +15,6 @@ pub struct FixedSizeBinaryArray { data_type: DataType, values: Buffer, validity: Option, - offset: usize, } impl FixedSizeBinaryArray { @@ -47,7 +47,6 @@ impl FixedSizeBinaryArray { data_type, values, validity, - offset: 0, } } @@ -83,7 +82,6 @@ impl FixedSizeBinaryArray { size: self.size, values, validity, - offset: self.offset + offset, } } @@ -182,19 +180,6 @@ impl std::fmt::Display for FixedSizeBinaryArray { } } -unsafe impl ToFfi for FixedSizeBinaryArray { - fn buffers(&self) -> Vec>> { - vec![ - self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), - ] - } - - fn offset(&self) -> usize { - self.offset - } -} - impl FixedSizeBinaryArray { /// Creates a [`FixedSizeBinaryArray`] from an fallible iterator of optional `[u8]`. pub fn try_from_iter, I: IntoIterator>>( diff --git a/src/array/fixed_size_list/ffi.rs b/src/array/fixed_size_list/ffi.rs new file mode 100644 index 00000000000..c803dc6ac25 --- /dev/null +++ b/src/array/fixed_size_list/ffi.rs @@ -0,0 +1,16 @@ +use std::sync::Arc; + +use crate::array::ffi::ToFfi; +use crate::array::Array; + +use super::FixedSizeListArray; + +unsafe impl ToFfi for FixedSizeListArray { + fn buffers(&self) -> Vec>> { + vec![self.validity.as_ref().map(|x| x.as_ptr())] + } + + fn children(&self) -> Vec> { + vec![self.values().clone()] + } +} diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 5b78569eed4..6c29c2a6b58 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -5,8 +5,9 @@ use crate::{ datatypes::{DataType, Field}, }; -use super::{display_fmt, ffi::ToFfi, new_empty_array, new_null_array, Array}; +use super::{display_fmt, new_empty_array, new_null_array, Array}; +mod ffi; mod iterator; pub use iterator::*; mod mutable; @@ -20,7 +21,6 @@ pub struct FixedSizeListArray { data_type: DataType, values: Arc, validity: Option, - offset: usize, } impl FixedSizeListArray { @@ -60,7 +60,6 @@ impl FixedSizeListArray { data_type, values, validity, - offset: 0, } } @@ -97,7 +96,6 @@ impl FixedSizeListArray { size: self.size, values, validity, - offset: self.offset + offset, } } @@ -194,17 +192,3 @@ impl std::fmt::Display for FixedSizeListArray { display_fmt(self.iter(), "FixedSizeListArray", f, true) } } - -unsafe impl ToFfi for FixedSizeListArray { - fn buffers(&self) -> Vec>> { - vec![self.validity.as_ref().map(|x| x.as_ptr())] - } - - fn offset(&self) -> usize { - self.offset - } - - fn children(&self) -> Vec> { - vec![self.values().clone()] - } -} diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 14235ce1ddd..cb354727f92 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -7,14 +7,20 @@ use super::ListArray; unsafe impl ToFfi for ListArray { fn buffers(&self) -> Vec>> { - vec![ - self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), - ] - } + let offset = self + .validity + .as_ref() + .map(|x| x.offset()) + .unwrap_or_default(); + + // note that this may point to _before_ the pointer. This is fine because the C data interface + // requires users to only access past the offset + let offsets = + std::ptr::NonNull::new( + unsafe { self.offsets.as_ptr().offset(-(offset as isize)) } as *mut u8 + ); - fn offset(&self) -> usize { - self.offset + vec![self.validity.as_ref().map(|x| x.as_ptr()), offsets] } fn children(&self) -> Vec> { @@ -25,17 +31,11 @@ unsafe impl ToFfi for ListArray { impl FromFfi for ListArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let child = array.child(0)?; let values = ffi::try_from(child)?.into(); - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 4a92c352fa3..89566571dcc 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -25,7 +25,6 @@ pub struct ListArray { offsets: Buffer, values: Arc, validity: Option, - offset: usize, } impl ListArray { @@ -78,7 +77,6 @@ impl ListArray { offsets, values, validity, - offset: 0, } } @@ -107,7 +105,6 @@ impl ListArray { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index 50b9039c351..8161a17d02f 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -7,14 +7,17 @@ use super::MapArray; unsafe impl ToFfi for MapArray { fn buffers(&self) -> Vec>> { - vec![ - self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), - ] - } + let offset = self + .validity + .as_ref() + .map(|x| x.offset()) + .unwrap_or_default(); + + let offsets = std::ptr::NonNull::new(unsafe { + self.offsets.as_ptr().offset(-(offset as isize)) as *mut u8 + }); - fn offset(&self) -> usize { - self.offset + vec![self.validity.as_ref().map(|x| x.as_ptr()), offsets] } fn children(&self) -> Vec> { @@ -25,17 +28,11 @@ unsafe impl ToFfi for MapArray { impl FromFfi for MapArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let child = array.child(0)?; let values = ffi::try_from(child)?.into(); - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/map/mod.rs b/src/array/map/mod.rs index 0efc83e7a4e..ee908be847f 100644 --- a/src/array/map/mod.rs +++ b/src/array/map/mod.rs @@ -22,7 +22,6 @@ pub struct MapArray { field: Arc, // invariant: offsets.len() - 1 == Bitmap::len() validity: Option, - offset: usize, } impl MapArray { @@ -81,7 +80,6 @@ impl MapArray { data_type, field, offsets, - offset: 0, validity, } } @@ -111,7 +109,6 @@ impl MapArray { offsets, field: self.field.clone(), validity, - offset: self.offset + offset, } } } diff --git a/src/array/null.rs b/src/array/null.rs index 1906cc140f4..e20f754cc75 100644 --- a/src/array/null.rs +++ b/src/array/null.rs @@ -7,7 +7,6 @@ use super::{ffi::ToFfi, Array}; pub struct NullArray { data_type: DataType, length: usize, - offset: usize, } impl NullArray { @@ -23,19 +22,14 @@ impl NullArray { /// Returns a new [`NullArray`]. pub fn from_data(data_type: DataType, length: usize) -> Self { - Self { - data_type, - length, - offset: 0, - } + Self { data_type, length } } /// Returns a slice of the [`NullArray`]. - pub fn slice(&self, offset: usize, length: usize) -> Self { + pub fn slice(&self, _offset: usize, length: usize) -> Self { Self { data_type: self.data_type.clone(), length, - offset: self.offset + offset, } } } @@ -81,9 +75,4 @@ unsafe impl ToFfi for NullArray { fn buffers(&self) -> Vec>> { vec![] } - - #[inline] - fn offset(&self) -> usize { - self.offset - } } diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index 3418e2516fc..f009a2fb832 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -10,30 +10,24 @@ use super::PrimitiveArray; unsafe impl ToFfi for PrimitiveArray { fn buffers(&self) -> Vec>> { + let offset = self + .validity + .as_ref() + .map(|x| x.offset()) + .unwrap_or_default() as isize; vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), + std::ptr::NonNull::new(unsafe { self.values.as_ptr().offset(-offset) as *mut u8 }), ] } - - #[inline] - fn offset(&self) -> usize { - self.offset - } } impl FromFfi for PrimitiveArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut values = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let values = unsafe { array.buffer::(0) }?; - if offset > 0 { - values = values.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index f75ffd9a2b0..5a127968d0b 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -35,7 +35,6 @@ pub struct PrimitiveArray { data_type: DataType, values: Buffer, validity: Option, - offset: usize, } impl PrimitiveArray { @@ -75,7 +74,6 @@ impl PrimitiveArray { data_type, values, validity, - offset: 0, } } @@ -108,7 +106,6 @@ impl PrimitiveArray { data_type: self.data_type.clone(), values: self.values.clone().slice_unchecked(offset, length), validity, - offset: self.offset + offset, } } @@ -171,7 +168,6 @@ impl PrimitiveArray { data_type, values: self.values, validity: self.validity, - offset: self.offset, } } } diff --git a/src/array/specification.rs b/src/array/specification.rs index 1edcb70653f..0b26f4a67fc 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -100,12 +100,11 @@ pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { } /// # Panics iff: +/// * the `offsets` is empty, or /// * the `offsets` is not monotonically increasing, or /// * any offset is larger or equal to `values_len`. pub fn check_offsets(offsets: &[O], values_len: usize) { - if offsets.is_empty() { - return; - } + assert!(!offsets.is_empty()); let mut last = offsets[0]; // assert monotonicity diff --git a/src/array/struct_.rs b/src/array/struct_.rs index 8dc327a5719..d7e504436b1 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -223,11 +223,6 @@ unsafe impl ToFfi for StructArray { vec![self.validity.as_ref().map(|x| x.as_ptr())] } - fn offset(&self) -> usize { - // we do not support offsets in structs. Instead, if an FFI we slice the incoming arrays - 0 - } - fn children(&self) -> Vec> { self.values.clone() } @@ -238,9 +233,7 @@ impl FromFfi for StructArray { let field = array.field(); let fields = Self::get_fields(field.data_type()).to_vec(); - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; + let validity = unsafe { array.validity() }?; let values = (0..fields.len()) .map(|index| { let child = array.child(index)?; @@ -248,9 +241,6 @@ impl FromFfi for StructArray { }) .collect::>>>()?; - if offset > 0 { - validity = validity.map(|x| x.slice(offset, length)) - } Ok(Self::from_data(DataType::Struct(fields), values, validity)) } } diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index 640a7d6f0ab..7f9dfbb7273 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -18,10 +18,6 @@ unsafe impl ToFfi for UnionArray { } } - fn offset(&self) -> usize { - self.offset - } - fn children(&self) -> Vec> { self.fields.clone() } @@ -33,15 +29,13 @@ impl FromFfi for UnionArray { let data_type = field.data_type().clone(); let fields = Self::get_fields(field.data_type()); - let mut types = unsafe { array.buffer::(0) }?; + let types = unsafe { array.buffer::(0) }?; let offsets = if Self::is_sparse(&data_type) { None } else { Some(unsafe { array.buffer::(1) }?) }; - let length = array.array().len(); - let offset = array.array().offset(); let fields = (0..fields.len()) .map(|index| { let child = array.child(index)?; @@ -49,10 +43,6 @@ impl FromFfi for UnionArray { }) .collect::>>>()?; - if offset > 0 { - types = types.slice(offset, length); - }; - Ok(Self::from_data(data_type, types, fields, offsets)) } } diff --git a/src/array/union/mod.rs b/src/array/union/mod.rs index b5a6a1d2605..026db7eb684 100644 --- a/src/array/union/mod.rs +++ b/src/array/union/mod.rs @@ -31,7 +31,6 @@ pub struct UnionArray { fields: Vec>, offsets: Option>, data_type: DataType, - offset: usize, } impl UnionArray { @@ -78,7 +77,6 @@ impl UnionArray { fields, offsets, types: Buffer::new(), - offset: 0, } } else { panic!("Union struct must be created with the corresponding Union DataType") @@ -125,7 +123,6 @@ impl UnionArray { fields, offsets, types, - offset: 0, } } @@ -193,7 +190,6 @@ impl UnionArray { fields_hash: self.fields_hash.clone(), types: self.types.clone().slice(offset, length), offsets: self.offsets.clone(), - offset: self.offset + offset, } } @@ -210,7 +206,6 @@ impl UnionArray { fields_hash: self.fields_hash.clone(), types: self.types.clone().slice_unchecked(offset, length), offsets: self.offsets.clone(), - offset: self.offset + offset, } } } diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 4cc716b8e4d..0ef904fa09d 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -8,30 +8,30 @@ use super::Utf8Array; unsafe impl ToFfi for Utf8Array { fn buffers(&self) -> Vec>> { + let offset = self + .validity + .as_ref() + .map(|x| x.offset()) + .unwrap_or_default(); + + let offsets = std::ptr::NonNull::new(unsafe { + self.offsets.as_ptr().offset(-(offset as isize)) as *mut u8 + }); + vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), + offsets, std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), ] } - - fn offset(&self) -> usize { - self.offset - } } impl FromFfi for Utf8Array { unsafe fn try_from_ffi(array: A) -> Result { - let length = array.array().len(); - let offset = array.array().offset(); - let mut validity = unsafe { array.validity() }?; - let mut offsets = unsafe { array.buffer::(0) }?; + let validity = unsafe { array.validity() }?; + let offsets = unsafe { array.buffer::(0) }?; let values = unsafe { array.buffer::(1)? }; - if offset > 0 { - offsets = offsets.slice(offset, length); - validity = validity.map(|x| x.slice(offset, length)) - } let data_type = Self::default_data_type(); Ok(Self::from_data_unchecked( data_type, offsets, values, validity, diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 1d6b75faec3..ef895092dc9 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -36,7 +36,6 @@ pub struct Utf8Array { offsets: Buffer, values: Buffer, validity: Option, - offset: usize, } impl Utf8Array { @@ -86,7 +85,6 @@ impl Utf8Array { offsets, values, validity, - offset: 0, } } @@ -128,7 +126,6 @@ impl Utf8Array { offsets, values, validity, - offset: 0, } } @@ -161,7 +158,6 @@ impl Utf8Array { offsets, values: self.values.clone(), validity, - offset: self.offset + offset, } } diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index c2f5e91fd9d..3932a9d451d 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -66,23 +66,28 @@ impl Bitmap { /// # Panic /// Panics iff `length <= bytes.len() * 8` #[inline] - pub(crate) fn from_bytes(bytes: Bytes, length: usize) -> Self { - assert!(length <= bytes.len() * 8); - let null_count = count_zeros(&bytes, 0, length); + pub(crate) fn from_bytes(bytes: Bytes, offset: usize, length: usize) -> Self { + assert!(offset + length <= bytes.len() * 8); + let null_count = count_zeros(&bytes, offset, length); Self { length, - offset: 0, + offset, bytes: Arc::new(bytes), null_count, } } + #[inline] + pub(crate) fn offset(&self) -> usize { + self.offset + } + /// Creates a new [`Bitmap`] from [`MutableBuffer`] and a length. /// # Panic /// Panics iff `length <= buffer.len() * 8` #[inline] pub fn from_u8_buffer(buffer: MutableBuffer, length: usize) -> Self { - Bitmap::from_bytes(buffer.into(), length) + Bitmap::from_bytes(buffer.into(), 0, length) } /// Creates a new [`Bitmap`] from a slice and length. diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 58631abde76..b7e8b8d3d2b 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -230,7 +230,7 @@ impl MutableBitmap { impl From for Bitmap { #[inline] fn from(buffer: MutableBitmap) -> Self { - Bitmap::from_bytes(buffer.buffer.into(), buffer.length) + Bitmap::from_bytes(buffer.buffer.into(), 0, buffer.length) } } @@ -238,7 +238,7 @@ impl From for Option { #[inline] fn from(buffer: MutableBitmap) -> Self { if buffer.null_count() > 0 { - Some(Bitmap::from_bytes(buffer.buffer.into(), buffer.length)) + Some(Bitmap::from_bytes(buffer.buffer.into(), 0, buffer.length)) } else { None } diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index eab542e67e3..861d121e96d 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -63,11 +63,11 @@ impl Buffer { } /// Auxiliary method to create a new Buffer - pub(crate) fn from_bytes(bytes: Bytes) -> Self { - let length = bytes.len(); + pub(crate) fn from_bytes(bytes: Bytes, offset: usize) -> Self { + let length = bytes.len() - offset; Buffer { data: Arc::new(bytes), - offset: 0, + offset, length, } } diff --git a/src/buffer/mutable.rs b/src/buffer/mutable.rs index 70c4e4415d0..b7790e8ec79 100644 --- a/src/buffer/mutable.rs +++ b/src/buffer/mutable.rs @@ -431,7 +431,7 @@ impl> From

for MutableBuffer { impl From> for Buffer { #[inline] fn from(buffer: MutableBuffer) -> Self { - Self::from_bytes(buffer.into()) + Self::from_bytes(buffer.into(), 0) } } diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index c4b839b332f..f4960520594 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -96,6 +96,12 @@ impl Ffi_ArrowArray { /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. pub(crate) fn new(array: Arc) -> Self { + let offset = array + .validity() + .as_ref() + .map(|x| x.offset()) + .unwrap_or_default() as i64; + let (buffers, children, dictionary) = buffers_children_dictionary(array.as_ref()); let buffers_ptr = buffers @@ -130,10 +136,10 @@ impl Ffi_ArrowArray { Self { length, null_count, - offset: 0i64, + offset, n_buffers, n_children, - buffers: private_data.buffers_ptr.as_mut_ptr(), + buffers: unsafe { private_data.buffers_ptr.as_mut_ptr() }, children: private_data.children_ptr.as_mut_ptr(), dictionary: private_data.dictionary_ptr.unwrap_or(std::ptr::null_mut()), release: Some(c_release_array), @@ -157,16 +163,6 @@ impl Ffi_ArrowArray { } } - /// the length of the array - pub(crate) fn len(&self) -> usize { - self.length as usize - } - - /// the offset of the array - pub(crate) fn offset(&self) -> usize { - self.offset as usize - } - /// the null count of the array pub(crate) fn null_count(&self) -> usize { self.null_count as usize @@ -188,6 +184,7 @@ unsafe fn create_buffer( } let buffers = array.buffers as *mut *const u8; + let offset = buffer_offset(array, data_type, index); let len = buffer_len(array, data_type, index)?; assert!(index < array.n_buffers as usize); @@ -195,10 +192,10 @@ unsafe fn create_buffer( let ptr = NonNull::new(ptr as *mut T); let bytes = ptr - .map(|ptr| Bytes::new(ptr, len, deallocation)) + .map(|ptr| Bytes::new(ptr, offset + len, deallocation)) .ok_or_else(|| ArrowError::Ffi(format!("The buffer at position {} is null", index))); - bytes.map(Buffer::from_bytes) + bytes.map(|x| Buffer::from_bytes(x, offset)) } /// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer). @@ -217,6 +214,8 @@ unsafe fn create_bitmap( return Err(ArrowError::Ffi("The array buffers are null".to_string())); } let len = array.length as usize; + let offset = array.offset as usize; + let buffers = array.buffers as *mut *const u8; assert!(index < array.n_buffers as usize); @@ -233,7 +232,19 @@ unsafe fn create_bitmap( )) })?; - Ok(Bitmap::from_bytes(bytes, len)) + Ok(Bitmap::from_bytes(bytes, offset, len)) +} + +/// Returns the offset, in slots, of the buffer `i` (indexed according to the C data interface) +fn buffer_offset(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> usize { + match (data_type.to_physical_type(), i) { + (PhysicalType::Utf8, 2) + | (PhysicalType::Binary, 2) + | (PhysicalType::LargeUtf8, 2) + | (PhysicalType::LargeBinary, 2) => 0, + // buffer len of primitive types + _ => array.offset as usize, + } } /// Returns the length, in slots, of the buffer `i` (indexed according to the C data interface) @@ -255,22 +266,24 @@ fn buffer_len(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> Result< (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; + let offset = buffer_offset(array, data_type, 1); // first buffer is the null buffer => add(1) let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) }; // interpret as i32 let offset_buffer = offset_buffer as *const i32; // get last offset - (unsafe { *offset_buffer.add(len - 1) }) as usize + (unsafe { *offset_buffer.add(offset).add(len - 1) }) as usize } (PhysicalType::LargeUtf8, 2) | (PhysicalType::LargeBinary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; + let offset = buffer_offset(array, data_type, 1); // first buffer is the null buffer => add(1) let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) }; // interpret as i64 let offset_buffer = offset_buffer as *const i64; // get last offset - (unsafe { *offset_buffer.add(len - 1) }) as usize + (unsafe { *offset_buffer.add(offset).add(len - 1) }) as usize } // buffer len of primitive types _ => array.length as usize, diff --git a/src/io/ipc/read/read_basic.rs b/src/io/ipc/read/read_basic.rs index 33c01a67d1a..85975565959 100644 --- a/src/io/ipc/read/read_basic.rs +++ b/src/io/ipc/read/read_basic.rs @@ -231,7 +231,7 @@ pub fn read_bitmap( read_uncompressed_bitmap(length, bytes, reader) }?; - Ok(Bitmap::from_bytes(buffer.into(), length)) + Ok(Bitmap::from_bytes(buffer.into(), 0, length)) } pub fn read_validity( diff --git a/tests/it/ffi.rs b/tests/it/ffi.rs index facc143988c..c7912d6cd81 100644 --- a/tests/it/ffi.rs +++ b/tests/it/ffi.rs @@ -1,13 +1,12 @@ use arrow2::array::*; +use arrow2::bitmap::Bitmap; use arrow2::datatypes::{DataType, Field, TimeUnit}; use arrow2::{error::Result, ffi}; use std::collections::BTreeMap; use std::sync::Arc; -fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { - let array: Arc = Arc::new(expected.clone()); +fn _test_round_trip(array: Arc, expected: Box) -> Result<()> { let field = Field::new("a", array.data_type().clone(), true); - let expected = Box::new(expected) as Box; let array_ptr = Box::new(ffi::Ffi_ArrowArray::empty()); let schema_ptr = Box::new(ffi::Ffi_ArrowSchema::empty()); @@ -32,6 +31,15 @@ fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { Ok(()) } +fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { + let array: Arc = Arc::new(expected.clone()); + let expected = Box::new(expected) as Box; + _test_round_trip(array.clone(), clone(expected.as_ref()))?; + + // sliced + _test_round_trip(array.slice(1, 2).into(), expected.slice(1, 2)) +} + fn test_round_trip_schema(field: Field) -> Result<()> { // create a `ArrowArray` from the data. let schema_ptr = Box::new(ffi::Ffi_ArrowSchema::empty()); @@ -138,6 +146,17 @@ fn list_list() -> Result<()> { test_round_trip(array) } +#[test] +fn struct_() -> Result<()> { + let data_type = DataType::Struct(vec![Field::new("a", DataType::Int32, true)]); + let values = vec![Arc::new(Int32Array::from([Some(1), None, Some(3)])) as Arc]; + let validity = Bitmap::from([true, false, true]); + + let array = StructArray::from_data(data_type, values, validity.into()); + + test_round_trip(array) +} + #[test] fn dict() -> Result<()> { let data = vec![Some("a"), Some("a"), None, Some("b")];