From 24851fa5968674aae782e961164d166c826faefb Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Tue, 26 Oct 2021 16:18:44 +0000 Subject: [PATCH] Revert "Fixed ffi of sliced arrays (#540)" This reverts commit e6b6c8356911333bf08acd2a4234ebc25599e479. --- .../tests/test_sql.py | 81 ++----------------- src/array/README.md | 11 +++ src/array/binary/ffi.rs | 28 ++++--- src/array/binary/mod.rs | 4 + src/array/boolean/ffi.rs | 22 ++--- src/array/boolean/mod.rs | 9 +-- src/array/dictionary/ffi.rs | 15 +++- src/array/dictionary/mod.rs | 4 + src/array/ffi.rs | 3 + src/array/fixed_size_binary/ffi.rs | 23 ------ src/array/fixed_size_binary/mod.rs | 19 ++++- src/array/fixed_size_list/ffi.rs | 16 ---- src/array/fixed_size_list/mod.rs | 20 ++++- src/array/list/ffi.rs | 30 +++---- src/array/list/mod.rs | 3 + src/array/map/ffi.rs | 27 ++++--- src/array/map/mod.rs | 3 + src/array/null.rs | 15 +++- src/array/primitive/ffi.rs | 22 +++-- src/array/primitive/mod.rs | 4 + src/array/specification.rs | 5 +- src/array/struct_.rs | 12 ++- src/array/union/ffi.rs | 12 ++- src/array/union/mod.rs | 5 ++ src/array/utf8/ffi.rs | 26 +++--- src/array/utf8/mod.rs | 4 + src/bitmap/immutable.rs | 15 ++-- src/bitmap/mutable.rs | 4 +- src/buffer/immutable.rs | 6 +- src/buffer/mutable.rs | 2 +- src/ffi/ffi.rs | 47 ++++------- src/io/ipc/read/read_basic.rs | 2 +- tests/it/ffi.rs | 25 +----- 33 files changed, 254 insertions(+), 270 deletions(-) delete mode 100644 src/array/fixed_size_binary/ffi.rs delete mode 100644 src/array/fixed_size_list/ffi.rs diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 27da0e63515..8aef60832cf 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -39,52 +39,15 @@ def tearDown(self): # No leak of C++ memory self.assertEqual(self.old_allocated_cpp, pyarrow.total_allocated_bytes()) - def test_primitive(self): - a = pyarrow.array([0, None, 2, 3, 4]) - b = arrow_pyarrow_integration_testing.round_trip_array(a) - - b.validate(full=True) - assert a.to_pylist() == b.to_pylist() - assert a.type == b.type - - def test_primitive_sliced(self): - a = pyarrow.array([0, None, 2, 3, 4]).slice(1, 2) - b = arrow_pyarrow_integration_testing.round_trip_array(a) - - b.validate(full=True) - assert a.to_pylist() == b.to_pylist() - assert a.type == b.type - - def test_boolean(self): - a = pyarrow.array([True, None, False, True, False]) - b = arrow_pyarrow_integration_testing.round_trip_array(a) - - b.validate(full=True) - assert a.to_pylist() == b.to_pylist() - assert a.type == b.type - - def test_boolean_sliced(self): - a = pyarrow.array([True, None, False, True, False]).slice(1, 2) - b = arrow_pyarrow_integration_testing.round_trip_array(a) - - b.validate(full=True) - assert a.to_pylist() == b.to_pylist() - assert a.type == b.type - - def test_string(self): + def test_string_roundtrip(self): + """ + Python -> Rust -> Python + """ a = pyarrow.array(["a", None, "ccc"]) b = arrow_pyarrow_integration_testing.round_trip_array(a) c = pyarrow.array(["a", None, "ccc"]) self.assertEqual(b, c) - def test_string_sliced(self): - a = pyarrow.array(["a", None, "ccc"]).slice(1, 2) - b = arrow_pyarrow_integration_testing.round_trip_array(a) - - b.validate(full=True) - assert a.to_pylist() == b.to_pylist() - assert a.type == b.type - def test_decimal_roundtrip(self): """ Python -> Rust -> Python @@ -112,17 +75,10 @@ def test_list_array(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type - def test_list_sliced(self): - a = pyarrow.array( - [[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64()) - ).slice(1, 2) - b = arrow_pyarrow_integration_testing.round_trip_array(a) - - b.validate(full=True) - assert a.to_pylist() == b.to_pylist() - assert a.type == b.type - - def test_struct(self): + def test_struct_array(self): + """ + Python -> Rust -> Python + """ fields = [ ("f1", pyarrow.int32()), ("f2", pyarrow.string()), @@ -143,27 +99,6 @@ def test_struct(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type - # see https://issues.apache.org/jira/browse/ARROW-14383 - def _test_struct_sliced(self): - fields = [ - ("f1", pyarrow.int32()), - ("f2", pyarrow.string()), - ] - a = pyarrow.array( - [ - {"f1": 1, "f2": "a"}, - None, - {"f1": 3, "f2": None}, - {"f1": None, "f2": "d"}, - {"f1": None, "f2": None}, - ], - pyarrow.struct(fields), - ).slice(1, 2) - b = arrow_pyarrow_integration_testing.round_trip_array(a) - b.validate(full=True) - assert a.to_pylist() == b.to_pylist() - assert a.type == b.type - def test_list_list_array(self): """ Python -> Rust -> Python diff --git a/src/array/README.md b/src/array/README.md index 9282cf611c6..a814839527c 100644 --- a/src/array/README.md +++ b/src/array/README.md @@ -41,6 +41,17 @@ This document describes the overall design of this module. * `from_trusted_len_values_iter` from an iterator of trusted len of values * `try_from_trusted_len_iter` from an fallible iterator of trusted len of optional values +### Slot offsets + +* An array MUST have a `offset: usize` measuring the number of slots that the array is currently offsetted by if the specification requires. + +* An array MUST implement `fn slice(&self, offset: usize, length: usize) -> Self` that returns an offseted and/or truncated clone of the array. This function MUST increase the array's offset if it exists. + +* Conversely, `offset` MUST only be changed by `slice`. + +The rational of the above is that it enable us to be fully interoperable with the offset logic supported by the C data interface, while at the same time easily perform array slices +within Rust's type safety mechanism. + ### Mutable Arrays * An array MAY have a mutable counterpart. E.g. `MutablePrimitiveArray` is the mutable counterpart of `PrimitiveArray`. diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 5f1ff1a3030..4ca8d981ebd 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -10,22 +10,17 @@ use super::BinaryArray; unsafe impl ToFfi for BinaryArray { fn buffers(&self) -> Vec>> { - let offset = self - .validity - .as_ref() - .map(|x| x.offset()) - .unwrap_or_default(); - - let offsets = std::ptr::NonNull::new(unsafe { - self.offsets.as_ptr().offset(-(offset as isize)) as *mut u8 - }); - vec![ self.validity.as_ref().map(|x| x.as_ptr()), - offsets, + std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), ] } + + #[inline] + fn offset(&self) -> usize { + self.offset + } } impl FromFfi for BinaryArray { @@ -38,10 +33,17 @@ impl FromFfi for BinaryArray { }; assert_eq!(data_type, expected); - let validity = unsafe { array.validity() }?; - let offsets = unsafe { array.buffer::(0) }?; + let length = array.array().len(); + let offset = array.array().offset(); + let mut validity = unsafe { array.validity() }?; + let mut offsets = unsafe { array.buffer::(0) }?; let values = unsafe { array.buffer::(1) }?; + if offset > 0 { + offsets = offsets.slice(offset, length); + validity = validity.map(|x| x.slice(offset, length)) + } + Ok(Self::from_data_unchecked( Self::default_data_type(), offsets, diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index ed446561dde..036e3f74122 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -23,6 +23,7 @@ pub struct BinaryArray { offsets: Buffer, values: Buffer, validity: Option, + offset: usize, } // constructors @@ -70,6 +71,7 @@ impl BinaryArray { offsets, values, validity, + offset: 0, } } @@ -111,6 +113,7 @@ impl BinaryArray { offsets, values, validity, + offset: 0, } } @@ -143,6 +146,7 @@ impl BinaryArray { offsets, values: self.values.clone(), validity, + offset: self.offset + offset, } } diff --git a/src/array/boolean/ffi.rs b/src/array/boolean/ffi.rs index a69c81a461b..e4264d93829 100644 --- a/src/array/boolean/ffi.rs +++ b/src/array/boolean/ffi.rs @@ -10,28 +10,30 @@ use super::BooleanArray; unsafe impl ToFfi for BooleanArray { fn buffers(&self) -> Vec>> { - let offset = self - .validity - .as_ref() - .map(|x| x.offset()) - .unwrap_or_default(); - - assert!(self.values.offset() >= offset); vec![ self.validity.as_ref().map(|x| x.as_ptr()), Some(self.values.as_ptr()), ] } + + fn offset(&self) -> usize { + self.offset + } } impl FromFfi for BooleanArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); assert_eq!(data_type, DataType::Boolean); + let length = array.array().len(); + let offset = array.array().offset(); + let mut validity = unsafe { array.validity() }?; + let mut values = unsafe { array.bitmap(0) }?; - let validity = unsafe { array.validity() }?; - let values = unsafe { array.bitmap(0) }?; - + if offset > 0 { + values = values.slice(offset, length); + validity = validity.map(|x| x.slice(offset, length)) + } Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 15618f18f40..065eb199775 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -20,6 +20,7 @@ pub struct BooleanArray { data_type: DataType, values: Bitmap, validity: Option, + offset: usize, } impl BooleanArray { @@ -45,13 +46,11 @@ impl BooleanArray { if data_type.to_physical_type() != PhysicalType::Boolean { panic!("BooleanArray can only be initialized with DataType::Boolean") } - if matches!(&validity, Some(bitmap) if bitmap.offset() != values.offset()) { - panic!("validity's bitmap offset must equal values offset") - } Self { data_type, values, validity, + offset: 0, } } @@ -84,6 +83,7 @@ impl BooleanArray { data_type: self.data_type.clone(), values: self.values.clone().slice_unchecked(offset, length), validity, + offset: self.offset + offset, } } @@ -94,9 +94,6 @@ impl BooleanArray { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity should be as least as large as the array") } - if matches!(&validity, Some(bitmap) if bitmap.offset() != self.values.offset()) { - panic!("validity's bitmap offset must equal values offset") - } let mut arr = self.clone(); arr.validity = validity; arr diff --git a/src/array/dictionary/ffi.rs b/src/array/dictionary/ffi.rs index 5a288afb6eb..c6f8f8fa201 100644 --- a/src/array/dictionary/ffi.rs +++ b/src/array/dictionary/ffi.rs @@ -10,14 +10,25 @@ unsafe impl ToFfi for DictionaryArray { fn buffers(&self) -> Vec>> { self.keys.buffers() } + + #[inline] + fn offset(&self) -> usize { + self.offset + } } impl FromFfi for DictionaryArray { unsafe fn try_from_ffi(array: A) -> Result { // keys: similar to PrimitiveArray, but the datatype is the inner one - let validity = unsafe { array.validity() }?; - let values = unsafe { array.buffer::(0) }?; + let length = array.array().len(); + let offset = array.array().offset(); + let mut validity = unsafe { array.validity() }?; + let mut values = unsafe { array.buffer::(0) }?; + if offset > 0 { + values = values.slice(offset, length); + validity = validity.map(|x| x.slice(offset, length)) + } let keys = PrimitiveArray::::from_data(K::DATA_TYPE, values, validity); // values let values = array.dictionary()?.unwrap(); diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index 012d937d6d9..3b33cb3e17b 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -37,6 +37,7 @@ pub struct DictionaryArray { data_type: DataType, keys: PrimitiveArray, values: Arc, + offset: usize, } impl DictionaryArray { @@ -68,6 +69,7 @@ impl DictionaryArray { data_type, keys, values, + offset: 0, } } @@ -79,6 +81,7 @@ impl DictionaryArray { data_type: self.data_type.clone(), keys: self.keys.clone().slice(offset, length), values: self.values.clone(), + offset: self.offset + offset, } } @@ -90,6 +93,7 @@ impl DictionaryArray { data_type: self.data_type.clone(), keys: self.keys.clone().slice_unchecked(offset, length), values: self.values.clone(), + offset: self.offset + offset, } } diff --git a/src/array/ffi.rs b/src/array/ffi.rs index 9ed0bfca080..176b0966ff5 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -13,6 +13,9 @@ pub unsafe trait ToFfi { /// The pointers to the buffers. fn buffers(&self) -> Vec>>; + /// The offset + fn offset(&self) -> usize; + /// The children fn children(&self) -> Vec> { vec![] diff --git a/src/array/fixed_size_binary/ffi.rs b/src/array/fixed_size_binary/ffi.rs deleted file mode 100644 index ff8a89d763d..00000000000 --- a/src/array/fixed_size_binary/ffi.rs +++ /dev/null @@ -1,23 +0,0 @@ -use crate::array::ffi::ToFfi; - -use super::FixedSizeBinaryArray; - -unsafe impl ToFfi for FixedSizeBinaryArray { - fn buffers(&self) -> Vec>> { - let offset = self - .validity - .as_ref() - .map(|x| x.offset()) - .unwrap_or_default(); - let offset = offset * self.size(); - - // note that this may point to _before_ the pointer. This is fine because the C data interface - // requires users to only access past the offset - let values = - std::ptr::NonNull::new( - unsafe { self.values.as_ptr().offset(-(offset as isize)) } as *mut u8 - ); - - vec![self.validity.as_ref().map(|x| x.as_ptr()), values] - } -} diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index faf3ec7ff72..3091aae3b62 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -1,8 +1,7 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; -use super::{display_fmt, display_helper, Array}; +use super::{display_fmt, display_helper, ffi::ToFfi, Array}; -mod ffi; mod iterator; mod mutable; pub use mutable::*; @@ -15,6 +14,7 @@ pub struct FixedSizeBinaryArray { data_type: DataType, values: Buffer, validity: Option, + offset: usize, } impl FixedSizeBinaryArray { @@ -47,6 +47,7 @@ impl FixedSizeBinaryArray { data_type, values, validity, + offset: 0, } } @@ -82,6 +83,7 @@ impl FixedSizeBinaryArray { size: self.size, values, validity, + offset: self.offset + offset, } } @@ -180,6 +182,19 @@ impl std::fmt::Display for FixedSizeBinaryArray { } } +unsafe impl ToFfi for FixedSizeBinaryArray { + fn buffers(&self) -> Vec>> { + vec![ + self.validity.as_ref().map(|x| x.as_ptr()), + std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), + ] + } + + fn offset(&self) -> usize { + self.offset + } +} + impl FixedSizeBinaryArray { /// Creates a [`FixedSizeBinaryArray`] from an fallible iterator of optional `[u8]`. pub fn try_from_iter, I: IntoIterator>>( diff --git a/src/array/fixed_size_list/ffi.rs b/src/array/fixed_size_list/ffi.rs deleted file mode 100644 index c803dc6ac25..00000000000 --- a/src/array/fixed_size_list/ffi.rs +++ /dev/null @@ -1,16 +0,0 @@ -use std::sync::Arc; - -use crate::array::ffi::ToFfi; -use crate::array::Array; - -use super::FixedSizeListArray; - -unsafe impl ToFfi for FixedSizeListArray { - fn buffers(&self) -> Vec>> { - vec![self.validity.as_ref().map(|x| x.as_ptr())] - } - - fn children(&self) -> Vec> { - vec![self.values().clone()] - } -} diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 6c29c2a6b58..5b78569eed4 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -5,9 +5,8 @@ use crate::{ datatypes::{DataType, Field}, }; -use super::{display_fmt, new_empty_array, new_null_array, Array}; +use super::{display_fmt, ffi::ToFfi, new_empty_array, new_null_array, Array}; -mod ffi; mod iterator; pub use iterator::*; mod mutable; @@ -21,6 +20,7 @@ pub struct FixedSizeListArray { data_type: DataType, values: Arc, validity: Option, + offset: usize, } impl FixedSizeListArray { @@ -60,6 +60,7 @@ impl FixedSizeListArray { data_type, values, validity, + offset: 0, } } @@ -96,6 +97,7 @@ impl FixedSizeListArray { size: self.size, values, validity, + offset: self.offset + offset, } } @@ -192,3 +194,17 @@ impl std::fmt::Display for FixedSizeListArray { display_fmt(self.iter(), "FixedSizeListArray", f, true) } } + +unsafe impl ToFfi for FixedSizeListArray { + fn buffers(&self) -> Vec>> { + vec![self.validity.as_ref().map(|x| x.as_ptr())] + } + + fn offset(&self) -> usize { + self.offset + } + + fn children(&self) -> Vec> { + vec![self.values().clone()] + } +} diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index cb354727f92..14235ce1ddd 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -7,20 +7,14 @@ use super::ListArray; unsafe impl ToFfi for ListArray { fn buffers(&self) -> Vec>> { - let offset = self - .validity - .as_ref() - .map(|x| x.offset()) - .unwrap_or_default(); - - // note that this may point to _before_ the pointer. This is fine because the C data interface - // requires users to only access past the offset - let offsets = - std::ptr::NonNull::new( - unsafe { self.offsets.as_ptr().offset(-(offset as isize)) } as *mut u8 - ); + vec![ + self.validity.as_ref().map(|x| x.as_ptr()), + std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), + ] + } - vec![self.validity.as_ref().map(|x| x.as_ptr()), offsets] + fn offset(&self) -> usize { + self.offset } fn children(&self) -> Vec> { @@ -31,11 +25,17 @@ unsafe impl ToFfi for ListArray { impl FromFfi for ListArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let validity = unsafe { array.validity() }?; - let offsets = unsafe { array.buffer::(0) }?; + let length = array.array().len(); + let offset = array.array().offset(); + let mut validity = unsafe { array.validity() }?; + let mut offsets = unsafe { array.buffer::(0) }?; let child = array.child(0)?; let values = ffi::try_from(child)?.into(); + if offset > 0 { + offsets = offsets.slice(offset, length); + validity = validity.map(|x| x.slice(offset, length)) + } Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 89566571dcc..4a92c352fa3 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -25,6 +25,7 @@ pub struct ListArray { offsets: Buffer, values: Arc, validity: Option, + offset: usize, } impl ListArray { @@ -77,6 +78,7 @@ impl ListArray { offsets, values, validity, + offset: 0, } } @@ -105,6 +107,7 @@ impl ListArray { offsets, values: self.values.clone(), validity, + offset: self.offset + offset, } } diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index 8161a17d02f..50b9039c351 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -7,17 +7,14 @@ use super::MapArray; unsafe impl ToFfi for MapArray { fn buffers(&self) -> Vec>> { - let offset = self - .validity - .as_ref() - .map(|x| x.offset()) - .unwrap_or_default(); - - let offsets = std::ptr::NonNull::new(unsafe { - self.offsets.as_ptr().offset(-(offset as isize)) as *mut u8 - }); + vec![ + self.validity.as_ref().map(|x| x.as_ptr()), + std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), + ] + } - vec![self.validity.as_ref().map(|x| x.as_ptr()), offsets] + fn offset(&self) -> usize { + self.offset } fn children(&self) -> Vec> { @@ -28,11 +25,17 @@ unsafe impl ToFfi for MapArray { impl FromFfi for MapArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let validity = unsafe { array.validity() }?; - let offsets = unsafe { array.buffer::(0) }?; + let length = array.array().len(); + let offset = array.array().offset(); + let mut validity = unsafe { array.validity() }?; + let mut offsets = unsafe { array.buffer::(0) }?; let child = array.child(0)?; let values = ffi::try_from(child)?.into(); + if offset > 0 { + offsets = offsets.slice(offset, length); + validity = validity.map(|x| x.slice(offset, length)) + } Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/map/mod.rs b/src/array/map/mod.rs index ee908be847f..0efc83e7a4e 100644 --- a/src/array/map/mod.rs +++ b/src/array/map/mod.rs @@ -22,6 +22,7 @@ pub struct MapArray { field: Arc, // invariant: offsets.len() - 1 == Bitmap::len() validity: Option, + offset: usize, } impl MapArray { @@ -80,6 +81,7 @@ impl MapArray { data_type, field, offsets, + offset: 0, validity, } } @@ -109,6 +111,7 @@ impl MapArray { offsets, field: self.field.clone(), validity, + offset: self.offset + offset, } } } diff --git a/src/array/null.rs b/src/array/null.rs index e20f754cc75..1906cc140f4 100644 --- a/src/array/null.rs +++ b/src/array/null.rs @@ -7,6 +7,7 @@ use super::{ffi::ToFfi, Array}; pub struct NullArray { data_type: DataType, length: usize, + offset: usize, } impl NullArray { @@ -22,14 +23,19 @@ impl NullArray { /// Returns a new [`NullArray`]. pub fn from_data(data_type: DataType, length: usize) -> Self { - Self { data_type, length } + Self { + data_type, + length, + offset: 0, + } } /// Returns a slice of the [`NullArray`]. - pub fn slice(&self, _offset: usize, length: usize) -> Self { + pub fn slice(&self, offset: usize, length: usize) -> Self { Self { data_type: self.data_type.clone(), length, + offset: self.offset + offset, } } } @@ -75,4 +81,9 @@ unsafe impl ToFfi for NullArray { fn buffers(&self) -> Vec>> { vec![] } + + #[inline] + fn offset(&self) -> usize { + self.offset + } } diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index f009a2fb832..3418e2516fc 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -10,24 +10,30 @@ use super::PrimitiveArray; unsafe impl ToFfi for PrimitiveArray { fn buffers(&self) -> Vec>> { - let offset = self - .validity - .as_ref() - .map(|x| x.offset()) - .unwrap_or_default() as isize; vec![ self.validity.as_ref().map(|x| x.as_ptr()), - std::ptr::NonNull::new(unsafe { self.values.as_ptr().offset(-offset) as *mut u8 }), + std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), ] } + + #[inline] + fn offset(&self) -> usize { + self.offset + } } impl FromFfi for PrimitiveArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); - let validity = unsafe { array.validity() }?; - let values = unsafe { array.buffer::(0) }?; + let length = array.array().len(); + let offset = array.array().offset(); + let mut validity = unsafe { array.validity() }?; + let mut values = unsafe { array.buffer::(0) }?; + if offset > 0 { + values = values.slice(offset, length); + validity = validity.map(|x| x.slice(offset, length)) + } Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 5a127968d0b..f75ffd9a2b0 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -35,6 +35,7 @@ pub struct PrimitiveArray { data_type: DataType, values: Buffer, validity: Option, + offset: usize, } impl PrimitiveArray { @@ -74,6 +75,7 @@ impl PrimitiveArray { data_type, values, validity, + offset: 0, } } @@ -106,6 +108,7 @@ impl PrimitiveArray { data_type: self.data_type.clone(), values: self.values.clone().slice_unchecked(offset, length), validity, + offset: self.offset + offset, } } @@ -168,6 +171,7 @@ impl PrimitiveArray { data_type, values: self.values, validity: self.validity, + offset: self.offset, } } } diff --git a/src/array/specification.rs b/src/array/specification.rs index 0b26f4a67fc..1edcb70653f 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -100,11 +100,12 @@ pub fn check_offsets_and_utf8(offsets: &[O], values: &[u8]) { } /// # Panics iff: -/// * the `offsets` is empty, or /// * the `offsets` is not monotonically increasing, or /// * any offset is larger or equal to `values_len`. pub fn check_offsets(offsets: &[O], values_len: usize) { - assert!(!offsets.is_empty()); + if offsets.is_empty() { + return; + } let mut last = offsets[0]; // assert monotonicity diff --git a/src/array/struct_.rs b/src/array/struct_.rs index d7e504436b1..8dc327a5719 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -223,6 +223,11 @@ unsafe impl ToFfi for StructArray { vec![self.validity.as_ref().map(|x| x.as_ptr())] } + fn offset(&self) -> usize { + // we do not support offsets in structs. Instead, if an FFI we slice the incoming arrays + 0 + } + fn children(&self) -> Vec> { self.values.clone() } @@ -233,7 +238,9 @@ impl FromFfi for StructArray { let field = array.field(); let fields = Self::get_fields(field.data_type()).to_vec(); - let validity = unsafe { array.validity() }?; + let length = array.array().len(); + let offset = array.array().offset(); + let mut validity = unsafe { array.validity() }?; let values = (0..fields.len()) .map(|index| { let child = array.child(index)?; @@ -241,6 +248,9 @@ impl FromFfi for StructArray { }) .collect::>>>()?; + if offset > 0 { + validity = validity.map(|x| x.slice(offset, length)) + } Ok(Self::from_data(DataType::Struct(fields), values, validity)) } } diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index 7f9dfbb7273..640a7d6f0ab 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -18,6 +18,10 @@ unsafe impl ToFfi for UnionArray { } } + fn offset(&self) -> usize { + self.offset + } + fn children(&self) -> Vec> { self.fields.clone() } @@ -29,13 +33,15 @@ impl FromFfi for UnionArray { let data_type = field.data_type().clone(); let fields = Self::get_fields(field.data_type()); - let types = unsafe { array.buffer::(0) }?; + let mut types = unsafe { array.buffer::(0) }?; let offsets = if Self::is_sparse(&data_type) { None } else { Some(unsafe { array.buffer::(1) }?) }; + let length = array.array().len(); + let offset = array.array().offset(); let fields = (0..fields.len()) .map(|index| { let child = array.child(index)?; @@ -43,6 +49,10 @@ impl FromFfi for UnionArray { }) .collect::>>>()?; + if offset > 0 { + types = types.slice(offset, length); + }; + Ok(Self::from_data(data_type, types, fields, offsets)) } } diff --git a/src/array/union/mod.rs b/src/array/union/mod.rs index 026db7eb684..b5a6a1d2605 100644 --- a/src/array/union/mod.rs +++ b/src/array/union/mod.rs @@ -31,6 +31,7 @@ pub struct UnionArray { fields: Vec>, offsets: Option>, data_type: DataType, + offset: usize, } impl UnionArray { @@ -77,6 +78,7 @@ impl UnionArray { fields, offsets, types: Buffer::new(), + offset: 0, } } else { panic!("Union struct must be created with the corresponding Union DataType") @@ -123,6 +125,7 @@ impl UnionArray { fields, offsets, types, + offset: 0, } } @@ -190,6 +193,7 @@ impl UnionArray { fields_hash: self.fields_hash.clone(), types: self.types.clone().slice(offset, length), offsets: self.offsets.clone(), + offset: self.offset + offset, } } @@ -206,6 +210,7 @@ impl UnionArray { fields_hash: self.fields_hash.clone(), types: self.types.clone().slice_unchecked(offset, length), offsets: self.offsets.clone(), + offset: self.offset + offset, } } } diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 0ef904fa09d..4cc716b8e4d 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -8,30 +8,30 @@ use super::Utf8Array; unsafe impl ToFfi for Utf8Array { fn buffers(&self) -> Vec>> { - let offset = self - .validity - .as_ref() - .map(|x| x.offset()) - .unwrap_or_default(); - - let offsets = std::ptr::NonNull::new(unsafe { - self.offsets.as_ptr().offset(-(offset as isize)) as *mut u8 - }); - vec![ self.validity.as_ref().map(|x| x.as_ptr()), - offsets, + std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8), std::ptr::NonNull::new(self.values.as_ptr() as *mut u8), ] } + + fn offset(&self) -> usize { + self.offset + } } impl FromFfi for Utf8Array { unsafe fn try_from_ffi(array: A) -> Result { - let validity = unsafe { array.validity() }?; - let offsets = unsafe { array.buffer::(0) }?; + let length = array.array().len(); + let offset = array.array().offset(); + let mut validity = unsafe { array.validity() }?; + let mut offsets = unsafe { array.buffer::(0) }?; let values = unsafe { array.buffer::(1)? }; + if offset > 0 { + offsets = offsets.slice(offset, length); + validity = validity.map(|x| x.slice(offset, length)) + } let data_type = Self::default_data_type(); Ok(Self::from_data_unchecked( data_type, offsets, values, validity, diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index ef895092dc9..1d6b75faec3 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -36,6 +36,7 @@ pub struct Utf8Array { offsets: Buffer, values: Buffer, validity: Option, + offset: usize, } impl Utf8Array { @@ -85,6 +86,7 @@ impl Utf8Array { offsets, values, validity, + offset: 0, } } @@ -126,6 +128,7 @@ impl Utf8Array { offsets, values, validity, + offset: 0, } } @@ -158,6 +161,7 @@ impl Utf8Array { offsets, values: self.values.clone(), validity, + offset: self.offset + offset, } } diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 3932a9d451d..c2f5e91fd9d 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -66,28 +66,23 @@ impl Bitmap { /// # Panic /// Panics iff `length <= bytes.len() * 8` #[inline] - pub(crate) fn from_bytes(bytes: Bytes, offset: usize, length: usize) -> Self { - assert!(offset + length <= bytes.len() * 8); - let null_count = count_zeros(&bytes, offset, length); + pub(crate) fn from_bytes(bytes: Bytes, length: usize) -> Self { + assert!(length <= bytes.len() * 8); + let null_count = count_zeros(&bytes, 0, length); Self { length, - offset, + offset: 0, bytes: Arc::new(bytes), null_count, } } - #[inline] - pub(crate) fn offset(&self) -> usize { - self.offset - } - /// Creates a new [`Bitmap`] from [`MutableBuffer`] and a length. /// # Panic /// Panics iff `length <= buffer.len() * 8` #[inline] pub fn from_u8_buffer(buffer: MutableBuffer, length: usize) -> Self { - Bitmap::from_bytes(buffer.into(), 0, length) + Bitmap::from_bytes(buffer.into(), length) } /// Creates a new [`Bitmap`] from a slice and length. diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index b7e8b8d3d2b..58631abde76 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -230,7 +230,7 @@ impl MutableBitmap { impl From for Bitmap { #[inline] fn from(buffer: MutableBitmap) -> Self { - Bitmap::from_bytes(buffer.buffer.into(), 0, buffer.length) + Bitmap::from_bytes(buffer.buffer.into(), buffer.length) } } @@ -238,7 +238,7 @@ impl From for Option { #[inline] fn from(buffer: MutableBitmap) -> Self { if buffer.null_count() > 0 { - Some(Bitmap::from_bytes(buffer.buffer.into(), 0, buffer.length)) + Some(Bitmap::from_bytes(buffer.buffer.into(), buffer.length)) } else { None } diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 861d121e96d..eab542e67e3 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -63,11 +63,11 @@ impl Buffer { } /// Auxiliary method to create a new Buffer - pub(crate) fn from_bytes(bytes: Bytes, offset: usize) -> Self { - let length = bytes.len() - offset; + pub(crate) fn from_bytes(bytes: Bytes) -> Self { + let length = bytes.len(); Buffer { data: Arc::new(bytes), - offset, + offset: 0, length, } } diff --git a/src/buffer/mutable.rs b/src/buffer/mutable.rs index b7790e8ec79..70c4e4415d0 100644 --- a/src/buffer/mutable.rs +++ b/src/buffer/mutable.rs @@ -431,7 +431,7 @@ impl> From

for MutableBuffer { impl From> for Buffer { #[inline] fn from(buffer: MutableBuffer) -> Self { - Self::from_bytes(buffer.into(), 0) + Self::from_bytes(buffer.into()) } } diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index f4960520594..c4b839b332f 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -96,12 +96,6 @@ impl Ffi_ArrowArray { /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. pub(crate) fn new(array: Arc) -> Self { - let offset = array - .validity() - .as_ref() - .map(|x| x.offset()) - .unwrap_or_default() as i64; - let (buffers, children, dictionary) = buffers_children_dictionary(array.as_ref()); let buffers_ptr = buffers @@ -136,10 +130,10 @@ impl Ffi_ArrowArray { Self { length, null_count, - offset, + offset: 0i64, n_buffers, n_children, - buffers: unsafe { private_data.buffers_ptr.as_mut_ptr() }, + buffers: private_data.buffers_ptr.as_mut_ptr(), children: private_data.children_ptr.as_mut_ptr(), dictionary: private_data.dictionary_ptr.unwrap_or(std::ptr::null_mut()), release: Some(c_release_array), @@ -163,6 +157,16 @@ impl Ffi_ArrowArray { } } + /// the length of the array + pub(crate) fn len(&self) -> usize { + self.length as usize + } + + /// the offset of the array + pub(crate) fn offset(&self) -> usize { + self.offset as usize + } + /// the null count of the array pub(crate) fn null_count(&self) -> usize { self.null_count as usize @@ -184,7 +188,6 @@ unsafe fn create_buffer( } let buffers = array.buffers as *mut *const u8; - let offset = buffer_offset(array, data_type, index); let len = buffer_len(array, data_type, index)?; assert!(index < array.n_buffers as usize); @@ -192,10 +195,10 @@ unsafe fn create_buffer( let ptr = NonNull::new(ptr as *mut T); let bytes = ptr - .map(|ptr| Bytes::new(ptr, offset + len, deallocation)) + .map(|ptr| Bytes::new(ptr, len, deallocation)) .ok_or_else(|| ArrowError::Ffi(format!("The buffer at position {} is null", index))); - bytes.map(|x| Buffer::from_bytes(x, offset)) + bytes.map(Buffer::from_bytes) } /// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer). @@ -214,8 +217,6 @@ unsafe fn create_bitmap( return Err(ArrowError::Ffi("The array buffers are null".to_string())); } let len = array.length as usize; - let offset = array.offset as usize; - let buffers = array.buffers as *mut *const u8; assert!(index < array.n_buffers as usize); @@ -232,19 +233,7 @@ unsafe fn create_bitmap( )) })?; - Ok(Bitmap::from_bytes(bytes, offset, len)) -} - -/// Returns the offset, in slots, of the buffer `i` (indexed according to the C data interface) -fn buffer_offset(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> usize { - match (data_type.to_physical_type(), i) { - (PhysicalType::Utf8, 2) - | (PhysicalType::Binary, 2) - | (PhysicalType::LargeUtf8, 2) - | (PhysicalType::LargeBinary, 2) => 0, - // buffer len of primitive types - _ => array.offset as usize, - } + Ok(Bitmap::from_bytes(bytes, len)) } /// Returns the length, in slots, of the buffer `i` (indexed according to the C data interface) @@ -266,24 +255,22 @@ fn buffer_len(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> Result< (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; - let offset = buffer_offset(array, data_type, 1); // first buffer is the null buffer => add(1) let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) }; // interpret as i32 let offset_buffer = offset_buffer as *const i32; // get last offset - (unsafe { *offset_buffer.add(offset).add(len - 1) }) as usize + (unsafe { *offset_buffer.add(len - 1) }) as usize } (PhysicalType::LargeUtf8, 2) | (PhysicalType::LargeBinary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; - let offset = buffer_offset(array, data_type, 1); // first buffer is the null buffer => add(1) let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) }; // interpret as i64 let offset_buffer = offset_buffer as *const i64; // get last offset - (unsafe { *offset_buffer.add(offset).add(len - 1) }) as usize + (unsafe { *offset_buffer.add(len - 1) }) as usize } // buffer len of primitive types _ => array.length as usize, diff --git a/src/io/ipc/read/read_basic.rs b/src/io/ipc/read/read_basic.rs index 85975565959..33c01a67d1a 100644 --- a/src/io/ipc/read/read_basic.rs +++ b/src/io/ipc/read/read_basic.rs @@ -231,7 +231,7 @@ pub fn read_bitmap( read_uncompressed_bitmap(length, bytes, reader) }?; - Ok(Bitmap::from_bytes(buffer.into(), 0, length)) + Ok(Bitmap::from_bytes(buffer.into(), length)) } pub fn read_validity( diff --git a/tests/it/ffi.rs b/tests/it/ffi.rs index c7912d6cd81..facc143988c 100644 --- a/tests/it/ffi.rs +++ b/tests/it/ffi.rs @@ -1,12 +1,13 @@ use arrow2::array::*; -use arrow2::bitmap::Bitmap; use arrow2::datatypes::{DataType, Field, TimeUnit}; use arrow2::{error::Result, ffi}; use std::collections::BTreeMap; use std::sync::Arc; -fn _test_round_trip(array: Arc, expected: Box) -> Result<()> { +fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { + let array: Arc = Arc::new(expected.clone()); let field = Field::new("a", array.data_type().clone(), true); + let expected = Box::new(expected) as Box; let array_ptr = Box::new(ffi::Ffi_ArrowArray::empty()); let schema_ptr = Box::new(ffi::Ffi_ArrowSchema::empty()); @@ -31,15 +32,6 @@ fn _test_round_trip(array: Arc, expected: Box) -> Result<( Ok(()) } -fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { - let array: Arc = Arc::new(expected.clone()); - let expected = Box::new(expected) as Box; - _test_round_trip(array.clone(), clone(expected.as_ref()))?; - - // sliced - _test_round_trip(array.slice(1, 2).into(), expected.slice(1, 2)) -} - fn test_round_trip_schema(field: Field) -> Result<()> { // create a `ArrowArray` from the data. let schema_ptr = Box::new(ffi::Ffi_ArrowSchema::empty()); @@ -146,17 +138,6 @@ fn list_list() -> Result<()> { test_round_trip(array) } -#[test] -fn struct_() -> Result<()> { - let data_type = DataType::Struct(vec![Field::new("a", DataType::Int32, true)]); - let values = vec![Arc::new(Int32Array::from([Some(1), None, Some(3)])) as Arc]; - let validity = Bitmap::from([true, false, true]); - - let array = StructArray::from_data(data_type, values, validity.into()); - - test_round_trip(array) -} - #[test] fn dict() -> Result<()> { let data = vec![Some("a"), Some("a"), None, Some("b")];