Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fixed ffi of sliced arrays (#540)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Oct 23, 2021
1 parent b424e90 commit e6b6c83
Show file tree
Hide file tree
Showing 33 changed files with 270 additions and 254 deletions.
81 changes: 73 additions & 8 deletions arrow-pyarrow-integration-testing/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,52 @@ def tearDown(self):
# No leak of C++ memory
self.assertEqual(self.old_allocated_cpp, pyarrow.total_allocated_bytes())

def test_string_roundtrip(self):
"""
Python -> Rust -> Python
"""
def test_primitive(self):
a = pyarrow.array([0, None, 2, 3, 4])
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_primitive_sliced(self):
a = pyarrow.array([0, None, 2, 3, 4]).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_boolean(self):
a = pyarrow.array([True, None, False, True, False])
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_boolean_sliced(self):
a = pyarrow.array([True, None, False, True, False]).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_string(self):
a = pyarrow.array(["a", None, "ccc"])
b = arrow_pyarrow_integration_testing.round_trip_array(a)
c = pyarrow.array(["a", None, "ccc"])
self.assertEqual(b, c)

def test_string_sliced(self):
a = pyarrow.array(["a", None, "ccc"]).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_decimal_roundtrip(self):
"""
Python -> Rust -> Python
Expand Down Expand Up @@ -75,10 +112,17 @@ def test_list_array(self):
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_struct_array(self):
"""
Python -> Rust -> Python
"""
def test_list_sliced(self):
a = pyarrow.array(
[[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64())
).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_struct(self):
fields = [
("f1", pyarrow.int32()),
("f2", pyarrow.string()),
Expand All @@ -99,6 +143,27 @@ def test_struct_array(self):
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

# see https://issues.apache.org/jira/browse/ARROW-14383
def _test_struct_sliced(self):
fields = [
("f1", pyarrow.int32()),
("f2", pyarrow.string()),
]
a = pyarrow.array(
[
{"f1": 1, "f2": "a"},
None,
{"f1": 3, "f2": None},
{"f1": None, "f2": "d"},
{"f1": None, "f2": None},
],
pyarrow.struct(fields),
).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)
b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_list_list_array(self):
"""
Python -> Rust -> Python
Expand Down
11 changes: 0 additions & 11 deletions src/array/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,6 @@ This document describes the overall design of this module.
* `from_trusted_len_values_iter` from an iterator of trusted len of values
* `try_from_trusted_len_iter` from an fallible iterator of trusted len of optional values

### Slot offsets

* An array MUST have a `offset: usize` measuring the number of slots that the array is currently offsetted by if the specification requires.

* An array MUST implement `fn slice(&self, offset: usize, length: usize) -> Self` that returns an offseted and/or truncated clone of the array. This function MUST increase the array's offset if it exists.

* Conversely, `offset` MUST only be changed by `slice`.

The rational of the above is that it enable us to be fully interoperable with the offset logic supported by the C data interface, while at the same time easily perform array slices
within Rust's type safety mechanism.

### Mutable Arrays

* An array MAY have a mutable counterpart. E.g. `MutablePrimitiveArray<T>` is the mutable counterpart of `PrimitiveArray<T>`.
Expand Down
28 changes: 13 additions & 15 deletions src/array/binary/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,22 @@ use super::BinaryArray;

unsafe impl<O: Offset> ToFfi for BinaryArray<O> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
let offset = self
.validity
.as_ref()
.map(|x| x.offset())
.unwrap_or_default();

let offsets = std::ptr::NonNull::new(unsafe {
self.offsets.as_ptr().offset(-(offset as isize)) as *mut u8
});

vec![
self.validity.as_ref().map(|x| x.as_ptr()),
std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8),
offsets,
std::ptr::NonNull::new(self.values.as_ptr() as *mut u8),
]
}

#[inline]
fn offset(&self) -> usize {

This comment has been minimized.

Copy link
@ritchie46

ritchie46 Oct 26, 2021

Collaborator

Was this intended? If so, what has changed when slicing a list array?

self.offset
}
}

impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryArray<O> {
Expand All @@ -33,17 +38,10 @@ impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryArray<O> {
};
assert_eq!(data_type, expected);

let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
let mut offsets = unsafe { array.buffer::<O>(0) }?;
let validity = unsafe { array.validity() }?;
let offsets = unsafe { array.buffer::<O>(0) }?;
let values = unsafe { array.buffer::<u8>(1) }?;

if offset > 0 {
offsets = offsets.slice(offset, length);
validity = validity.map(|x| x.slice(offset, length))
}

Ok(Self::from_data_unchecked(
Self::default_data_type(),
offsets,
Expand Down
4 changes: 0 additions & 4 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ pub struct BinaryArray<O: Offset> {
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
offset: usize,
}

// constructors
Expand Down Expand Up @@ -71,7 +70,6 @@ impl<O: Offset> BinaryArray<O> {
offsets,
values,
validity,
offset: 0,
}
}

Expand Down Expand Up @@ -113,7 +111,6 @@ impl<O: Offset> BinaryArray<O> {
offsets,
values,
validity,
offset: 0,
}
}

Expand Down Expand Up @@ -146,7 +143,6 @@ impl<O: Offset> BinaryArray<O> {
offsets,
values: self.values.clone(),
validity,
offset: self.offset + offset,
}
}

Expand Down
22 changes: 10 additions & 12 deletions src/array/boolean/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,28 @@ use super::BooleanArray;

unsafe impl ToFfi for BooleanArray {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
let offset = self
.validity
.as_ref()
.map(|x| x.offset())
.unwrap_or_default();

assert!(self.values.offset() >= offset);
vec![
self.validity.as_ref().map(|x| x.as_ptr()),
Some(self.values.as_ptr()),
]
}

fn offset(&self) -> usize {
self.offset
}
}

impl<A: ffi::ArrowArrayRef> FromFfi<A> for BooleanArray {
unsafe fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.field().data_type().clone();
assert_eq!(data_type, DataType::Boolean);
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
let mut values = unsafe { array.bitmap(0) }?;

if offset > 0 {
values = values.slice(offset, length);
validity = validity.map(|x| x.slice(offset, length))
}
let validity = unsafe { array.validity() }?;
let values = unsafe { array.bitmap(0) }?;

Ok(Self::from_data(data_type, values, validity))
}
}
9 changes: 6 additions & 3 deletions src/array/boolean/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ pub struct BooleanArray {
data_type: DataType,
values: Bitmap,
validity: Option<Bitmap>,
offset: usize,
}

impl BooleanArray {
Expand All @@ -46,11 +45,13 @@ impl BooleanArray {
if data_type.to_physical_type() != PhysicalType::Boolean {
panic!("BooleanArray can only be initialized with DataType::Boolean")
}
if matches!(&validity, Some(bitmap) if bitmap.offset() != values.offset()) {
panic!("validity's bitmap offset must equal values offset")
}
Self {
data_type,
values,
validity,
offset: 0,
}
}

Expand Down Expand Up @@ -83,7 +84,6 @@ impl BooleanArray {
data_type: self.data_type.clone(),
values: self.values.clone().slice_unchecked(offset, length),
validity,
offset: self.offset + offset,
}
}

Expand All @@ -94,6 +94,9 @@ impl BooleanArray {
if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) {
panic!("validity should be as least as large as the array")
}
if matches!(&validity, Some(bitmap) if bitmap.offset() != self.values.offset()) {
panic!("validity's bitmap offset must equal values offset")
}
let mut arr = self.clone();
arr.validity = validity;
arr
Expand Down
15 changes: 2 additions & 13 deletions src/array/dictionary/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,14 @@ unsafe impl<K: DictionaryKey> ToFfi for DictionaryArray<K> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
self.keys.buffers()
}

#[inline]
fn offset(&self) -> usize {
self.offset
}
}

impl<K: DictionaryKey, A: ffi::ArrowArrayRef> FromFfi<A> for DictionaryArray<K> {
unsafe fn try_from_ffi(array: A) -> Result<Self> {
// keys: similar to PrimitiveArray, but the datatype is the inner one
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
let mut values = unsafe { array.buffer::<K>(0) }?;
let validity = unsafe { array.validity() }?;
let values = unsafe { array.buffer::<K>(0) }?;

if offset > 0 {
values = values.slice(offset, length);
validity = validity.map(|x| x.slice(offset, length))
}
let keys = PrimitiveArray::<K>::from_data(K::DATA_TYPE, values, validity);
// values
let values = array.dictionary()?.unwrap();
Expand Down
4 changes: 0 additions & 4 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ pub struct DictionaryArray<K: DictionaryKey> {
data_type: DataType,
keys: PrimitiveArray<K>,
values: Arc<dyn Array>,
offset: usize,
}

impl<K: DictionaryKey> DictionaryArray<K> {
Expand Down Expand Up @@ -69,7 +68,6 @@ impl<K: DictionaryKey> DictionaryArray<K> {
data_type,
keys,
values,
offset: 0,
}
}

Expand All @@ -81,7 +79,6 @@ impl<K: DictionaryKey> DictionaryArray<K> {
data_type: self.data_type.clone(),
keys: self.keys.clone().slice(offset, length),
values: self.values.clone(),
offset: self.offset + offset,
}
}

Expand All @@ -93,7 +90,6 @@ impl<K: DictionaryKey> DictionaryArray<K> {
data_type: self.data_type.clone(),
keys: self.keys.clone().slice_unchecked(offset, length),
values: self.values.clone(),
offset: self.offset + offset,
}
}

Expand Down
3 changes: 0 additions & 3 deletions src/array/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ pub unsafe trait ToFfi {
/// The pointers to the buffers.
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>>;

/// The offset
fn offset(&self) -> usize;

/// The children
fn children(&self) -> Vec<Arc<dyn Array>> {
vec![]
Expand Down
23 changes: 23 additions & 0 deletions src/array/fixed_size_binary/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
use crate::array::ffi::ToFfi;

use super::FixedSizeBinaryArray;

unsafe impl ToFfi for FixedSizeBinaryArray {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
let offset = self
.validity
.as_ref()
.map(|x| x.offset())
.unwrap_or_default();
let offset = offset * self.size();

// note that this may point to _before_ the pointer. This is fine because the C data interface
// requires users to only access past the offset
let values =
std::ptr::NonNull::new(
unsafe { self.values.as_ptr().offset(-(offset as isize)) } as *mut u8
);

vec![self.validity.as_ref().map(|x| x.as_ptr()), values]
}
}
Loading

0 comments on commit e6b6c83

Please sign in to comment.