Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Simplified offset handling.
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Oct 19, 2021
1 parent 2e7d38d commit 71c6f8f
Show file tree
Hide file tree
Showing 25 changed files with 128 additions and 129 deletions.
71 changes: 59 additions & 12 deletions arrow-pyarrow-integration-testing/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,46 @@ def tearDown(self):
# No leak of C++ memory
self.assertEqual(self.old_allocated_cpp, pyarrow.total_allocated_bytes())

def test_primitive(self):
a = pyarrow.array([0, None, 2, 3, 4])
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_primitive_sliced(self):
a = pyarrow.array([0, None, 2, 3, 4]).slice(1, 1)
a = pyarrow.array([0, None, 2, 3, 4]).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
print(a.to_pylist())
print(b.to_pylist())
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_string_roundtrip(self):
"""
Python -> Rust -> Python
"""
def test_boolean(self):
a = pyarrow.array([True, None, False, True, False])
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_boolean_sliced(self):
a = pyarrow.array([True, None, False, True, False]).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_string(self):
a = pyarrow.array(["a", None, "ccc"])
b = arrow_pyarrow_integration_testing.round_trip_array(a)
c = pyarrow.array(["a", None, "ccc"])
self.assertEqual(b, c)

def test_string_sliced(self):
a = pyarrow.array(["a", None, "ccc"]).slice(1, 1)
a = pyarrow.array(["a", None, "ccc"]).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
Expand Down Expand Up @@ -93,10 +112,17 @@ def test_list_array(self):
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_struct_array(self):
"""
Python -> Rust -> Python
"""
def test_list_sliced(self):
a = pyarrow.array(
[[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64())
).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_struct(self):
fields = [
("f1", pyarrow.int32()),
("f2", pyarrow.string()),
Expand All @@ -117,6 +143,27 @@ def test_struct_array(self):
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

# see https://issues.apache.org/jira/browse/ARROW-14383
def _test_struct_sliced(self):
fields = [
("f1", pyarrow.int32()),
("f2", pyarrow.string()),
]
a = pyarrow.array(
[
{"f1": 1, "f2": "a"},
None,
{"f1": 3, "f2": None},
{"f1": None, "f2": "d"},
{"f1": None, "f2": None},
],
pyarrow.struct(fields),
).slice(1, 2)
b = arrow_pyarrow_integration_testing.round_trip_array(a)
b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_list_list_array(self):
"""
Python -> Rust -> Python
Expand Down
11 changes: 0 additions & 11 deletions src/array/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,6 @@ This document describes the overall design of this module.
* `from_trusted_len_values_iter` from an iterator of trusted len of values
* `try_from_trusted_len_iter` from an fallible iterator of trusted len of optional values

### Slot offsets

* An array MUST have a `offset: usize` measuring the number of slots that the array is currently offsetted by if the specification requires.

* An array MUST implement `fn slice(&self, offset: usize, length: usize) -> Self` that returns an offseted and/or truncated clone of the array. This function MUST increase the array's offset if it exists.

* Conversely, `offset` MUST only be changed by `slice`.

The rational of the above is that it enable us to be fully interoperable with the offset logic supported by the C data interface, while at the same time easily perform array slices
within Rust's type safety mechanism.

### Mutable Arrays

* An array MAY have a mutable counterpart. E.g. `MutablePrimitiveArray<T>` is the mutable counterpart of `PrimitiveArray<T>`.
Expand Down
16 changes: 10 additions & 6 deletions src/array/binary/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,21 @@ use super::BinaryArray;

unsafe impl<O: Offset> ToFfi for BinaryArray<O> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
let offset = self
.validity
.as_ref()
.map(|x| x.offset())
.unwrap_or_default();

assert!(self.offsets.offset() >= offset);
vec![
self.validity.as_ref().map(|x| x.as_ptr()),
std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8),
std::ptr::NonNull::new(unsafe {
self.offsets.as_ptr().offset(-(offset as isize)) as *mut u8
}),
std::ptr::NonNull::new(self.values.as_ptr() as *mut u8),
]
}

#[inline]
fn offset(&self) -> usize {
self.offset
}
}

impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryArray<O> {
Expand Down
4 changes: 0 additions & 4 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ pub struct BinaryArray<O: Offset> {
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
offset: usize,
}

// constructors
Expand Down Expand Up @@ -71,7 +70,6 @@ impl<O: Offset> BinaryArray<O> {
offsets,
values,
validity,
offset: 0,
}
}

Expand Down Expand Up @@ -113,7 +111,6 @@ impl<O: Offset> BinaryArray<O> {
offsets,
values,
validity,
offset: 0,
}
}

Expand Down Expand Up @@ -146,7 +143,6 @@ impl<O: Offset> BinaryArray<O> {
offsets,
values: self.values.clone(),
validity,
offset: self.offset + offset,
}
}

Expand Down
11 changes: 7 additions & 4 deletions src/array/boolean/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,18 @@ use super::BooleanArray;

unsafe impl ToFfi for BooleanArray {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
let offset = self
.validity
.as_ref()
.map(|x| x.offset())
.unwrap_or_default();

assert!(self.values.offset() >= offset);
vec![
self.validity.as_ref().map(|x| x.as_ptr()),
Some(self.values.as_ptr()),
]
}

fn offset(&self) -> usize {
self.offset
}
}

impl<A: ffi::ArrowArrayRef> FromFfi<A> for BooleanArray {
Expand Down
3 changes: 0 additions & 3 deletions src/array/boolean/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ pub struct BooleanArray {
data_type: DataType,
values: Bitmap,
validity: Option<Bitmap>,
offset: usize,
}

impl BooleanArray {
Expand Down Expand Up @@ -50,7 +49,6 @@ impl BooleanArray {
data_type,
values,
validity,
offset: 0,
}
}

Expand Down Expand Up @@ -83,7 +81,6 @@ impl BooleanArray {
data_type: self.data_type.clone(),
values: self.values.clone().slice_unchecked(offset, length),
validity,
offset: self.offset + offset,
}
}

Expand Down
5 changes: 0 additions & 5 deletions src/array/dictionary/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@ unsafe impl<K: DictionaryKey> ToFfi for DictionaryArray<K> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
self.keys.buffers()
}

#[inline]
fn offset(&self) -> usize {
self.offset
}
}

impl<K: DictionaryKey, A: ffi::ArrowArrayRef> FromFfi<A> for DictionaryArray<K> {
Expand Down
4 changes: 0 additions & 4 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ pub struct DictionaryArray<K: DictionaryKey> {
data_type: DataType,
keys: PrimitiveArray<K>,
values: Arc<dyn Array>,
offset: usize,
}

impl<K: DictionaryKey> DictionaryArray<K> {
Expand Down Expand Up @@ -69,7 +68,6 @@ impl<K: DictionaryKey> DictionaryArray<K> {
data_type,
keys,
values,
offset: 0,
}
}

Expand All @@ -81,7 +79,6 @@ impl<K: DictionaryKey> DictionaryArray<K> {
data_type: self.data_type.clone(),
keys: self.keys.clone().slice(offset, length),
values: self.values.clone(),
offset: self.offset + offset,
}
}

Expand All @@ -93,7 +90,6 @@ impl<K: DictionaryKey> DictionaryArray<K> {
data_type: self.data_type.clone(),
keys: self.keys.clone().slice_unchecked(offset, length),
values: self.values.clone(),
offset: self.offset + offset,
}
}

Expand Down
3 changes: 0 additions & 3 deletions src/array/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ pub unsafe trait ToFfi {
/// The pointers to the buffers.
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>>;

/// The offset
fn offset(&self) -> usize;

/// The children
fn children(&self) -> Vec<Arc<dyn Array>> {
vec![]
Expand Down
20 changes: 12 additions & 8 deletions src/array/fixed_size_binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ pub struct FixedSizeBinaryArray {
data_type: DataType,
values: Buffer<u8>,
validity: Option<Bitmap>,
offset: usize,
}

impl FixedSizeBinaryArray {
Expand Down Expand Up @@ -47,7 +46,6 @@ impl FixedSizeBinaryArray {
data_type,
values,
validity,
offset: 0,
}
}

Expand Down Expand Up @@ -83,7 +81,6 @@ impl FixedSizeBinaryArray {
size: self.size,
values,
validity,
offset: self.offset + offset,
}
}

Expand Down Expand Up @@ -184,15 +181,22 @@ impl std::fmt::Display for FixedSizeBinaryArray {

unsafe impl ToFfi for FixedSizeBinaryArray {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
let offset = self
.validity
.as_ref()
.map(|x| x.offset())
.unwrap_or_default();
let offset = offset * self.size();

assert!(self.values.offset() >= offset);

vec![
self.validity.as_ref().map(|x| x.as_ptr()),
std::ptr::NonNull::new(self.values.as_ptr() as *mut u8),
std::ptr::NonNull::new(
unsafe { self.values.as_ptr().offset(-(offset as isize)) } as *mut u8
),
]
}

fn offset(&self) -> usize {
self.offset
}
}

impl FixedSizeBinaryArray {
Expand Down
7 changes: 0 additions & 7 deletions src/array/fixed_size_list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ pub struct FixedSizeListArray {
data_type: DataType,
values: Arc<dyn Array>,
validity: Option<Bitmap>,
offset: usize,
}

impl FixedSizeListArray {
Expand Down Expand Up @@ -60,7 +59,6 @@ impl FixedSizeListArray {
data_type,
values,
validity,
offset: 0,
}
}

Expand Down Expand Up @@ -97,7 +95,6 @@ impl FixedSizeListArray {
size: self.size,
values,
validity,
offset: self.offset + offset,
}
}

Expand Down Expand Up @@ -200,10 +197,6 @@ unsafe impl ToFfi for FixedSizeListArray {
vec![self.validity.as_ref().map(|x| x.as_ptr())]
}

fn offset(&self) -> usize {
self.offset
}

fn children(&self) -> Vec<Arc<dyn Array>> {
vec![self.values().clone()]
}
Expand Down
16 changes: 11 additions & 5 deletions src/array/list/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@ use super::ListArray;

unsafe impl<O: Offset> ToFfi for ListArray<O> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
let offset = self
.validity
.as_ref()
.map(|x| x.offset())
.unwrap_or_default();

assert!(self.offsets.offset() >= offset);

vec![
self.validity.as_ref().map(|x| x.as_ptr()),
std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8),
std::ptr::NonNull::new(
unsafe { self.offsets.as_ptr().offset(-(offset as isize)) } as *mut u8
),
]
}

fn offset(&self) -> usize {
self.offset
}

fn children(&self) -> Vec<Arc<dyn Array>> {
vec![self.values.clone()]
}
Expand Down
Loading

0 comments on commit 71c6f8f

Please sign in to comment.