From 599b8ac312b67f9d353ba63f50a6407c5626cedc Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Tue, 2 Nov 2021 16:32:37 +0000 Subject: [PATCH] Added ffi support for FixedSizeList and FixedSizeBinary --- .../tests/test_sql.py | 37 +++++++++++++++++++ src/array/binary/ffi.rs | 4 +- src/array/boolean/ffi.rs | 2 +- src/array/dictionary/ffi.rs | 2 +- src/array/ffi.rs | 2 +- src/array/fixed_size_binary/ffi.rs | 17 ++++++++- src/array/fixed_size_list/ffi.rs | 20 +++++++++- src/array/list/ffi.rs | 2 +- src/array/list/mutable.rs | 18 ++++----- src/array/map/ffi.rs | 2 +- src/array/primitive/ffi.rs | 2 +- src/array/union/ffi.rs | 4 +- src/array/utf8/ffi.rs | 4 +- src/ffi/array.rs | 2 + src/ffi/ffi.rs | 31 ++++++++++++++-- src/ffi/schema.rs | 17 ++++++++- tests/it/ffi.rs | 16 ++++++++ 17 files changed, 154 insertions(+), 28 deletions(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 27da0e63515..5fdba7b0cf1 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -85,6 +85,14 @@ def test_string_sliced(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type + def test_fixed_binary(self): + a = pyarrow.array([b"aa", None, b"cc"], pyarrow.binary(2)) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + def test_decimal_roundtrip(self): """ Python -> Rust -> Python @@ -179,6 +187,35 @@ def test_list_list_array(self): assert a.to_pylist() == b.to_pylist() assert a.type == b.type + def test_fixed_list(self): + """ + Python -> Rust -> Python + """ + a = pyarrow.array( + [None, [1, 2], [4, 5]], + pyarrow.list_(pyarrow.int64(), 2), + ) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + + # same as https://issues.apache.org/jira/browse/ARROW-14383 + def _test_fixed_list_sliced(self): + """ + Python -> Rust -> Python + """ + a = pyarrow.array( + [None, [1, 2], [4, 5]], + pyarrow.list_(pyarrow.int64(), 2), + ).slice(1, 2) + b = arrow_pyarrow_integration_testing.round_trip_array(a) + + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + def test_dict(self): """ Python -> Rust -> Python diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 2e4e94f3858..2cd1fb2e628 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -55,8 +55,8 @@ impl FromFfi for BinaryArray { let data_type = array.field().data_type().clone(); let validity = unsafe { array.validity() }?; - let offsets = unsafe { array.buffer::(0) }?; - let values = unsafe { array.buffer::(1) }?; + let offsets = unsafe { array.buffer::(1) }?; + let values = unsafe { array.buffer::(2) }?; Ok(Self::from_data_unchecked( data_type, offsets, values, validity, diff --git a/src/array/boolean/ffi.rs b/src/array/boolean/ffi.rs index c042d538aab..f07d2d20263 100644 --- a/src/array/boolean/ffi.rs +++ b/src/array/boolean/ffi.rs @@ -52,7 +52,7 @@ impl FromFfi for BooleanArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); let validity = unsafe { array.validity() }?; - let values = unsafe { array.bitmap(0) }?; + let values = unsafe { array.bitmap(1) }?; Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/dictionary/ffi.rs b/src/array/dictionary/ffi.rs index a32d88b3c87..10c094b3931 100644 --- a/src/array/dictionary/ffi.rs +++ b/src/array/dictionary/ffi.rs @@ -28,7 +28,7 @@ impl FromFfi for DictionaryArray unsafe fn try_from_ffi(array: A) -> Result { // keys: similar to PrimitiveArray, but the datatype is the inner one let validity = unsafe { array.validity() }?; - let values = unsafe { array.buffer::(0) }?; + let values = unsafe { array.buffer::(1) }?; let keys = PrimitiveArray::::from_data(K::DATA_TYPE, values, validity); let values = array.dictionary()?.unwrap(); diff --git a/src/array/ffi.rs b/src/array/ffi.rs index a104403ecac..2fbca0493d5 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -27,7 +27,7 @@ pub(crate) unsafe trait ToFfi { /// Trait describing how a struct imports into itself from the /// [C data interface](https://arrow.apache.org/docs/format/CDataInterface.html) (FFI). -pub trait FromFfi: Sized { +pub(crate) trait FromFfi: Sized { /// Convert itself from FFI. /// # Safety /// This function is intrinsically `unsafe` as it requires the FFI to be made according diff --git a/src/array/fixed_size_binary/ffi.rs b/src/array/fixed_size_binary/ffi.rs index ce9fbc98fb5..4d371c44643 100644 --- a/src/array/fixed_size_binary/ffi.rs +++ b/src/array/fixed_size_binary/ffi.rs @@ -1,4 +1,9 @@ -use crate::{array::ToFfi, bitmap::align}; +use crate::{ + array::{FromFfi, ToFfi}, + bitmap::align, + error::Result, + ffi, +}; use super::FixedSizeBinaryArray; @@ -42,3 +47,13 @@ unsafe impl ToFfi for FixedSizeBinaryArray { } } } + +impl FromFfi for FixedSizeBinaryArray { + unsafe fn try_from_ffi(array: A) -> Result { + let data_type = array.field().data_type().clone(); + let validity = unsafe { array.validity() }?; + let values = unsafe { array.buffer::(1) }?; + + Ok(Self::from_data(data_type, values, validity)) + } +} diff --git a/src/array/fixed_size_list/ffi.rs b/src/array/fixed_size_list/ffi.rs index 760879c766b..1c2a2063e6d 100644 --- a/src/array/fixed_size_list/ffi.rs +++ b/src/array/fixed_size_list/ffi.rs @@ -1,7 +1,14 @@ use std::sync::Arc; -use super::super::{ffi::ToFfi, Array}; use super::FixedSizeListArray; +use crate::{ + array::{ + ffi::{FromFfi, ToFfi}, + Array, + }, + error::Result, + ffi, +}; unsafe impl ToFfi for FixedSizeListArray { fn buffers(&self) -> Vec>> { @@ -25,3 +32,14 @@ unsafe impl ToFfi for FixedSizeListArray { self.clone() } } + +impl FromFfi for FixedSizeListArray { + unsafe fn try_from_ffi(array: A) -> Result { + let data_type = array.field().data_type().clone(); + let validity = unsafe { array.validity() }?; + let child = unsafe { array.child(0)? }; + let values = ffi::try_from(child)?.into(); + + Ok(Self::from_data(data_type, values, validity)) + } +} diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 1f511730ac0..b22c4073a3d 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -54,7 +54,7 @@ impl FromFfi for ListArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); let validity = unsafe { array.validity() }?; - let offsets = unsafe { array.buffer::(0) }?; + let offsets = unsafe { array.buffer::(1) }?; let child = unsafe { array.child(0)? }; let values = ffi::try_from(child)?.into(); diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 8f4e36460cf..1c9b258aa4e 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -42,15 +42,6 @@ impl MutableListArray { validity: None, } } - - /// Shrinks the capacity of the [`MutableListArray`] to fit its current length. - pub fn shrink_to_fit(&mut self) { - self.values.shrink_to_fit(); - self.offsets.shrink_to_fit(); - if let Some(validity) = &mut self.validity { - validity.shrink_to_fit() - } - } } impl Default for MutableListArray { @@ -188,6 +179,15 @@ impl MutableListArray { let a: ListArray = self.into(); Arc::new(a) } + + /// Shrinks the capacity of the [`MutableListArray`] to fit its current length. + pub fn shrink_to_fit(&mut self) { + self.values.shrink_to_fit(); + self.offsets.shrink_to_fit(); + if let Some(validity) = &mut self.validity { + validity.shrink_to_fit() + } + } } impl MutableArray for MutableListArray { diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index 12a56f5251d..0ab6881a78b 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -54,7 +54,7 @@ impl FromFfi for MapArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); let validity = unsafe { array.validity() }?; - let offsets = unsafe { array.buffer::(0) }?; + let offsets = unsafe { array.buffer::(1) }?; let child = array.child(0)?; let values = ffi::try_from(child)?.into(); diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index 0e8b3e74a10..1c35009c739 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -53,7 +53,7 @@ impl FromFfi for PrimitiveArray { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); let validity = unsafe { array.validity() }?; - let values = unsafe { array.buffer::(0) }?; + let values = unsafe { array.buffer::(1) }?; Ok(Self::from_data(data_type, values, validity)) } diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index fbab5dd1596..655ed43eabb 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -37,11 +37,11 @@ impl FromFfi for UnionArray { let data_type = field.data_type().clone(); let fields = Self::get_fields(field.data_type()); - let mut types = unsafe { array.buffer::(0) }?; + let mut types = unsafe { array.buffer::(1) }?; let offsets = if Self::is_sparse(&data_type) { None } else { - Some(unsafe { array.buffer::(1) }?) + Some(unsafe { array.buffer::(2) }?) }; let length = array.array().len(); diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 59b1a60ce95..8f5962fa5ea 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -53,8 +53,8 @@ impl FromFfi for Utf8Array { unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.field().data_type().clone(); let validity = unsafe { array.validity() }?; - let offsets = unsafe { array.buffer::(0) }?; - let values = unsafe { array.buffer::(1)? }; + let offsets = unsafe { array.buffer::(1) }?; + let values = unsafe { array.buffer::(2)? }; Ok(Self::from_data_unchecked( data_type, offsets, values, validity, diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 5b93301fdc0..fb2c6cd4446 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -21,8 +21,10 @@ pub unsafe fn try_from(array: A) -> Result> { LargeUtf8 => Box::new(Utf8Array::::try_from_ffi(array)?), Binary => Box::new(BinaryArray::::try_from_ffi(array)?), LargeBinary => Box::new(BinaryArray::::try_from_ffi(array)?), + FixedSizeBinary => Box::new(FixedSizeBinaryArray::try_from_ffi(array)?), List => Box::new(ListArray::::try_from_ffi(array)?), LargeList => Box::new(ListArray::::try_from_ffi(array)?), + FixedSizeList => Box::new(FixedSizeListArray::try_from_ffi(array)?), Struct => Box::new(StructArray::try_from_ffi(array)?), Dictionary(key_type) => { with_match_physical_dictionary_key_type!(key_type, |$T| { diff --git a/src/ffi/ffi.rs b/src/ffi/ffi.rs index 33ffadc7245..f05e30de1c6 100644 --- a/src/ffi/ffi.rs +++ b/src/ffi/ffi.rs @@ -252,6 +252,20 @@ fn buffer_offset(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> usiz // to fetch offset buffer's len to build the second buffer. fn buffer_len(array: &Ffi_ArrowArray, data_type: &DataType, i: usize) -> Result { Ok(match (data_type.to_physical_type(), i) { + (PhysicalType::FixedSizeBinary, 1) => { + if let DataType::FixedSizeBinary(size) = data_type.to_logical_type() { + *size * (array.offset as usize + array.length as usize) + } else { + unreachable!() + } + } + (PhysicalType::FixedSizeList, 1) => { + if let DataType::FixedSizeList(_, size) = data_type.to_logical_type() { + *size * (array.offset as usize + array.length as usize) + } else { + unreachable!() + } + } (PhysicalType::Utf8, 1) | (PhysicalType::LargeUtf8, 1) | (PhysicalType::Binary, 1) @@ -321,7 +335,7 @@ fn create_dictionary( } } -pub trait ArrowArrayRef { +pub trait ArrowArrayRef: std::fmt::Debug { fn deallocation(&self) -> Deallocation { Deallocation::Foreign(self.parent().clone()) } @@ -344,12 +358,11 @@ pub trait ArrowArrayRef { /// The caller must guarantee that the buffer `index` corresponds to a bitmap. /// This function assumes that the bitmap created from FFI is valid; this is impossible to prove. unsafe fn buffer(&self, index: usize) -> Result> { - // +1 to ignore null bitmap create_buffer::( self.array(), self.field().data_type(), self.deallocation(), - index + 1, + index, ) } @@ -358,7 +371,7 @@ pub trait ArrowArrayRef { /// This function assumes that the bitmap created from FFI is valid; this is impossible to prove. unsafe fn bitmap(&self, index: usize) -> Result { // +1 to ignore null bitmap - create_bitmap(self.array(), self.deallocation(), index + 1) + create_bitmap(self.array(), self.deallocation(), index) } /// # Safety @@ -371,6 +384,8 @@ pub trait ArrowArrayRef { create_dictionary(self.array(), self.field(), self.parent().clone()) } + fn n_buffers(&self) -> usize; + fn parent(&self) -> &Arc; fn array(&self) -> &Ffi_ArrowArray; fn field(&self) -> &Field; @@ -420,6 +435,10 @@ impl ArrowArrayRef for Arc { fn array(&self) -> &Ffi_ArrowArray { self.array.as_ref() } + + fn n_buffers(&self) -> usize { + self.array.n_buffers as usize + } } #[derive(Debug)] @@ -442,6 +461,10 @@ impl<'a> ArrowArrayRef for ArrowArrayChild<'a> { fn array(&self) -> &Ffi_ArrowArray { self.array } + + fn n_buffers(&self) -> usize { + self.array.n_buffers as usize + } } impl<'a> ArrowArrayChild<'a> { diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index 5faf4f37188..ee5cb0fef1d 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -63,6 +63,9 @@ impl Ffi_ArrowSchema { DataType::List(field) => { vec![Box::new(Ffi_ArrowSchema::new(field.as_ref()))] } + DataType::FixedSizeList(field, _) => { + vec![Box::new(Ffi_ArrowSchema::new(field.as_ref()))] + } DataType::LargeList(field) => { vec![Box::new(Ffi_ArrowSchema::new(field.as_ref()))] } @@ -290,6 +293,17 @@ unsafe fn to_data_type(schema: &Ffi_ArrowSchema) -> Result { DataType::Timestamp(TimeUnit::Microsecond, Some(parts[1].to_string())) } else if parts.len() == 2 && parts[0] == "tsn" { DataType::Timestamp(TimeUnit::Nanosecond, Some(parts[1].to_string())) + } else if parts.len() == 2 && parts[0] == "w" { + let size = parts[1] + .parse::() + .map_err(|_| ArrowError::Ffi("size is not a valid integer".to_string()))?; + DataType::FixedSizeBinary(size) + } else if parts.len() == 2 && parts[0] == "+w" { + let size = parts[1] + .parse::() + .map_err(|_| ArrowError::Ffi("size is not a valid integer".to_string()))?; + let child = to_field(schema.child(0))?; + DataType::FixedSizeList(Box::new(child), size) } else if parts.len() == 2 && parts[0] == "d" { let parts = parts[1].split(',').collect::>(); if parts.len() < 2 || parts.len() > 3 { @@ -395,7 +409,7 @@ fn to_format(data_type: &DataType) -> String { DataType::List(_) => "+l".to_string(), DataType::LargeList(_) => "+L".to_string(), DataType::Struct(_) => "+s".to_string(), - DataType::FixedSizeBinary(size) => format!("w{}", size), + DataType::FixedSizeBinary(size) => format!("w:{}", size), DataType::FixedSizeList(_, size) => format!("+w:{}", size), DataType::Union(f, ids, mode) => { let sparsness = if mode.is_sparse() { 's' } else { 'd' }; @@ -419,6 +433,7 @@ fn to_format(data_type: &DataType) -> String { pub(super) fn get_field_child(field: &Field, index: usize) -> Result { match (index, field.data_type()) { (0, DataType::List(field)) => Ok(field.as_ref().clone()), + (0, DataType::FixedSizeList(field, _)) => Ok(field.as_ref().clone()), (0, DataType::LargeList(field)) => Ok(field.as_ref().clone()), (0, DataType::Map(field, _)) => Ok(field.as_ref().clone()), (index, DataType::Struct(fields)) => Ok(fields[index].clone()), diff --git a/tests/it/ffi.rs b/tests/it/ffi.rs index c7912d6cd81..7a1b1664f74 100644 --- a/tests/it/ffi.rs +++ b/tests/it/ffi.rs @@ -125,6 +125,22 @@ fn list() -> Result<()> { test_round_trip(array) } +#[test] +fn fixed_list() -> Result<()> { + let data = vec![ + Some(vec![Some(1i32), Some(2), Some(3)]), + None, + Some(vec![Some(4), None, Some(6)]), + ]; + + let mut array = MutableFixedSizeListArray::new(MutablePrimitiveArray::::new(), 3); + array.try_extend(data)?; + + let array: FixedSizeListArray = array.into(); + + test_round_trip(array) +} + #[test] fn list_list() -> Result<()> { let data = vec![