From 5350a233a1e27c73f1e66b279fa3019d7c9adc5d Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 28 Dec 2023 13:36:44 +0100 Subject: [PATCH 01/25] perf: don't needlessly allocate validity in concat/rechunk (#13288) --- src/array/growable/binary.rs | 31 ++++++++-------- src/array/growable/boolean.rs | 35 +++++++++--------- src/array/growable/dictionary.rs | 40 +++++++++------------ src/array/growable/fixed_binary.rs | 25 ++++++------- src/array/growable/fixed_size_list.rs | 29 +++++++-------- src/array/growable/list.rs | 22 +++++------- src/array/growable/map.rs | 22 +++++------- src/array/growable/primitive.rs | 41 +++++++++++---------- src/array/growable/structure.rs | 36 +++++++++---------- src/array/growable/utf8.rs | 23 +++++------- src/array/growable/utils.rs | 52 +++++++++++++++------------ 11 files changed, 169 insertions(+), 187 deletions(-) diff --git a/src/array/growable/binary.rs b/src/array/growable/binary.rs index 53ff0ae4feb..06f7ce6867f 100644 --- a/src/array/growable/binary.rs +++ b/src/array/growable/binary.rs @@ -8,18 +8,18 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, + utils::extend_offset_values, Growable, }; +use crate::array::growable::utils::{extend_validity, prepare_validity}; /// Concrete [`Growable`] for the [`BinaryArray`]. pub struct GrowableBinary<'a, O: Offset> { arrays: Vec<&'a BinaryArray>, data_type: DataType, - validity: MutableBitmap, + validity: Option, values: Vec, offsets: Offsets, - extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableBinary<'a, O> { @@ -35,18 +35,12 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - Self { arrays, data_type, values: Vec::with_capacity(0), offsets: Offsets::with_capacity(capacity), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -56,15 +50,20 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { let offsets = std::mem::take(&mut self.offsets); let values = std::mem::take(&mut self.values); - BinaryArray::::new(data_type, offsets.into(), values.into(), validity.into()) + BinaryArray::::new( + data_type, + offsets.into(), + values.into(), + validity.map(|v| v.into()), + ) } } impl<'a, O: Offset> Growable<'a> for GrowableBinary<'a, O> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + let offsets = array.offsets(); let values = array.values(); @@ -78,7 +77,9 @@ impl<'a, O: Offset> Growable<'a> for GrowableBinary<'a, O> { fn extend_validity(&mut self, additional: usize) { self.offsets.extend_constant(additional); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -101,7 +102,7 @@ impl<'a, O: Offset> From> for BinaryArray { val.data_type, val.offsets.into(), val.values.into(), - val.validity.into(), + val.validity.map(|v| v.into()), ) } } diff --git a/src/array/growable/boolean.rs b/src/array/growable/boolean.rs index 0cb1213403f..09a2bc632c9 100644 --- a/src/array/growable/boolean.rs +++ b/src/array/growable/boolean.rs @@ -7,7 +7,7 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, 
Growable, }; @@ -15,9 +15,8 @@ use super::{ pub struct GrowableBoolean<'a> { arrays: Vec<&'a BooleanArray>, data_type: DataType, - validity: MutableBitmap, + validity: Option, values: MutableBitmap, - extend_null_bits: Vec>, } impl<'a> GrowableBoolean<'a> { @@ -33,33 +32,31 @@ impl<'a> GrowableBoolean<'a> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - Self { arrays, data_type, values: MutableBitmap::with_capacity(capacity), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } fn to(&mut self) -> BooleanArray { - let validity = std::mem::take(&mut self.validity); + let validity = self.validity.take(); let values = std::mem::take(&mut self.values); - BooleanArray::new(self.data_type.clone(), values.into(), validity.into()) + BooleanArray::new( + self.data_type.clone(), + values.into(), + validity.map(|v| v.into()), + ) } } impl<'a> Growable<'a> for GrowableBoolean<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + let values = array.values(); let (slice, offset, _) = values.as_slice(); @@ -72,7 +69,9 @@ impl<'a> Growable<'a> for GrowableBoolean<'a> { fn extend_validity(&mut self, additional: usize) { self.values.extend_constant(additional, false); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -91,6 +90,10 @@ impl<'a> Growable<'a> for GrowableBoolean<'a> { impl<'a> From> for BooleanArray { fn from(val: GrowableBoolean<'a>) -> Self { - BooleanArray::new(val.data_type, val.values.into(), val.validity.into()) + BooleanArray::new( + val.data_type, + val.values.into(), + val.validity.map(|v| v.into()), + ) } } diff --git a/src/array/growable/dictionary.rs b/src/array/growable/dictionary.rs index f550304c852..44f2aab00c5 100644 --- a/src/array/growable/dictionary.rs +++ b/src/array/growable/dictionary.rs @@ -8,7 +8,7 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; @@ -18,12 +18,11 @@ use super::{ /// the values of each [`DictionaryArray`] one after the other. 
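/// For example (illustrative, not part of this patch): concatenating two dictionaries whose
/// values are ["a", "b"] and ["c"] stores the values as ["a", "b", "c"], and every key taken
/// from the second array is shifted by that array's offset (here 2), so its key 0 becomes key 2.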
pub struct GrowableDictionary<'a, K: DictionaryKey> { data_type: DataType, - keys_values: Vec<&'a [K]>, + keys: Vec<&'a PrimitiveArray>, key_values: Vec, - key_validity: MutableBitmap, + validity: Option, offsets: Vec, values: Box, - extend_null_bits: Vec>, } fn concatenate_values( @@ -55,16 +54,6 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { }; let arrays_keys = arrays.iter().map(|array| array.keys()).collect::>(); - let keys_values = arrays_keys - .iter() - .map(|array| array.values().as_slice()) - .collect::>(); - - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(array.keys(), use_validity)) - .collect(); - let arrays_values = arrays .iter() .map(|array| array.values().as_ref()) @@ -76,24 +65,26 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { data_type, offsets, values, - keys_values, + keys: arrays_keys, key_values: Vec::with_capacity(capacity), - key_validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } #[inline] fn to(&mut self) -> DictionaryArray { - let validity = std::mem::take(&mut self.key_validity); + let validity = self.validity.take(); let key_values = std::mem::take(&mut self.key_values); #[cfg(debug_assertions)] { crate::array::specification::check_indexes(&key_values, self.values.len()).unwrap(); } - let keys = - PrimitiveArray::::new(T::PRIMITIVE.into(), key_values.into(), validity.into()); + let keys = PrimitiveArray::::new( + T::PRIMITIVE.into(), + key_values.into(), + validity.map(|v| v.into()), + ); // Safety - the invariant of this struct ensures that this is up-held unsafe { @@ -110,9 +101,10 @@ impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> { impl<'a, T: DictionaryKey> Growable<'a> for GrowableDictionary<'a, T> { #[inline] fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.key_validity, start, len); + let keys_array = self.keys[index]; + extend_validity(&mut self.validity, keys_array, start, len); - let values = &self.keys_values[index][start..start + len]; + let values = &keys_array.values()[start..start + len]; let offset = self.offsets[index]; self.key_values.extend( values @@ -141,7 +133,9 @@ impl<'a, T: DictionaryKey> Growable<'a> for GrowableDictionary<'a, T> { fn extend_validity(&mut self, additional: usize) { self.key_values .resize(self.key_values.len() + additional, T::default()); - self.key_validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] diff --git a/src/array/growable/fixed_binary.rs b/src/array/growable/fixed_binary.rs index 763bd59c817..7014e75cc8e 100644 --- a/src/array/growable/fixed_binary.rs +++ b/src/array/growable/fixed_binary.rs @@ -6,16 +6,15 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; /// Concrete [`Growable`] for the [`FixedSizeBinaryArray`]. 
pub struct GrowableFixedSizeBinary<'a> { arrays: Vec<&'a FixedSizeBinaryArray>, - validity: MutableBitmap, + validity: Option, values: Vec, - extend_null_bits: Vec>, size: usize, // just a cache } @@ -34,17 +33,11 @@ impl<'a> GrowableFixedSizeBinary<'a> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let size = FixedSizeBinaryArray::get_size(arrays[0].data_type()); Self { arrays, values: Vec::with_capacity(0), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), size, } } @@ -56,16 +49,16 @@ impl<'a> GrowableFixedSizeBinary<'a> { FixedSizeBinaryArray::new( self.arrays[0].data_type().clone(), values.into(), - validity.into(), + validity.map(|v| v.into()), ) } } impl<'a> Growable<'a> for GrowableFixedSizeBinary<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + let values = array.values(); self.values @@ -75,7 +68,9 @@ impl<'a> Growable<'a> for GrowableFixedSizeBinary<'a> { fn extend_validity(&mut self, additional: usize) { self.values .extend_from_slice(&vec![0; self.size * additional]); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -97,7 +92,7 @@ impl<'a> From> for FixedSizeBinaryArray { FixedSizeBinaryArray::new( val.arrays[0].data_type().clone(), val.values.into(), - val.validity.into(), + val.validity.map(|v| v.into()), ) } } diff --git a/src/array/growable/fixed_size_list.rs b/src/array/growable/fixed_size_list.rs index a70695f4554..37b1519d60f 100644 --- a/src/array/growable/fixed_size_list.rs +++ b/src/array/growable/fixed_size_list.rs @@ -8,16 +8,15 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; /// Concrete [`Growable`] for the [`FixedSizeListArray`]. 
pub struct GrowableFixedSizeList<'a> { arrays: Vec<&'a FixedSizeListArray>, - validity: MutableBitmap, + validity: Option, values: Box + 'a>, - extend_null_bits: Vec>, size: usize, } @@ -45,11 +44,6 @@ impl<'a> GrowableFixedSizeList<'a> { unreachable!("`GrowableFixedSizeList` expects `DataType::FixedSizeList`") }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let inner = arrays .iter() .map(|array| array.values().as_ref()) @@ -59,8 +53,7 @@ impl<'a> GrowableFixedSizeList<'a> { Self { arrays, values, - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), size, } } @@ -69,20 +62,28 @@ impl<'a> GrowableFixedSizeList<'a> { let validity = std::mem::take(&mut self.validity); let values = self.values.as_box(); - FixedSizeListArray::new(self.arrays[0].data_type().clone(), values, validity.into()) + FixedSizeListArray::new( + self.arrays[0].data_type().clone(), + values, + validity.map(|v| v.into()), + ) } } impl<'a> Growable<'a> for GrowableFixedSizeList<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + self.values .extend(index, start * self.size, len * self.size); } fn extend_validity(&mut self, additional: usize) { self.values.extend_validity(additional * self.size); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -107,7 +108,7 @@ impl<'a> From> for FixedSizeListArray { Self::new( val.arrays[0].data_type().clone(), values, - val.validity.into(), + val.validity.map(|v| v.into()), ) } } diff --git a/src/array/growable/list.rs b/src/array/growable/list.rs index c0abb26dd72..e8506ecc522 100644 --- a/src/array/growable/list.rs +++ b/src/array/growable/list.rs @@ -8,7 +8,7 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; @@ -35,10 +35,9 @@ fn extend_offset_values( /// Concrete [`Growable`] for the [`ListArray`]. 
pub struct GrowableList<'a, O: Offset> { arrays: Vec<&'a ListArray>, - validity: MutableBitmap, + validity: Option, values: Box + 'a>, offsets: Offsets, - extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableList<'a, O> { @@ -52,11 +51,6 @@ impl<'a, O: Offset> GrowableList<'a, O> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let inner = arrays .iter() .map(|array| array.values().as_ref()) @@ -67,8 +61,7 @@ impl<'a, O: Offset> GrowableList<'a, O> { arrays, offsets: Offsets::with_capacity(capacity), values, - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -81,20 +74,23 @@ impl<'a, O: Offset> GrowableList<'a, O> { self.arrays[0].data_type().clone(), offsets.into(), values, - validity.into(), + validity.map(|v| v.into()), ) } } impl<'a, O: Offset> Growable<'a> for GrowableList<'a, O> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); extend_offset_values::(self, index, start, len); } fn extend_validity(&mut self, additional: usize) { self.offsets.extend_constant(additional); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] diff --git a/src/array/growable/map.rs b/src/array/growable/map.rs index 0919b4821ba..27238f69ca2 100644 --- a/src/array/growable/map.rs +++ b/src/array/growable/map.rs @@ -8,7 +8,7 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; @@ -30,10 +30,9 @@ fn extend_offset_values(growable: &mut GrowableMap<'_>, index: usize, start: usi /// Concrete [`Growable`] for the [`MapArray`]. 
pub struct GrowableMap<'a> { arrays: Vec<&'a MapArray>, - validity: MutableBitmap, + validity: Option, values: Box + 'a>, offsets: Offsets, - extend_null_bits: Vec>, } impl<'a> GrowableMap<'a> { @@ -47,11 +46,6 @@ impl<'a> GrowableMap<'a> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let inner = arrays .iter() .map(|array| array.field().as_ref()) @@ -62,8 +56,7 @@ impl<'a> GrowableMap<'a> { arrays, offsets: Offsets::with_capacity(capacity), values, - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -76,20 +69,23 @@ impl<'a> GrowableMap<'a> { self.arrays[0].data_type().clone(), offsets.into(), values, - validity.into(), + validity.map(|v| v.into()), ) } } impl<'a> Growable<'a> for GrowableMap<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); extend_offset_values(self, index, start, len); } fn extend_validity(&mut self, additional: usize) { self.offsets.extend_constant(additional); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] diff --git a/src/array/growable/primitive.rs b/src/array/growable/primitive.rs index e443756cb95..7fb0939407d 100644 --- a/src/array/growable/primitive.rs +++ b/src/array/growable/primitive.rs @@ -8,17 +8,16 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; /// Concrete [`Growable`] for the [`PrimitiveArray`]. 
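/// Note: after this change the growable keeps references to the source [`PrimitiveArray`]s
/// instead of raw value slices, so `extend` can copy both the values and, when present, the
/// validity bits from the same indexed array.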
pub struct GrowablePrimitive<'a, T: NativeType> { data_type: DataType, - arrays: Vec<&'a [T]>, - validity: MutableBitmap, + arrays: Vec<&'a PrimitiveArray>, + validity: Option, values: Vec, - extend_null_bits: Vec>, } impl<'a, T: NativeType> GrowablePrimitive<'a, T> { @@ -38,22 +37,11 @@ impl<'a, T: NativeType> GrowablePrimitive<'a, T> { let data_type = arrays[0].data_type().clone(); - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - - let arrays = arrays - .iter() - .map(|array| array.values().as_slice()) - .collect::>(); - Self { data_type, arrays, values: Vec::with_capacity(capacity), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -62,16 +50,21 @@ impl<'a, T: NativeType> GrowablePrimitive<'a, T> { let validity = std::mem::take(&mut self.validity); let values = std::mem::take(&mut self.values); - PrimitiveArray::::new(self.data_type.clone(), values.into(), validity.into()) + PrimitiveArray::::new( + self.data_type.clone(), + values.into(), + validity.map(|v| v.into()), + ) } } impl<'a, T: NativeType> Growable<'a> for GrowablePrimitive<'a, T> { #[inline] fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); - let values = self.arrays[index]; + let values = array.values().as_slice(); self.values.extend_from_slice(&values[start..start + len]); } @@ -79,7 +72,9 @@ impl<'a, T: NativeType> Growable<'a> for GrowablePrimitive<'a, T> { fn extend_validity(&mut self, additional: usize) { self.values .resize(self.values.len() + additional, T::default()); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] @@ -101,6 +96,10 @@ impl<'a, T: NativeType> Growable<'a> for GrowablePrimitive<'a, T> { impl<'a, T: NativeType> From> for PrimitiveArray { #[inline] fn from(val: GrowablePrimitive<'a, T>) -> Self { - PrimitiveArray::::new(val.data_type, val.values.into(), val.validity.into()) + PrimitiveArray::::new( + val.data_type, + val.values.into(), + val.validity.map(|v| v.into()), + ) } } diff --git a/src/array/growable/structure.rs b/src/array/growable/structure.rs index b1242e08a4f..ccfe4399435 100644 --- a/src/array/growable/structure.rs +++ b/src/array/growable/structure.rs @@ -7,16 +7,15 @@ use crate::{ use super::{ make_growable, - utils::{build_extend_null_bits, ExtendNullBits}, + utils::{extend_validity, prepare_validity}, Growable, }; /// Concrete [`Growable`] for the [`StructArray`]. 
pub struct GrowableStruct<'a> { arrays: Vec<&'a StructArray>, - validity: MutableBitmap, + validity: Option, values: Vec + 'a>>, - extend_null_bits: Vec>, } impl<'a> GrowableStruct<'a> { @@ -32,11 +31,6 @@ impl<'a> GrowableStruct<'a> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref::().unwrap()) @@ -59,8 +53,7 @@ impl<'a> GrowableStruct<'a> { Self { arrays, values, - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -69,15 +62,19 @@ impl<'a> GrowableStruct<'a> { let values = std::mem::take(&mut self.values); let values = values.into_iter().map(|mut x| x.as_box()).collect(); - StructArray::new(self.arrays[0].data_type().clone(), values, validity.into()) + StructArray::new( + self.arrays[0].data_type().clone(), + values, + validity.map(|v| v.into()), + ) } } impl<'a> Growable<'a> for GrowableStruct<'a> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + if array.null_count() == 0 { self.values .iter_mut() @@ -101,18 +98,17 @@ impl<'a> Growable<'a> for GrowableStruct<'a> { self.values .iter_mut() .for_each(|child| child.extend_validity(additional)); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] fn len(&self) -> usize { - // All children should have the same indexing, so just use the first - // one. If we don't have children, we might still have a validity - // array, so use that. - if let Some(child) = self.values.get(0) { + if let Some(child) = self.values.first() { child.len() } else { - self.validity.len() + unreachable!() } } @@ -132,7 +128,7 @@ impl<'a> From> for StructArray { StructArray::new( val.arrays[0].data_type().clone(), values, - val.validity.into(), + val.validity.map(|v| v.into()), ) } } diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index cd71da0a264..f65709f961e 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -7,17 +7,16 @@ use crate::{ }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, + utils::{extend_validity, prepare_validity, extend_offset_values}, Growable, }; /// Concrete [`Growable`] for the [`Utf8Array`]. 
pub struct GrowableUtf8<'a, O: Offset> { arrays: Vec<&'a Utf8Array>, - validity: MutableBitmap, + validity: Option, values: Vec, offsets: Offsets, - extend_null_bits: Vec>, } impl<'a, O: Offset> GrowableUtf8<'a, O> { @@ -31,17 +30,11 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { use_validity = true; }; - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(*array, use_validity)) - .collect(); - Self { arrays: arrays.to_vec(), values: Vec::with_capacity(0), offsets: Offsets::with_capacity(capacity), - validity: MutableBitmap::with_capacity(capacity), - extend_null_bits, + validity: prepare_validity(use_validity, capacity), } } @@ -60,7 +53,7 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { self.arrays[0].data_type().clone(), offsets.into(), values.into(), - validity.into(), + validity.map(|v| v.into()), ) .unwrap() } @@ -69,9 +62,9 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { impl<'a, O: Offset> Growable<'a> for GrowableUtf8<'a, O> { fn extend(&mut self, index: usize, start: usize, len: usize) { - (self.extend_null_bits[index])(&mut self.validity, start, len); - let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + let offsets = array.offsets(); let values = array.values(); @@ -85,7 +78,9 @@ impl<'a, O: Offset> Growable<'a> for GrowableUtf8<'a, O> { fn extend_validity(&mut self, additional: usize) { self.offsets.extend_constant(additional); - self.validity.extend_constant(additional, false); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } } #[inline] diff --git a/src/array/growable/utils.rs b/src/array/growable/utils.rs index 3e0c25a4ee2..3da39275bb4 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -1,28 +1,5 @@ use crate::{array::Array, bitmap::MutableBitmap, offset::Offset}; -// function used to extend nulls from arrays. This function's lifetime is bound to the array -// because it reads nulls from it. 
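// (Editorial note: the boxed-closure machinery removed here is replaced below by
// `prepare_validity`, which only allocates a `MutableBitmap` when some source array actually
// has nulls, and `extend_validity`, which copies or back-fills bits on each extend call.)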
-pub(super) type ExtendNullBits<'a> = Box; - -pub(super) fn build_extend_null_bits(array: &dyn Array, use_validity: bool) -> ExtendNullBits { - if let Some(bitmap) = array.validity() { - Box::new(move |validity, start, len| { - debug_assert!(start + len <= bitmap.len()); - let (slice, offset, _) = bitmap.as_slice(); - // safety: invariant offset + length <= slice.len() - unsafe { - validity.extend_from_slice_unchecked(slice, start + offset, len); - } - }) - } else if use_validity { - Box::new(|validity, _, len| { - validity.extend_constant(len, true); - }) - } else { - Box::new(|_, _, _| {}) - } -} - #[inline] pub(super) fn extend_offset_values( buffer: &mut Vec, @@ -36,3 +13,32 @@ pub(super) fn extend_offset_values( let new_values = &values[start_values..end_values]; buffer.extend_from_slice(new_values); } + +pub(super) fn prepare_validity(use_validity: bool, capacity: usize) -> Option { + if use_validity { + Some(MutableBitmap::with_capacity(capacity)) + } else { + None + } +} + +pub(super) fn extend_validity( + mutable_validity: &mut Option, + array: &dyn Array, + start: usize, + len: usize, +) { + if let Some(mutable_validity) = mutable_validity { + match array.validity() { + None => mutable_validity.extend_constant(len, true), + Some(validity) => { + debug_assert!(start + len <= validity.len()); + let (slice, offset, _) = validity.as_slice(); + // safety: invariant offset + length <= slice.len() + unsafe { + mutable_validity.extend_from_slice_unchecked(slice, start + offset, len); + } + }, + } + } +} From dcbeabe8db0646d796ca69ec557a97dc19c5151d Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 5 Jan 2024 13:28:24 +0100 Subject: [PATCH 02/25] feat: implement `BinaryView` and `Utf8View` in `polars-arrow` (#13243) --- Cargo.toml | 2 +- src/array/binview/ffi.rs | 72 ++++++++ src/array/binview/fmt.rs | 37 ++++ src/array/binview/iterator.rs | 30 ++++ src/array/binview/mod.rs | 304 ++++++++++++++++++++++++++++++++ src/array/binview/mutable.rs | 182 +++++++++++++++++++ src/array/binview/view.rs | 81 +++++++++ src/array/equal/binary_view.rs | 9 + src/array/equal/mod.rs | 13 +- src/array/ffi.rs | 2 + src/array/fmt.rs | 14 ++ src/array/growable/binview.rs | 116 ++++++++++++ src/array/growable/mod.rs | 35 +++- src/array/mod.rs | 12 ++ src/bitmap/mutable.rs | 15 +- src/buffer/immutable.rs | 7 + src/compute/aggregate/memory.rs | 8 + src/datatypes/mod.rs | 36 +++- src/datatypes/physical_type.rs | 6 + src/ffi/array.rs | 2 + src/ffi/bridge.rs | 4 +- src/ffi/schema.rs | 2 + src/io/ipc/read/deserialize.rs | 2 + src/io/ipc/read/schema.rs | 4 + src/io/ipc/write/common.rs | 27 ++- src/io/ipc/write/schema.rs | 3 + src/io/ipc/write/serialize.rs | 3 +- src/scalar/binview.rs | 72 ++++++++ src/scalar/mod.rs | 20 +++ src/types/mod.rs | 3 + src/types/native.rs | 1 + 31 files changed, 1105 insertions(+), 19 deletions(-) create mode 100644 src/array/binview/ffi.rs create mode 100644 src/array/binview/fmt.rs create mode 100644 src/array/binview/iterator.rs create mode 100644 src/array/binview/mod.rs create mode 100644 src/array/binview/mutable.rs create mode 100644 src/array/binview/view.rs create mode 100644 src/array/equal/binary_view.rs create mode 100644 src/array/growable/binview.rs create mode 100644 src/scalar/binview.rs diff --git a/Cargo.toml b/Cargo.toml index a8e5933d2fe..0c8dc4fd71b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ indexmap = { version = "^1.6", optional = true } # used to print columns in a nice columnar format comfy-table = { version = "6.0", optional = true, 
default-features = false } -arrow-format = { version = "0.8", optional = true, features = ["ipc"] } +arrow-format = { package = "polars-arrow-format", version = "0.1.0", optional = true, features = ["ipc"] } hex = { version = "^0.4", optional = true } diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs new file mode 100644 index 00000000000..00f9896b9e3 --- /dev/null +++ b/src/array/binview/ffi.rs @@ -0,0 +1,72 @@ +use std::sync::Arc; + +use polars_error::PolarsResult; + +use super::BinaryViewArrayGeneric; +use crate::array::binview::ViewType; +use crate::array::{FromFfi, ToFfi}; +use crate::bitmap::align; +use crate::ffi; + +unsafe impl ToFfi for BinaryViewArrayGeneric { + fn buffers(&self) -> Vec> { + let mut buffers = Vec::with_capacity(self.buffers.len() + 2); + buffers.push(self.validity.as_ref().map(|x| x.as_ptr())); + buffers.push(Some(self.views.as_ptr().cast::())); + buffers.extend(self.buffers.iter().map(|b| Some(b.as_ptr()))); + buffers + } + + fn offset(&self) -> Option { + let offset = self.views.offset(); + if let Some(bitmap) = self.validity.as_ref() { + if bitmap.offset() == offset { + Some(offset) + } else { + None + } + } else { + Some(offset) + } + } + + fn to_ffi_aligned(&self) -> Self { + let offset = self.views.offset(); + + let validity = self.validity.as_ref().map(|bitmap| { + if bitmap.offset() == offset { + bitmap.clone() + } else { + align(bitmap, offset) + } + }); + + Self { + data_type: self.data_type.clone(), + validity, + views: self.views.clone(), + buffers: self.buffers.clone(), + raw_buffers: self.raw_buffers.clone(), + phantom: Default::default(), + } + } +} + +impl FromFfi for BinaryViewArrayGeneric { + unsafe fn try_from_ffi(array: A) -> PolarsResult { + let data_type = array.data_type().clone(); + + let validity = unsafe { array.validity() }?; + let views = unsafe { array.buffer::(1) }?; + + let n = array.n_buffers() - 2; + let mut buffers = Vec::with_capacity(n); + + for i in 2..n + 2 { + let values = unsafe { array.buffer::(i) }?; + buffers.push(values); + } + + Self::try_new(data_type, views, Arc::from(buffers), validity) + } +} diff --git a/src/array/binview/fmt.rs b/src/array/binview/fmt.rs new file mode 100644 index 00000000000..1337588c61c --- /dev/null +++ b/src/array/binview/fmt.rs @@ -0,0 +1,37 @@ +use std::fmt::{Debug, Formatter, Result, Write}; + +use super::super::fmt::write_vec; +use super::BinaryViewArrayGeneric; +use crate::array::binview::ViewType; +use crate::array::Array; + +pub fn write_value<'a, T: ViewType + ?Sized, W: Write>( + array: &'a BinaryViewArrayGeneric, + index: usize, + f: &mut W, +) -> Result +where + &'a T: Debug, +{ + let bytes = array.value(index).to_bytes(); + let writer = |f: &mut W, index| write!(f, "{}", bytes[index]); + + write_vec(f, writer, None, bytes.len(), "None", false) +} + +impl Debug for BinaryViewArrayGeneric +where + for<'a> &'a T: Debug, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let writer = |f: &mut Formatter, index| write_value(self, index, f); + + let head = if T::IS_UTF8 { + "Utf8ViewArray" + } else { + "BinaryViewArray" + }; + write!(f, "{head}")?; + write_vec(f, writer, self.validity(), self.len(), "None", false) + } +} diff --git a/src/array/binview/iterator.rs b/src/array/binview/iterator.rs new file mode 100644 index 00000000000..5e53fb8fec6 --- /dev/null +++ b/src/array/binview/iterator.rs @@ -0,0 +1,30 @@ +use super::BinaryViewArrayGeneric; +use crate::array::binview::ViewType; +use crate::array::{ArrayAccessor, ArrayValuesIter}; +use 
crate::bitmap::utils::{BitmapIter, ZipValidity}; + +unsafe impl<'a, T: ViewType + ?Sized> ArrayAccessor<'a> for BinaryViewArrayGeneric { + type Item = &'a T; + + #[inline] + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) + } + + #[inline] + fn len(&self) -> usize { + self.views.len() + } +} + +/// Iterator of values of an [`BinaryArray`]. +pub type BinaryViewValueIter<'a, T> = ArrayValuesIter<'a, BinaryViewArrayGeneric>; + +impl<'a, T: ViewType + ?Sized> IntoIterator for &'a BinaryViewArrayGeneric { + type Item = Option<&'a T>; + type IntoIter = ZipValidity<&'a T, BinaryViewValueIter<'a, T>, BitmapIter<'a>>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs new file mode 100644 index 00000000000..98ed1b6f8e6 --- /dev/null +++ b/src/array/binview/mod.rs @@ -0,0 +1,304 @@ +//! See thread: https://lists.apache.org/thread/w88tpz76ox8h3rxkjl4so6rg3f1rv7wt +mod ffi; +pub(super) mod fmt; +mod iterator; +mod mutable; +mod view; + +use std::any::Any; +use std::fmt::Debug; +use std::marker::PhantomData; +use std::sync::Arc; + +use polars_error::*; + +use crate::array::Array; +use crate::bitmap::Bitmap; +use crate::buffer::Buffer; +use crate::datatypes::ArrowDataType; + +mod private { + pub trait Sealed: Send + Sync {} + + impl Sealed for str {} + impl Sealed for [u8] {} +} +use private::Sealed; + +use crate::array::binview::iterator::BinaryViewValueIter; +use crate::array::binview::view::{validate_binary_view, validate_utf8_view}; +use crate::array::iterator::NonNullValuesIter; +use crate::bitmap::utils::{BitmapIter, ZipValidity}; + +pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>; +pub type Utf8ViewArray = BinaryViewArrayGeneric; + +pub trait ViewType: Sealed + 'static + PartialEq { + const IS_UTF8: bool; + const DATA_TYPE: ArrowDataType; + type Owned: Debug + Clone + Sync + Send + AsRef; + + /// # Safety + /// The caller must ensure `index < self.len()`. + unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self; + + fn to_bytes(&self) -> &[u8]; + + #[allow(clippy::wrong_self_convention)] + fn into_owned(&self) -> Self::Owned; +} + +impl ViewType for str { + const IS_UTF8: bool = true; + const DATA_TYPE: ArrowDataType = ArrowDataType::Utf8View; + type Owned = String; + + #[inline(always)] + unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self { + std::str::from_utf8_unchecked(slice) + } + + #[inline(always)] + fn to_bytes(&self) -> &[u8] { + self.as_bytes() + } + + fn into_owned(&self) -> Self::Owned { + self.to_string() + } +} + +impl ViewType for [u8] { + const IS_UTF8: bool = false; + const DATA_TYPE: ArrowDataType = ArrowDataType::BinaryView; + type Owned = Vec; + + #[inline(always)] + unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self { + slice + } + + #[inline(always)] + fn to_bytes(&self) -> &[u8] { + self + } + + fn into_owned(&self) -> Self::Owned { + self.to_vec() + } +} + +pub struct BinaryViewArrayGeneric { + data_type: ArrowDataType, + views: Buffer, + buffers: Arc<[Buffer]>, + // Raw buffer access. (pointer, len). 
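    // These (pointer, length) pairs are derived from `buffers` (see `buffers_into_raw` below)
    // and are read by `value_unchecked` to slice the backing bytes directly; they must always
    // stay in sync with `buffers`.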
+ raw_buffers: Arc<[(*const u8, usize)]>, + validity: Option, + phantom: PhantomData, +} + +impl Clone for BinaryViewArrayGeneric { + fn clone(&self) -> Self { + Self { + data_type: self.data_type.clone(), + views: self.views.clone(), + buffers: self.buffers.clone(), + raw_buffers: self.raw_buffers.clone(), + validity: self.validity.clone(), + phantom: Default::default(), + } + } +} + +unsafe impl Send for BinaryViewArrayGeneric {} +unsafe impl Sync for BinaryViewArrayGeneric {} + +fn buffers_into_raw(buffers: &[Buffer]) -> Arc<[(*const T, usize)]> { + buffers + .iter() + .map(|buf| (buf.as_ptr(), buf.len())) + .collect() +} + +impl BinaryViewArrayGeneric { + /// # Safety + /// The caller must ensure + /// - the data is valid utf8 (if required) + /// - The offsets match the buffers. + pub unsafe fn new_unchecked( + data_type: ArrowDataType, + views: Buffer, + buffers: Arc<[Buffer]>, + validity: Option, + ) -> Self { + let raw_buffers = buffers_into_raw(&buffers); + Self { + data_type, + views, + buffers, + raw_buffers, + validity, + phantom: Default::default(), + } + } + + pub fn data_buffers(&self) -> &[Buffer] { + self.buffers.as_ref() + } + + pub fn views(&self) -> &Buffer { + &self.views + } + + pub fn try_new( + data_type: ArrowDataType, + views: Buffer, + buffers: Arc<[Buffer]>, + validity: Option, + ) -> PolarsResult { + if T::IS_UTF8 { + validate_utf8_view(views.as_ref(), buffers.as_ref())?; + } else { + validate_binary_view(views.as_ref(), buffers.as_ref())?; + } + + if let Some(validity) = &validity { + polars_ensure!(validity.len()== views.len(), ComputeError: "validity mask length must match the number of values" ) + } + + let raw_buffers = buffers_into_raw(&buffers); + Ok(Self { + data_type, + views, + buffers, + raw_buffers, + validity, + phantom: Default::default(), + }) + } + + /// Creates an empty [`BinaryViewArrayGeneric`], i.e. whose `.len` is zero. + #[inline] + pub fn new_empty(data_type: ArrowDataType) -> Self { + unsafe { Self::new_unchecked(data_type, Buffer::new(), Arc::from([]), None) } + } + + /// Returns a new null [`BinaryViewArrayGeneric`] of `length`. + #[inline] + pub fn new_null(data_type: ArrowDataType, length: usize) -> Self { + let validity = Some(Bitmap::new_zeroed(length)); + unsafe { Self::new_unchecked(data_type, Buffer::zeroed(length), Arc::from([]), validity) } + } + + /// Returns the element at index `i` + /// # Panics + /// iff `i >= self.len()` + #[inline] + pub fn value(&self, i: usize) -> &T { + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } + } + + /// Returns the element at index `i` + /// # Safety + /// Assumes that the `i < self.len`. + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &T { + let v = *self.views.get_unchecked(i); + let len = v as u32; + + // view layout: + // length: 4 bytes + // prefix: 4 bytes + // buffer_index: 4 bytes + // offset: 4 bytes + + // inlined layout: + // length: 4 bytes + // data: 12 bytes + + let bytes = if len <= 12 { + let ptr = self.views.as_ptr() as *const u8; + std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) + } else { + let buffer_idx = (v >> 64) as u32; + let offset = (v >> 96) as u32; + let (data_ptr, data_len) = *self.raw_buffers.get_unchecked(buffer_idx as usize); + let data = std::slice::from_raw_parts(data_ptr, data_len); + let offset = offset as usize; + data.get_unchecked(offset..offset + len as usize) + }; + T::from_bytes_unchecked(bytes) + } + + /// Returns an iterator of `Option<&T>` over every element of this array. 
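    /// (Illustrative: for an array built from `[Some("ab"), None]`, `iter` yields
    /// `Some("ab")` then `None`, whereas `values_iter` below ignores validity and yields a
    /// value for every slot.)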
+ pub fn iter(&self) -> ZipValidity<&T, BinaryViewValueIter, BitmapIter> { + ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref()) + } + + /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity + pub fn values_iter(&self) -> BinaryViewValueIter { + BinaryViewValueIter::new(self) + } + + /// Returns an iterator of the non-null values. + #[inline] + pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric> { + NonNullValuesIter::new(self, self.validity()) + } + + impl_sliced!(); + impl_mut_validity!(); + impl_into_array!(); +} + +impl Array for BinaryViewArrayGeneric { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn len(&self) -> usize { + self.views.len() + } + + fn data_type(&self) -> &ArrowDataType { + &self.data_type + } + + fn validity(&self) -> Option<&Bitmap> { + self.validity.as_ref() + } + + fn slice(&mut self, offset: usize, length: usize) { + assert!( + offset + length <= self.len(), + "the offset of the new Buffer cannot exceed the existing length" + ); + unsafe { self.slice_unchecked(offset, length) } + todo!() + } + + unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { + self.validity = self + .validity + .take() + .map(|bitmap| bitmap.sliced_unchecked(offset, length)) + .filter(|bitmap| bitmap.unset_bits() > 0); + self.views.slice_unchecked(offset, length); + } + + fn with_validity(&self, validity: Option) -> Box { + let mut new = self.clone(); + new.validity = validity; + Box::new(new) + } + + fn to_boxed(&self) -> Box { + Box::new(self.clone()) + } +} diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs new file mode 100644 index 00000000000..5bbd0c170f7 --- /dev/null +++ b/src/array/binview/mutable.rs @@ -0,0 +1,182 @@ +use std::sync::Arc; + +use polars_utils::slice::GetSaferUnchecked; + +use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; +use crate::bitmap::MutableBitmap; +use crate::buffer::Buffer; + +const DEFAULT_BLOCK_SIZE: usize = 8 * 1024; + +#[derive(Debug, Clone)] +pub struct MutableBinaryViewArray { + views: Vec, + completed_buffers: Vec>, + in_progress_buffer: Vec, + validity: Option, + phantom: std::marker::PhantomData, +} + +impl Default for MutableBinaryViewArray { + fn default() -> Self { + Self::with_capacity(0) + } +} + +impl From> for BinaryViewArrayGeneric { + fn from(mut value: MutableBinaryViewArray) -> Self { + value + .completed_buffers + .push(std::mem::take(&mut value.in_progress_buffer).into()); + + unsafe { + Self::new_unchecked( + T::DATA_TYPE, + value.views.into(), + Arc::from(value.completed_buffers), + value.validity.map(|b| b.into()), + ) + } + } +} + +impl MutableBinaryViewArray { + pub fn new() -> Self { + Self::default() + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + views: Vec::with_capacity(capacity), + completed_buffers: vec![], + in_progress_buffer: vec![], + validity: None, + phantom: Default::default(), + } + } + + /// Reserves `additional` elements and `additional_buffer` on the buffer. 
+ pub fn reserve(&mut self, additional: usize) { + self.views.reserve(additional); + } + + pub fn len(&self) -> usize { + self.views.len() + } + + fn init_validity(&mut self) { + let mut validity = MutableBitmap::with_capacity(self.views.capacity()); + validity.extend_constant(self.len(), true); + validity.set(self.len() - 1, false); + self.validity = Some(validity); + } + + pub fn push_value>(&mut self, value: V) { + if let Some(validity) = &mut self.validity { + validity.push(true) + } + + let value = value.as_ref(); + let bytes = value.to_bytes(); + let len: u32 = bytes.len().try_into().unwrap(); + let mut payload = [0; 16]; + payload[0..4].copy_from_slice(&len.to_le_bytes()); + + if len <= 12 { + payload[4..4 + bytes.len()].copy_from_slice(bytes); + } else { + let required_cap = self.in_progress_buffer.len() + bytes.len(); + if self.in_progress_buffer.capacity() < required_cap { + let new_capacity = (self.in_progress_buffer.capacity() * 2) + .clamp(DEFAULT_BLOCK_SIZE, 16 * 1024 * 1024) + .max(bytes.len()); + let in_progress = Vec::with_capacity(new_capacity); + let flushed = std::mem::replace(&mut self.in_progress_buffer, in_progress); + if !flushed.is_empty() { + self.completed_buffers.push(flushed.into()) + } + } + let offset = self.in_progress_buffer.len() as u32; + self.in_progress_buffer.extend_from_slice(bytes); + + unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked_release(0..4)) }; + let buffer_idx: u32 = self.completed_buffers.len().try_into().unwrap(); + payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); + payload[12..16].copy_from_slice(&offset.to_le_bytes()); + } + let value = u128::from_le_bytes(payload); + self.views.push(value); + } + + pub fn push>(&mut self, value: Option) { + if let Some(value) = value { + self.push_value(value) + } else { + self.views.push(0); + match &mut self.validity { + Some(validity) => validity.push(false), + None => self.init_validity(), + } + } + } + + impl_mutable_array_mut_validity!(); + + #[inline] + pub fn extend_values(&mut self, iterator: I) + where + I: Iterator, + P: AsRef, + { + self.reserve(iterator.size_hint().0); + for v in iterator { + self.push_value(v) + } + } + + #[inline] + pub fn extend(&mut self, iterator: I) + where + I: Iterator>, + P: AsRef, + { + self.reserve(iterator.size_hint().0); + for p in iterator { + self.push(p) + } + } + + pub fn from_iter(iterator: I) -> Self + where + I: Iterator>, + P: AsRef, + { + let mut mutable = Self::with_capacity(iterator.size_hint().0); + mutable.extend(iterator); + mutable + } + + pub fn from_values_iter(iterator: I) -> Self + where + I: Iterator, + P: AsRef, + { + let mut mutable = Self::with_capacity(iterator.size_hint().0); + mutable.extend_values(iterator); + mutable + } +} + +impl> Extend> for MutableBinaryViewArray { + #[inline] + fn extend>>(&mut self, iter: I) { + Self::extend(self, iter.into_iter()) + } +} + +impl> FromIterator> for MutableBinaryViewArray { + #[inline] + fn from_iter>>(iter: I) -> Self { + Self::from_iter(iter.into_iter()) + } +} diff --git a/src/array/binview/view.rs b/src/array/binview/view.rs new file mode 100644 index 00000000000..a24b1f1daec --- /dev/null +++ b/src/array/binview/view.rs @@ -0,0 +1,81 @@ +use polars_error::*; + +use crate::buffer::Buffer; + +pub struct View { + /// The length of the string/bytes. + pub length: u32, + /// First 4 bytes of string/bytes data. + pub prefix: u32, + /// The buffer index. + pub buffer_idx: u32, + /// The offset into the buffer. 
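    /// Only meaningful for non-inlined values. As an illustrative summary of the layout used
    /// below: a view packs `length` (bits 0..32), `prefix` (32..64), `buffer_idx` (64..96) and
    /// `offset` (96..128) into one little-endian `u128`, while values of 12 bytes or fewer
    /// skip the buffer and inline their bytes directly after the length.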
+ pub offset: u32, +} + +impl From for View { + #[inline] + fn from(value: u128) -> Self { + Self { + length: value as u32, + prefix: (value >> 64) as u32, + buffer_idx: (value >> 64) as u32, + offset: (value >> 96) as u32, + } + } +} + +impl From for u128 { + #[inline] + fn from(value: View) -> Self { + value.length as u128 + | ((value.prefix as u128) << 32) + | ((value.buffer_idx as u128) << 64) + | ((value.offset as u128) << 96) + } +} + +fn validate_view(views: &[u128], buffers: &[Buffer], validate_bytes: F) -> PolarsResult<()> +where + F: Fn(&[u8]) -> PolarsResult<()>, +{ + for view in views { + let len = *view as u32; + if len <= 12 { + if len < 12 && view >> (32 + len * 8) != 0 { + polars_bail!(ComputeError: "view contained non-zero padding in prefix"); + } + + validate_bytes(&view.to_le_bytes()[4..4 + len as usize])?; + } else { + let view = View::from(*view); + + let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| { + polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx) + })?; + + let start = view.offset as usize; + let end = start + len as usize; + let b = data + .as_slice() + .get(start..end) + .ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?; + + polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data"); + validate_bytes(b)?; + }; + } + + Ok(()) +} + +pub(super) fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> PolarsResult<()> { + validate_view(views, buffers, |_| Ok(())) +} + +pub(super) fn validate_utf8_view(views: &[u128], buffers: &[Buffer]) -> PolarsResult<()> { + validate_view(views, buffers, |b| match simdutf8::basic::from_utf8(b) { + Ok(_) => Ok(()), + Err(_) => Err(polars_err!(ComputeError: "invalid utf8")), + }) +} diff --git a/src/array/equal/binary_view.rs b/src/array/equal/binary_view.rs new file mode 100644 index 00000000000..546e3e2a181 --- /dev/null +++ b/src/array/equal/binary_view.rs @@ -0,0 +1,9 @@ +use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; +use crate::array::Array; + +pub(super) fn equal( + lhs: &BinaryViewArrayGeneric, + rhs: &BinaryViewArrayGeneric, +) -> bool { + lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) +} diff --git a/src/array/equal/mod.rs b/src/array/equal/mod.rs index 2bb3ba77f1f..19b09c77763 100644 --- a/src/array/equal/mod.rs +++ b/src/array/equal/mod.rs @@ -4,6 +4,7 @@ use crate::types::NativeType; use super::*; mod binary; +mod binary_view; mod boolean; mod dictionary; mod fixed_size_binary; @@ -283,6 +284,16 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); map::equal(lhs, rhs) - } + }, + BinaryView => { + let lhs = lhs.as_any().downcast_ref().unwrap(); + let rhs = rhs.as_any().downcast_ref().unwrap(); + binary_view::equal::<[u8]>(lhs, rhs) + }, + Utf8View => { + let lhs = lhs.as_any().downcast_ref().unwrap(); + let rhs = rhs.as_any().downcast_ref().unwrap(); + binary_view::equal::(lhs, rhs) + }, } } diff --git a/src/array/ffi.rs b/src/array/ffi.rs index 141cab327e4..e6bb31ead25 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -71,6 +71,8 @@ pub fn offset_buffers_children_dictionary(array: &dyn Array) -> BuffersChildren Struct => ffi_dyn!(array, StructArray), Union => ffi_dyn!(array, UnionArray), Map => ffi_dyn!(array, MapArray), + BinaryView => ffi_dyn!(array, BinaryViewArray), + Utf8View => 
ffi_dyn!(array, Utf8ViewArray), Dictionary(key_type) => { match_integer_type!(key_type, |$T| { let array = array.as_any().downcast_ref::>().unwrap(); diff --git a/src/array/fmt.rs b/src/array/fmt.rs index 4f2c6896beb..bb47f66040c 100644 --- a/src/array/fmt.rs +++ b/src/array/fmt.rs @@ -91,6 +91,20 @@ pub fn get_value_display<'a, F: Write + 'a>( Map => Box::new(move |f, index| { super::map::fmt::write_value(array.as_any().downcast_ref().unwrap(), index, null, f) }), + BinaryView => Box::new(move |f, index| { + super::binview::fmt::write_value::<[u8], _>( + array.as_any().downcast_ref().unwrap(), + index, + f, + ) + }), + Utf8View => Box::new(move |f, index| { + super::binview::fmt::write_value::( + array.as_any().downcast_ref().unwrap(), + index, + f, + ) + }), Dictionary(key_type) => match_integer_type!(key_type, |$T| { Box::new(move |f, index| { super::dictionary::fmt::write_value::<$T,_>(array.as_any().downcast_ref().unwrap(), index, null, f) diff --git a/src/array/growable/binview.rs b/src/array/growable/binview.rs new file mode 100644 index 00000000000..40793b1b3a4 --- /dev/null +++ b/src/array/growable/binview.rs @@ -0,0 +1,116 @@ +use std::sync::Arc; + +use super::Growable; +use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; +use crate::array::growable::utils::{extend_validity, prepare_validity}; +use crate::array::Array; +use crate::bitmap::MutableBitmap; +use crate::buffer::Buffer; +use crate::datatypes::ArrowDataType; + +/// Concrete [`Growable`] for the [`BinaryArray`]. +pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> { + arrays: Vec<&'a BinaryViewArrayGeneric>, + data_type: ArrowDataType, + validity: Option, + views: Vec, + buffers: Vec>, +} + +impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { + /// Creates a new [`GrowableBinaryViewArray`] bound to `arrays` with a pre-allocated `capacity`. + /// # Panics + /// If `arrays` is empty. + pub fn new( + arrays: Vec<&'a BinaryViewArrayGeneric>, + mut use_validity: bool, + capacity: usize, + ) -> Self { + let data_type = arrays[0].data_type().clone(); + + // if any of the arrays has nulls, insertions from any array requires setting bits + // as there is at least one array with nulls. 
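        // When `use_validity` stays false, `prepare_validity` returns `None` and no
        // `MutableBitmap` is allocated at all, which is the point of the preceding
        // "don't needlessly allocate validity" change.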
+ if !use_validity & arrays.iter().any(|array| array.null_count() > 0) { + use_validity = true; + }; + + let n_buffers = arrays + .iter() + .map(|binview| binview.data_buffers().len()) + .sum::(); + + Self { + arrays, + data_type, + validity: prepare_validity(use_validity, capacity), + views: Vec::with_capacity(capacity), + buffers: Vec::with_capacity(n_buffers), + } + } + + fn to(&mut self) -> BinaryViewArrayGeneric { + let views = std::mem::take(&mut self.views); + let buffers = std::mem::take(&mut self.buffers); + let validity = self.validity.take(); + unsafe { + BinaryViewArrayGeneric::::new_unchecked( + self.data_type.clone(), + views.into(), + Arc::from(buffers), + validity.map(|v| v.into()), + ) + } + } +} + +impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { + fn extend(&mut self, index: usize, start: usize, len: usize) { + let array = self.arrays[index]; + extend_validity(&mut self.validity, array, start, len); + + let buffer_offset: u32 = self.buffers.len().try_into().expect("unsupported"); + let buffer_offset = (buffer_offset as u128) << 64; + + let range = start..start + len; + self.buffers + .extend_from_slice(&array.data_buffers()[range.clone()]); + self.views.extend(array.views()[range].iter().map(|&view| { + // If null the buffer index is ignored because the length is 0, + // so we can just do this + view + buffer_offset + })); + } + + fn extend_validity(&mut self, additional: usize) { + self.views.extend(std::iter::repeat(0).take(additional)); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } + } + + #[inline] + fn len(&self) -> usize { + self.views.len() + } + + fn as_arc(&mut self) -> Arc { + self.to().arced() + } + + fn as_box(&mut self) -> Box { + self.to().boxed() + } +} + +impl<'a, T: ViewType + ?Sized> From> for BinaryViewArrayGeneric { + fn from(val: GrowableBinaryViewArray<'a, T>) -> Self { + unsafe { + BinaryViewArrayGeneric::::new_unchecked( + val.data_type, + val.views.into(), + Arc::from(val.buffers), + val.validity.map(|v| v.into()), + ) + } + } +} diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index 45f79405307..89706a77b35 100644 --- a/src/array/growable/mod.rs +++ b/src/array/growable/mod.rs @@ -30,6 +30,7 @@ pub use utf8::GrowableUtf8; mod dictionary; pub use dictionary::GrowableDictionary; +mod binview; mod utils; /// Describes a struct that can be extended from slices of other pre-existing [`Array`]s. 
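/// A minimal usage sketch (illustrative; `a` and `b` are assumed to be two boxed arrays of the
/// same data type):
///
/// ```ignore
/// let mut growable = make_growable(&[&*a, &*b], false, a.len() + b.len());
/// growable.extend(0, 0, a.len()); // copy all of `a`
/// growable.extend(1, 0, b.len()); // copy all of `b`
/// let concatenated: Box<dyn Array> = growable.as_box();
/// ```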
@@ -118,14 +119,22 @@ pub fn make_growable<'a>( use_validity, capacity ), - Union => { - let arrays = arrays - .iter() - .map(|array| array.as_any().downcast_ref().unwrap()) - .collect::>(); - Box::new(union::GrowableUnion::new(arrays, capacity)) - } - Map => dyn_growable!(map::GrowableMap, arrays, use_validity, capacity), + BinaryView => { + dyn_growable!( + binview::GrowableBinaryViewArray::<[u8]>, + arrays, + use_validity, + capacity + ) + }, + Utf8View => { + dyn_growable!( + binview::GrowableBinaryViewArray::, + arrays, + use_validity, + capacity + ) + }, Dictionary(key_type) => { match_integer_type!(key_type, |$T| { let arrays = arrays @@ -143,6 +152,14 @@ pub fn make_growable<'a>( capacity, )) }) - } + }, + Map => dyn_growable!(map::GrowableMap, arrays, use_validity, capacity), + Union => { + let arrays = arrays + .iter() + .map(|array| array.as_any().downcast_ref().unwrap()) + .collect::>(); + Box::new(union::GrowableUnion::new(arrays, capacity)) + }, } } diff --git a/src/array/mod.rs b/src/array/mod.rs index 02735c3d0bb..16518e6514c 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -320,6 +320,8 @@ impl std::fmt::Debug for dyn Array + '_ { Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { fmt_dyn!(self, PrimitiveArray<$T>, f) }), + BinaryView => fmt_dyn!(self, Utf8ViewArray, f), + Utf8View => fmt_dyn!(self, BinaryViewArray, f), Binary => fmt_dyn!(self, BinaryArray, f), LargeBinary => fmt_dyn!(self, BinaryArray, f), FixedSizeBinary => fmt_dyn!(self, FixedSizeBinaryArray, f), @@ -360,6 +362,8 @@ pub fn new_empty_array(data_type: DataType) -> Box { Struct => Box::new(StructArray::new_empty(data_type)), Union => Box::new(UnionArray::new_empty(data_type)), Map => Box::new(MapArray::new_empty(data_type)), + Utf8View => Box::new(Utf8ViewArray::new_empty(data_type)), + BinaryView => Box::new(BinaryViewArray::new_empty(data_type)), Dictionary(key_type) => { match_integer_type!(key_type, |$T| { Box::new(DictionaryArray::<$T>::new_empty(data_type)) @@ -390,6 +394,8 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box { Struct => Box::new(StructArray::new_null(data_type, length)), Union => Box::new(UnionArray::new_null(data_type, length)), Map => Box::new(MapArray::new_null(data_type, length)), + BinaryView => Box::new(BinaryViewArray::new_null(data_type, length)), + Utf8View => Box::new(Utf8ViewArray::new_null(data_type, length)), Dictionary(key_type) => { match_integer_type!(key_type, |$T| { Box::new(DictionaryArray::<$T>::new_null(data_type, length)) @@ -472,6 +478,7 @@ pub fn to_data(array: &dyn Array) -> arrow_data::ArrayData { }) } Map => to_data_dyn!(array, MapArray), + BinaryView | Utf8View => todo!(), } } @@ -502,6 +509,7 @@ pub fn from_data(data: &arrow_data::ArrayData) -> Box { }) } Map => Box::new(MapArray::from_data(data)), + BinaryView | Utf8View => todo!(), } } @@ -687,6 +695,8 @@ pub fn clone(array: &dyn Array) -> Box { Struct => clone_dyn!(array, StructArray), Union => clone_dyn!(array, UnionArray), Map => clone_dyn!(array, MapArray), + BinaryView => clone_dyn!(array, BinaryViewArray), + Utf8View => clone_dyn!(array, Utf8ViewArray), Dictionary(key_type) => { match_integer_type!(key_type, |$T| { clone_dyn!(array, DictionaryArray::<$T>) @@ -724,6 +734,7 @@ mod fmt; pub mod indexable; mod iterator; +mod binview; pub mod growable; pub mod ord; @@ -734,6 +745,7 @@ pub use equal::equal; pub use fmt::{get_display, get_value_display}; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; +pub use 
binview::{BinaryViewArray, BinaryViewArrayGeneric, Utf8ViewArray, ViewType}; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; pub use fixed_size_binary::{FixedSizeBinaryArray, MutableFixedSizeBinaryArray}; diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 31834f21657..10802f42d8c 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -2,7 +2,7 @@ use std::hint::unreachable_unchecked; use std::iter::FromIterator; use std::sync::Arc; -use crate::bitmap::utils::{merge_reversed, set_bit_unchecked}; +use crate::bitmap::utils::{get_bit_unchecked, merge_reversed, set_bit_unchecked}; use crate::error::Error; use crate::trusted_len::TrustedLen; @@ -115,7 +115,7 @@ impl MutableBitmap { if self.length % 8 == 0 { self.buffer.push(0); } - let byte = self.buffer.as_mut_slice().last_mut().unwrap(); + let byte = unsafe { self.buffer.as_mut_slice().last_mut().unwrap_unchecked() }; *byte = set(*byte, self.length % 8, value); self.length += 1; } @@ -129,7 +129,7 @@ impl MutableBitmap { } self.length -= 1; - let value = self.get(self.length); + let value = unsafe { self.get_unchecked(self.length) }; if self.length % 8 == 0 { self.buffer.pop(); } @@ -144,6 +144,15 @@ impl MutableBitmap { get_bit(&self.buffer, index) } + /// Returns whether the position `index` is set. + /// + /// # Safety + /// The caller must ensure `index < self.len()`. + #[inline] + pub unsafe fn get_unchecked(&self, index: usize) -> bool { + get_bit_unchecked(&self.buffer, index) + } + /// Sets the position `index` to `value` /// # Panics /// Panics iff `index >= self.len()`. diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 0da4a41ace4..3736b91a5a0 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -1,6 +1,7 @@ use std::{iter::FromIterator, ops::Deref, sync::Arc, usize}; use either::Either; +use num_traits::Zero; use super::Bytes; use super::IntoIter; @@ -270,6 +271,12 @@ impl Buffer { } } +impl Buffer { + pub fn zeroed(len: usize) -> Self { + vec![T::zero(); len].into() + } +} + impl From> for Buffer { #[inline] fn from(p: Vec) -> Self { diff --git a/src/compute/aggregate/memory.rs b/src/compute/aggregate/memory.rs index 7e3218a828a..819cc03e0b7 100644 --- a/src/compute/aggregate/memory.rs +++ b/src/compute/aggregate/memory.rs @@ -22,6 +22,12 @@ macro_rules! dyn_binary { }}; } +fn binview_size(array: &BinaryViewArrayGeneric) -> usize { + array.views().len() * std::mem::size_of::() + + array.data_buffers().iter().map(|b| b.len()).sum::() + + validity_size(array.validity()) +} + /// Returns the total (heap) allocated size of the array in bytes. /// # Implementation /// This estimation is the sum of the size of its buffers, validity, including nested arrays. @@ -109,6 +115,8 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { .unwrap(); estimated_bytes_size(array.keys()) + estimated_bytes_size(array.values().as_ref()) }), + Utf8View => binview_size::(array.as_any().downcast_ref().unwrap()), + BinaryView => binview_size::<[u8]>(array.as_any().downcast_ref().unwrap()), Map => { let array = array.as_any().downcast_ref::().unwrap(); let offsets = array.offsets().len_proxy() * std::mem::size_of::(); diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 626b292ad81..9e35defc431 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -157,7 +157,16 @@ pub enum DataType { /// Decimal backed by 256 bits Decimal256(usize, usize), /// Extension type. 
+ /// - name + /// - physical type + /// - metadata Extension(String, Box, Option), + /// A binary type that inlines small values + /// and can intern bytes. + BinaryView, + /// A string type that inlines small values + /// and can intern strings. + Utf8View, } #[cfg(feature = "arrow")] @@ -213,9 +222,16 @@ impl From for arrow_schema::DataType { Box::new(DataType::from(key).into()), Box::new((*value).into()), ), - DataType::Decimal(precision, scale) => Self::Decimal128(precision as _, scale as _), - DataType::Decimal256(precision, scale) => Self::Decimal256(precision as _, scale as _), + DataType::Decimal(precision, scale) => { + Self::Decimal128(precision as _, scale as _) + }, + DataType::Decimal256(precision, scale) => { + Self::Decimal256(precision as _, scale as _) + }, DataType::Extension(_, d, _) => (*d).into(), + DataType::BinaryView | DataType::Utf8View => { + panic!("view datatypes not supported by arrow-rs") + }, } } } @@ -441,6 +457,8 @@ impl DataType { LargeBinary => PhysicalType::LargeBinary, Utf8 => PhysicalType::Utf8, LargeUtf8 => PhysicalType::LargeUtf8, + BinaryView => PhysicalType::BinaryView, + Utf8View => PhysicalType::Utf8View, List(_) => PhysicalType::List, FixedSizeList(_, _) => PhysicalType::FixedSizeList, LargeList(_) => PhysicalType::LargeList, @@ -462,6 +480,19 @@ impl DataType { _ => self, } } + + pub fn inner_dtype(&self) -> Option<&DataType> { + match self { + DataType::List(inner) => Some(inner.data_type()), + DataType::LargeList(inner) => Some(inner.data_type()), + DataType::FixedSizeList(inner, _) => Some(inner.data_type()), + _ => None, + } + } + + pub fn is_view(&self) -> bool { + matches!(self, DataType::Utf8View | DataType::BinaryView) + } } impl From for DataType { @@ -497,6 +528,7 @@ impl From for DataType { PrimitiveType::Float64 => DataType::Float64, PrimitiveType::DaysMs => DataType::Interval(IntervalUnit::DayTime), PrimitiveType::MonthDayNano => DataType::Interval(IntervalUnit::MonthDayNano), + PrimitiveType::UInt128 => unimplemented!(), } } } diff --git a/src/datatypes/physical_type.rs b/src/datatypes/physical_type.rs index 828df9541f0..d33be774e76 100644 --- a/src/datatypes/physical_type.rs +++ b/src/datatypes/physical_type.rs @@ -39,6 +39,12 @@ pub enum PhysicalType { Map, /// A dictionary encoded array by `IntegerType`. Dictionary(IntegerType), + /// A binary type that inlines small values + /// and can intern bytes. + BinaryView, + /// A string type that inlines small values + /// and can intern strings. 
+ Utf8View, } impl PhysicalType { diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 1a25b98510f..271399ca890 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -43,6 +43,8 @@ pub unsafe fn try_from(array: A) -> Result> { } Union => Box::new(UnionArray::try_from_ffi(array)?), Map => Box::new(MapArray::try_from_ffi(array)?), + BinaryView => Box::new(BinaryViewArray::try_from_ffi(array)?), + Utf8View => Box::new(Utf8ViewArray::try_from_ffi(array)?), }) } diff --git a/src/ffi/bridge.rs b/src/ffi/bridge.rs index 9a098cc8b2c..e27dbeadfb2 100644 --- a/src/ffi/bridge.rs +++ b/src/ffi/bridge.rs @@ -34,6 +34,8 @@ pub fn align_to_c_data_interface(array: Box) -> Box { match_integer_type!(key_type, |$T| { ffi_dyn!(array, DictionaryArray<$T>) }) - } + }, + BinaryView => ffi_dyn!(array, BinaryViewArray), + Utf8View => ffi_dyn!(array, Utf8ViewArray), } } diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index e41de33e436..6b17cb7835c 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -439,6 +439,8 @@ fn to_format(data_type: &DataType) -> String { tz.as_ref().map(|x| x.as_ref()).unwrap_or("") ) } + DataType::Utf8View => "vu".to_string(), + DataType::BinaryView => "vz".to_string(), DataType::Decimal(precision, scale) => format!("d:{precision},{scale}"), DataType::Decimal256(precision, scale) => format!("d:{precision},{scale},256"), DataType::List(_) => "+l".to_string(), diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index 77ced6a5e97..f2bd5054778 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -226,6 +226,7 @@ pub fn read( scratch, ) .map(|x| x.boxed()), + Utf8View | BinaryView => todo!(), } } @@ -249,5 +250,6 @@ pub fn skip( Dictionary(_) => skip_dictionary(field_nodes, buffers), Union => skip_union(field_nodes, data_type, buffers), Map => skip_map(field_nodes, data_type, buffers), + BinaryView | Utf8View => todo!(), } } diff --git a/src/io/ipc/read/schema.rs b/src/io/ipc/read/schema.rs index 7ec87eaa334..93ff1c6e1ea 100644 --- a/src/io/ipc/read/schema.rs +++ b/src/io/ipc/read/schema.rs @@ -353,6 +353,10 @@ fn get_data_type( Struct(_) => deserialize_struct(field)?, Union(union_) => deserialize_union(union_, field)?, Map(map) => deserialize_map(map, field)?, + RunEndEncoded(_) => todo!(), + BinaryView(_) => todo!(), + Utf8View(_) => todo!(), + LargeListView(_) | ListView(_) => todo!(), }) } diff --git a/src/io/ipc/write/common.rs b/src/io/ipc/write/common.rs index 155a0079c67..17bc3128f50 100644 --- a/src/io/ipc/write/common.rs +++ b/src/io/ipc/write/common.rs @@ -167,7 +167,8 @@ fn encode_dictionary( dictionary_tracker, encoded_dictionaries, ) - } + }, + Utf8View | BinaryView => todo!(), } } @@ -243,7 +244,23 @@ fn chunk_to_bytes_amortized( arrow_data.clear(); let mut offset = 0; + let mut variadic_buffer_counts = vec![]; for array in chunk.arrays() { + let dtype = array.data_type(); + if dtype.is_view() { + match dtype { + DataType::Utf8View => { + let array = array.as_any().downcast_ref::().unwrap(); + variadic_buffer_counts.push(array.data_buffers().len() as i64); + }, + DataType::BinaryView => { + let array = array.as_any().downcast_ref::().unwrap(); + variadic_buffer_counts.push(array.data_buffers().len() as i64); + }, + _ => {}, + } + } + write( array.as_ref(), &mut buffers, @@ -255,6 +272,12 @@ fn chunk_to_bytes_amortized( ) } + let variadic_buffer_counts = if variadic_buffer_counts.is_empty() { + None + } else { + Some(variadic_buffer_counts) + }; + let compression = serialize_compression(options.compression); let 
message = arrow_format::ipc::Message { @@ -265,6 +288,7 @@ fn chunk_to_bytes_amortized( nodes: Some(nodes), buffers: Some(buffers), compression, + variadic_buffer_counts, }, ))), body_length: arrow_data.len() as i64, @@ -312,6 +336,7 @@ fn dictionary_batch_to_bytes( nodes: Some(nodes), buffers: Some(buffers), compression, + variadic_buffer_counts: None, })), is_delta: false, }, diff --git a/src/io/ipc/write/schema.rs b/src/io/ipc/write/schema.rs index 1c4dab8e393..465fb8c38bf 100644 --- a/src/io/ipc/write/schema.rs +++ b/src/io/ipc/write/schema.rs @@ -255,6 +255,7 @@ fn serialize_type(data_type: &DataType) -> arrow_format::ipc::Type { Struct(_) => ipc::Type::Struct(Box::new(ipc::Struct {})), Dictionary(_, v, _) => serialize_type(v), Extension(_, v, _) => serialize_type(v), + Utf8View | BinaryView => todo!(), } } @@ -287,6 +288,8 @@ fn serialize_children(data_type: &DataType, ipc_field: &IpcField) -> Vec vec![], FixedSizeList(inner, _) | LargeList(inner) | List(inner) | Map(inner, _) => { vec![serialize_field(inner, &ipc_field.fields[0])] diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index 0e9aa38ab7d..3a9fb56a6af 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -565,7 +565,8 @@ pub fn write( is_little_endian, compression, ); - } + }, + Utf8View | BinaryView => todo!(), } } diff --git a/src/scalar/binview.rs b/src/scalar/binview.rs new file mode 100644 index 00000000000..e96c90c04ad --- /dev/null +++ b/src/scalar/binview.rs @@ -0,0 +1,72 @@ +use std::fmt::{Debug, Formatter}; + +use super::Scalar; +use crate::array::ViewType; +use crate::datatypes::ArrowDataType; + +/// The implementation of [`Scalar`] for utf8, semantically equivalent to [`Option`]. +#[derive(PartialEq, Eq)] +pub struct BinaryViewScalar { + value: Option, + phantom: std::marker::PhantomData, +} + +impl Debug for BinaryViewScalar { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Scalar({:?})", self.value) + } +} + +impl Clone for BinaryViewScalar { + fn clone(&self) -> Self { + Self { + value: self.value.clone(), + phantom: Default::default(), + } + } +} + +impl BinaryViewScalar { + /// Returns a new [`BinaryViewScalar`] + #[inline] + pub fn new(value: Option<&T>) -> Self { + Self { + value: value.map(|x| x.into_owned()), + phantom: std::marker::PhantomData, + } + } + + /// Returns the value irrespectively of the validity. + #[inline] + pub fn value(&self) -> Option<&T> { + self.value.as_ref().map(|x| x.as_ref()) + } +} + +impl From> for BinaryViewScalar { + #[inline] + fn from(v: Option<&T>) -> Self { + Self::new(v) + } +} + +impl Scalar for BinaryViewScalar { + #[inline] + fn as_any(&self) -> &dyn std::any::Any { + self + } + + #[inline] + fn is_valid(&self) -> bool { + self.value.is_some() + } + + #[inline] + fn data_type(&self) -> &ArrowDataType { + if T::IS_UTF8 { + &ArrowDataType::Utf8View + } else { + &ArrowDataType::BinaryView + } + } +} diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs index aab5ed929fa..667ba2b1dc2 100644 --- a/src/scalar/mod.rs +++ b/src/scalar/mod.rs @@ -26,8 +26,11 @@ pub use struct_::*; mod fixed_size_list; pub use fixed_size_list::*; mod fixed_size_binary; +pub use binview::*; pub use fixed_size_binary::*; +mod binview; mod union; + pub use union::UnionScalar; /// Trait object declaring an optional value with a [`DataType`]. @@ -57,6 +60,21 @@ macro_rules! dyn_new_utf8 { }}; } +macro_rules! 
dyn_new_binview { + ($array:expr, $index:expr, $type:ty) => {{ + let array = $array + .as_any() + .downcast_ref::>() + .unwrap(); + let value = if array.is_valid($index) { + Some(array.value($index)) + } else { + None + }; + Box::new(BinaryViewScalar::<$type>::new(value)) + }}; +} + macro_rules! dyn_new_binary { ($array:expr, $index:expr, $type:ty) => {{ let array = $array @@ -110,6 +128,8 @@ pub fn new_scalar(array: &dyn Array, index: usize) -> Box { }; Box::new(PrimitiveScalar::new(array.data_type().clone(), value)) }), + BinaryView => dyn_new_binview!(array, index, [u8]), + Utf8View => dyn_new_binview!(array, index, str), Utf8 => dyn_new_utf8!(array, index, i32), LargeUtf8 => dyn_new_utf8!(array, index, i64), Binary => dyn_new_binary!(array, index, i32), diff --git a/src/types/mod.rs b/src/types/mod.rs index 165e4bd5921..97134e1a76c 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -57,6 +57,8 @@ pub enum PrimitiveType { UInt32, /// An unsigned 64-bit integer. UInt64, + /// An unsigned 128-bit integer. + UInt128, /// A 16-bit floating point number. Float16, /// A 32-bit floating point number. @@ -81,6 +83,7 @@ mod private { impl Sealed for i32 {} impl Sealed for i64 {} impl Sealed for i128 {} + impl Sealed for u128 {} impl Sealed for super::i256 {} impl Sealed for super::f16 {} impl Sealed for f32 {} diff --git a/src/types/native.rs b/src/types/native.rs index 6e50a1454ea..4fecc42fb58 100644 --- a/src/types/native.rs +++ b/src/types/native.rs @@ -86,6 +86,7 @@ native_type!(i64, PrimitiveType::Int64); native_type!(f32, PrimitiveType::Float32); native_type!(f64, PrimitiveType::Float64); native_type!(i128, PrimitiveType::Int128); +native_type!(u128, PrimitiveType::UInt128); /// The in-memory representation of the DayMillisecond variant of arrow's "Interval" logical type. 
#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash, Zeroable, Pod)] From df6bc3474e38864b4edb14ee5092979c94b87dae Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 6 Jan 2024 11:15:22 +0100 Subject: [PATCH 03/25] feat(rust): `BinaryView`/`Utf8View` IPC support (#13464) --- Cargo.toml | 2 +- src/array/binary/ffi.rs | 2 +- src/array/binview/ffi.rs | 7 +- src/array/binview/mod.rs | 11 + src/array/binview/mutable.rs | 4 + src/array/list/ffi.rs | 2 +- src/io/ipc/read/array/binary.rs | 13 +- src/io/ipc/read/array/binview.rs | 82 ++++ src/io/ipc/read/array/boolean.rs | 13 +- src/io/ipc/read/array/fixed_size_binary.rs | 13 +- src/io/ipc/read/array/fixed_size_list.rs | 9 +- src/io/ipc/read/array/list.rs | 15 +- src/io/ipc/read/array/map.rs | 15 +- src/io/ipc/read/array/mod.rs | 26 + src/io/ipc/read/array/null.rs | 18 +- src/io/ipc/read/array/primitive.rs | 13 +- src/io/ipc/read/array/struct_.rs | 9 +- src/io/ipc/read/array/union.rs | 15 +- src/io/ipc/read/array/utf8.rs | 14 +- src/io/ipc/read/common.rs | 7 + src/io/ipc/read/deserialize.rs | 34 +- src/io/ipc/read/schema.rs | 4 +- src/io/ipc/write/common.rs | 3 +- src/io/ipc/write/schema.rs | 3 +- src/io/ipc/write/serialize/binary.rs | 93 ++++ src/io/ipc/write/serialize/binview.rs | 44 ++ src/io/ipc/write/serialize/boolean.rs | 27 ++ src/io/ipc/write/serialize/dictionary.rs | 37 ++ .../ipc/write/serialize/fixed_size_binary.rs | 20 + .../ipc/write/serialize/fixed_sized_list.rs | 29 ++ src/io/ipc/write/serialize/list.rs | 58 +++ src/io/ipc/write/serialize/map.rs | 58 +++ .../write/{serialize.rs => serialize/mod.rs} | 458 ++---------------- src/io/ipc/write/serialize/primitive.rs | 28 ++ src/io/ipc/write/serialize/struct_.rs | 31 ++ src/io/ipc/write/serialize/union.rs | 42 ++ src/io/ipc/write/writer.rs | 6 +- tests/it/io/ipc/mod.rs | 80 +++ 38 files changed, 815 insertions(+), 530 deletions(-) create mode 100644 src/io/ipc/read/array/binview.rs create mode 100644 src/io/ipc/write/serialize/binary.rs create mode 100644 src/io/ipc/write/serialize/binview.rs create mode 100644 src/io/ipc/write/serialize/boolean.rs create mode 100644 src/io/ipc/write/serialize/dictionary.rs create mode 100644 src/io/ipc/write/serialize/fixed_size_binary.rs create mode 100644 src/io/ipc/write/serialize/fixed_sized_list.rs create mode 100644 src/io/ipc/write/serialize/list.rs create mode 100644 src/io/ipc/write/serialize/map.rs rename src/io/ipc/write/{serialize.rs => serialize/mod.rs} (55%) create mode 100644 src/io/ipc/write/serialize/primitive.rs create mode 100644 src/io/ipc/write/serialize/struct_.rs create mode 100644 src/io/ipc/write/serialize/union.rs diff --git a/Cargo.toml b/Cargo.toml index 0c8dc4fd71b..1fe8a281e6c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -184,7 +184,7 @@ io_json_write = ["streaming-iterator", "fallible-streaming-iterator", "lexical-c io_ipc = ["arrow-format"] io_ipc_write_async = ["io_ipc", "futures"] io_ipc_read_async = ["io_ipc", "futures", "async-stream"] -io_ipc_compression = ["lz4", "zstd"] +io_ipc_compression = ["lz4", "zstd", "io_ipc"] io_flight = ["io_ipc", "arrow-format/flight-data"] # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format. 
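A minimal usage sketch (not taken verbatim from this patch) of the IPC round trip that the change below enables for view arrays. It only uses APIs that appear elsewhere in this series, and assumes `SchemaRef` is the `Arc<Schema>` alias used by the new `FileWriter` signature; the tests added at the end of the patch exercise the same path.

use std::io::Cursor;
use std::sync::Arc;

use arrow2::array::Utf8ViewArray;
use arrow2::chunk::Chunk;
use arrow2::datatypes::{Field, Schema, SchemaRef};
use arrow2::error::Result;
use arrow2::io::ipc::read::{read_file_metadata, FileReader};
use arrow2::io::ipc::write::{FileWriter, WriteOptions};

fn roundtrip_view_array() -> Result<()> {
    // Build a small Utf8ViewArray with the `from` constructor added in this series.
    let array =
        Utf8ViewArray::from([Some("foo"), None, Some("a fairly long string value")]).boxed();
    let schema: SchemaRef = Arc::new(Schema::from(vec![Field::new(
        "a",
        array.data_type().clone(),
        true,
    )]));
    let chunk = Chunk::try_new(vec![array])?;

    // Write a single record batch to an in-memory IPC file.
    let options = WriteOptions { compression: None };
    let mut writer = FileWriter::try_new(Vec::<u8>::new(), schema, None, options)?;
    writer.write(&chunk, None)?;
    writer.finish()?;

    // Read it back and check the round trip.
    let mut reader = Cursor::new(writer.into_inner());
    let metadata = read_file_metadata(&mut reader)?;
    let read_back = FileReader::new(reader, metadata, None, None).collect::<Result<Vec<_>>>()?;
    assert_eq!(read_back, vec![chunk]);
    Ok(())
}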
diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 6f971c4226f..9537e63249c 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -62,6 +62,6 @@ impl FromFfi for BinaryArray { // assumption that data from FFI is well constructed let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; - Ok(Self::new(data_type, offsets, values, validity)) + Self::try_new(data_type, offsets, values, validity) } } diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs index 00f9896b9e3..697422d8f10 100644 --- a/src/array/binview/ffi.rs +++ b/src/array/binview/ffi.rs @@ -67,6 +67,11 @@ impl FromFfi for BinaryViewArray buffers.push(values); } - Self::try_new(data_type, views, Arc::from(buffers), validity) + Ok(Self::new_unchecked( + data_type, + views, + Arc::from(buffers), + validity, + )) } } diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 98ed1b6f8e6..9757c96e6fd 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -26,6 +26,7 @@ mod private { use private::Sealed; use crate::array::binview::iterator::BinaryViewValueIter; +use crate::array::binview::mutable::MutableBinaryViewArray; use crate::array::binview::view::{validate_binary_view, validate_utf8_view}; use crate::array::iterator::NonNullValuesIter; use crate::bitmap::utils::{BitmapIter, ZipValidity}; @@ -147,6 +148,10 @@ impl BinaryViewArrayGeneric { self.buffers.as_ref() } + pub fn variadic_buffer_lengths(&self) -> Vec { + self.buffers.iter().map(|buf| buf.len() as i64).collect() + } + pub fn views(&self) -> &Buffer { &self.views } @@ -251,6 +256,12 @@ impl BinaryViewArrayGeneric { impl_sliced!(); impl_mut_validity!(); impl_into_array!(); + + pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + let mutable = + MutableBinaryViewArray::from_iter(slice.as_ref().iter().map(|opt_v| opt_v.as_ref())); + mutable.into() + } } impl Array for BinaryViewArrayGeneric { diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index 5bbd0c170f7..b6fc86f7c5b 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -165,6 +165,10 @@ impl MutableBinaryViewArray { mutable.extend_values(iterator); mutable } + + pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + Self::from_iter(slice.as_ref().iter().map(|opt_v| opt_v.as_ref())) + } } impl> Extend> for MutableBinaryViewArray { diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 2b6be75e782..67bd25f1a9b 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -61,6 +61,6 @@ impl FromFfi for ListArray { // assumption that data from FFI is well constructed let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; - Ok(Self::new(data_type, offsets, values, validity)) + Self::try_new(data_type, offsets, values, validity) } } diff --git a/src/io/ipc/read/array/binary.rs b/src/io/ipc/read/array/binary.rs index 15361da0968..3cfe4a29057 100644 --- a/src/io/ipc/read/array/binary.rs +++ b/src/io/ipc/read/array/binary.rs @@ -5,6 +5,7 @@ use crate::array::BinaryArray; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use crate::offset::Offset; use super::super::read_basic::*; @@ -22,11 +23,7 @@ pub fn read_binary( limit: Option, scratch: &mut Vec, ) -> Result> { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -39,11 +36,7 @@ pub fn read_binary( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let offsets: Buffer = read_buffer( buffers, diff --git a/src/io/ipc/read/array/binview.rs b/src/io/ipc/read/array/binview.rs new file mode 100644 index 00000000000..c9814fc73a8 --- /dev/null +++ b/src/io/ipc/read/array/binview.rs @@ -0,0 +1,82 @@ +use std::collections::VecDeque; +use std::io::{Read, Seek}; +use std::sync::Arc; + +use crate::error::{Error, Result}; + +use super::super::read_basic::*; +use super::*; +use crate::array::{ArrayRef, BinaryViewArrayGeneric, ViewType}; +use crate::buffer::Buffer; +use crate::datatypes::DataType; + +#[allow(clippy::too_many_arguments)] +pub fn read_binview( + field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, + data_type: DataType, + buffers: &mut VecDeque, + reader: &mut R, + block_offset: u64, + is_little_endian: bool, + compression: Option, + limit: Option, + scratch: &mut Vec, +) -> Result { + let field_node = try_get_field_node(field_nodes, &data_type)?; + + let validity = read_validity( + buffers, + field_node, + reader, + block_offset, + is_little_endian, + compression, + limit, + scratch, + )?; + + let length = try_get_array_length(field_node, limit)?; + let views: Buffer = read_buffer( + buffers, + length, + reader, + block_offset, + is_little_endian, + compression, + scratch, + )?; + + let n_variadic = variadic_buffer_counts.pop_front().ok_or_else( + || polars_err!(ComputeError: "IPC: unable to fetch the variadic buffers\n\nThe file or stream is corrupted.") + )?; + + let variadic_buffer_lengths: Buffer = read_buffer( + buffers, + n_variadic, + reader, + block_offset, + is_little_endian, + compression, + scratch, + )?; + + let variadic_buffers = variadic_buffer_lengths + .iter() + .map(|length| { + let length = *length as usize; + read_buffer( + buffers, + length, + reader, + block_offset, + is_little_endian, + compression, + scratch, + ) + }) + .collect::>>>()?; + + BinaryViewArrayGeneric::::try_new(data_type, views, Arc::from(variadic_buffers), validity) + .map(|arr| arr.boxed()) +} diff --git a/src/io/ipc/read/array/boolean.rs b/src/io/ipc/read/array/boolean.rs index dbe40b3194e..00f82ab5780 100644 --- a/src/io/ipc/read/array/boolean.rs +++ b/src/io/ipc/read/array/boolean.rs @@ -4,6 +4,7 @@ use std::io::{Read, Seek}; use crate::array::BooleanArray; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use super::super::read_basic::*; use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; @@ -20,11 +21,7 @@ pub fn read_boolean( limit: Option, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -37,11 +34,7 @@ pub fn read_boolean( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let values = read_bitmap( buffers, diff --git a/src/io/ipc/read/array/fixed_size_binary.rs b/src/io/ipc/read/array/fixed_size_binary.rs index 79ab0586fae..4aba9b82fb1 100644 --- a/src/io/ipc/read/array/fixed_size_binary.rs +++ b/src/io/ipc/read/array/fixed_size_binary.rs @@ -4,6 +4,7 @@ use std::io::{Read, Seek}; use crate::array::FixedSizeBinaryArray; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use super::super::read_basic::*; use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; @@ -20,11 +21,7 @@ pub fn read_fixed_size_binary( limit: Option, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." - )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -37,11 +34,7 @@ pub fn read_fixed_size_binary( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let length = length.saturating_mul(FixedSizeBinaryArray::maybe_get_size(&data_type)?); let values = read_buffer( diff --git a/src/io/ipc/read/array/fixed_size_list.rs b/src/io/ipc/read/array/fixed_size_list.rs index 1f5d919c3cd..76da4470ae5 100644 --- a/src/io/ipc/read/array/fixed_size_list.rs +++ b/src/io/ipc/read/array/fixed_size_list.rs @@ -9,10 +9,12 @@ use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; +use crate::io::ipc::read::array::try_get_field_node; #[allow(clippy::too_many_arguments)] pub fn read_fixed_size_list( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -25,11 +27,7 @@ pub fn read_fixed_size_list( version: Version, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -48,6 +46,7 @@ pub fn read_fixed_size_list( let values = read( field_nodes, + variadic_buffer_counts, field, &ipc_field.fields[0], buffers, diff --git a/src/io/ipc/read/array/list.rs b/src/io/ipc/read/array/list.rs index b6a9ef26155..2cf71cbb34d 100644 --- a/src/io/ipc/read/array/list.rs +++ b/src/io/ipc/read/array/list.rs @@ -12,10 +12,12 @@ use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] pub fn read_list( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -31,11 +33,7 @@ pub fn read_list( where Vec: TryInto, { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." - )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -48,11 +46,7 @@ where scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let offsets = read_buffer::( buffers, @@ -72,6 +66,7 @@ where let values = read( field_nodes, + variadic_buffer_counts, field, &ipc_field.fields[0], buffers, diff --git a/src/io/ipc/read/array/map.rs b/src/io/ipc/read/array/map.rs index b98678e0524..5787e02596b 100644 --- a/src/io/ipc/read/array/map.rs +++ b/src/io/ipc/read/array/map.rs @@ -10,10 +10,12 @@ use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] pub fn read_map( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -26,11 +28,7 @@ pub fn read_map( version: Version, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -43,11 +41,7 @@ pub fn read_map( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let offsets = read_buffer::( buffers, @@ -67,6 +61,7 @@ pub fn read_map( let field = read( field_nodes, + variadic_buffer_counts, field, &ipc_field.fields[0], buffers, diff --git a/src/io/ipc/read/array/mod.rs b/src/io/ipc/read/array/mod.rs index 249e5e05e16..c4e7b237f60 100644 --- a/src/io/ipc/read/array/mod.rs +++ b/src/io/ipc/read/array/mod.rs @@ -1,4 +1,7 @@ mod primitive; + +use std::collections::VecDeque; + pub use primitive::*; mod boolean; pub use boolean::*; @@ -20,5 +23,28 @@ mod dictionary; pub use dictionary::*; mod union; pub use union::*; +mod binview; mod map; +pub use binview::*; pub use map::*; + +use super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use crate::datatypes::DataType; +use crate::error::{Error, Result}; + +fn try_get_field_node<'a>( + field_nodes: &mut VecDeque>, + data_type: &DataType, +) -> Result> { + field_nodes.pop_front().ok_or_else(|| { + polars_err!(ComputeError: "IPC: unable to fetch the field for {:?}\n\nThe file or stream is corrupted.", data_type) + }) +} + +fn try_get_array_length(field_node: Node, limit: Option) -> Result { + let length: usize = field_node + .length() + .try_into() + .map_err(|_| polars_err!(oos = OutOfSpecKind::NegativeFooterLength))?; + Ok(limit.map(|limit| limit.min(length)).unwrap_or(length)) +} diff --git a/src/io/ipc/read/array/null.rs b/src/io/ipc/read/array/null.rs index eee14608d85..c25cc1b1c6d 100644 --- a/src/io/ipc/read/array/null.rs +++ b/src/io/ipc/read/array/null.rs @@ -7,18 +7,16 @@ use crate::{ }; use super::super::{Node, OutOfSpecKind}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; -pub fn read_null(field_nodes: &mut VecDeque, data_type: DataType) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." - )) - })?; +pub fn read_null( + field_nodes: &mut VecDeque, + data_type: DataType, + limit: Option, +) -> Result { + let field_node = try_get_field_node(field_nodes, &data_type)?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; + let length = try_get_array_length(field_node, limit)?; NullArray::try_new(data_type, length) } diff --git a/src/io/ipc/read/array/primitive.rs b/src/io/ipc/read/array/primitive.rs index 0815d2ae7a2..057e6298f19 100644 --- a/src/io/ipc/read/array/primitive.rs +++ b/src/io/ipc/read/array/primitive.rs @@ -7,6 +7,7 @@ use crate::{array::PrimitiveArray, types::NativeType}; use super::super::read_basic::*; use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] pub fn read_primitive( @@ -23,11 +24,7 @@ pub fn read_primitive( where Vec: TryInto, { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -40,11 +37,7 @@ where scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let values = read_buffer( buffers, diff --git a/src/io/ipc/read/array/struct_.rs b/src/io/ipc/read/array/struct_.rs index 06d0f55ad3d..bbd61cf1130 100644 --- a/src/io/ipc/read/array/struct_.rs +++ b/src/io/ipc/read/array/struct_.rs @@ -9,10 +9,12 @@ use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; +use crate::io::ipc::read::array::try_get_field_node; #[allow(clippy::too_many_arguments)] pub fn read_struct( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -25,11 +27,7 @@ pub fn read_struct( version: Version, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." - )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -50,6 +48,7 @@ pub fn read_struct( .map(|(field, ipc_field)| { read( field_nodes, + variadic_buffer_counts, field, ipc_field, buffers, diff --git a/src/io/ipc/read/array/union.rs b/src/io/ipc/read/array/union.rs index 569014797a2..edb22b1c908 100644 --- a/src/io/ipc/read/array/union.rs +++ b/src/io/ipc/read/array/union.rs @@ -10,10 +10,12 @@ use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] pub fn read_union( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, data_type: DataType, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -26,11 +28,7 @@ pub fn read_union( version: Version, scratch: &mut Vec, ) -> Result { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." 
- )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; if version != Version::V5 { let _ = buffers @@ -38,11 +36,7 @@ pub fn read_union( .ok_or_else(|| Error::oos("IPC: missing validity buffer."))?; }; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let types = read_buffer( buffers, @@ -80,6 +74,7 @@ pub fn read_union( .map(|(field, ipc_field)| { read( field_nodes, + variadic_buffer_counts, field, ipc_field, buffers, diff --git a/src/io/ipc/read/array/utf8.rs b/src/io/ipc/read/array/utf8.rs index 741b2b91585..819181e5df1 100644 --- a/src/io/ipc/read/array/utf8.rs +++ b/src/io/ipc/read/array/utf8.rs @@ -5,6 +5,7 @@ use crate::array::Utf8Array; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use crate::offset::Offset; use super::super::read_basic::*; @@ -22,11 +23,7 @@ pub fn read_utf8( limit: Option, scratch: &mut Vec, ) -> Result> { - let field_node = field_nodes.pop_front().ok_or_else(|| { - Error::oos(format!( - "IPC: unable to fetch the field for {data_type:?}. The file or stream is corrupted." - )) - })?; + let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( buffers, @@ -39,12 +36,7 @@ pub fn read_utf8( scratch, )?; - let length: usize = field_node - .length() - .try_into() - .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; - - let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + let length = try_get_array_length(field_node, limit)?; let offsets: Buffer = read_buffer( buffers, diff --git a/src/io/ipc/read/common.rs b/src/io/ipc/read/common.rs index 9a1ea3ce1c3..dc305ac3ad5 100644 --- a/src/io/ipc/read/common.rs +++ b/src/io/ipc/read/common.rs @@ -94,6 +94,11 @@ pub fn read_record_batch( .buffers() .map_err(|err| Error::from(OutOfSpecKind::InvalidFlatbufferBuffers(err)))? .ok_or_else(|| Error::from(OutOfSpecKind::MissingMessageBuffers))?; + let mut variadic_buffer_counts = batch + .variadic_buffer_counts() + .map_err(|err| Error::from(OutOfSpecKind::InvalidFlatbufferRecordBatches(err)))? 
+ .map(|v| v.iter().map(|v| v as usize).collect::>()) + .unwrap_or_else(VecDeque::new); let mut buffers: VecDeque = buffers.iter().collect(); // check that the sum of the sizes of all buffers is <= than the size of the file @@ -128,6 +133,7 @@ pub fn read_record_batch( .map(|maybe_field| match maybe_field { ProjectionResult::Selected((field, ipc_field)) => Ok(Some(read( &mut field_nodes, + &mut variadic_buffer_counts, field, ipc_field, &mut buffers, @@ -156,6 +162,7 @@ pub fn read_record_batch( .map(|(field, ipc_field)| { read( &mut field_nodes, + &mut variadic_buffer_counts, field, ipc_field, &mut buffers, diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index f2bd5054778..af0c6126a9d 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -15,6 +15,7 @@ use super::{IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read( field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, field: &Field, ipc_field: &IpcField, buffers: &mut VecDeque, @@ -31,7 +32,7 @@ pub fn read( let data_type = field.data_type.clone(); match data_type.to_physical_type() { - Null => read_null(field_nodes, data_type).map(|x| x.boxed()), + Null => read_null(field_nodes, data_type, limit).map(|x| x.boxed()), Boolean => read_boolean( field_nodes, data_type, @@ -120,6 +121,7 @@ pub fn read( .map(|x| x.boxed()), List => read_list::( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -135,6 +137,7 @@ pub fn read( .map(|x| x.boxed()), LargeList => read_list::( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -150,6 +153,7 @@ pub fn read( .map(|x| x.boxed()), FixedSizeList => read_fixed_size_list( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -165,6 +169,7 @@ pub fn read( .map(|x| x.boxed()), Struct => read_struct( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -198,6 +203,7 @@ pub fn read( } Union => read_union( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -213,6 +219,7 @@ pub fn read( .map(|x| x.boxed()), Map => read_map( field_nodes, + variadic_buffer_counts, data_type, ipc_field, buffers, @@ -226,7 +233,30 @@ pub fn read( scratch, ) .map(|x| x.boxed()), - Utf8View | BinaryView => todo!(), + Utf8View => read_binview::( + field_nodes, + variadic_buffer_counts, + data_type, + buffers, + reader, + block_offset, + is_little_endian, + compression, + limit, + scratch, + ), + BinaryView => read_binview::<[u8], _>( + field_nodes, + variadic_buffer_counts, + data_type, + buffers, + reader, + block_offset, + is_little_endian, + compression, + limit, + scratch, + ), } } diff --git a/src/io/ipc/read/schema.rs b/src/io/ipc/read/schema.rs index 93ff1c6e1ea..82b68eeaeba 100644 --- a/src/io/ipc/read/schema.rs +++ b/src/io/ipc/read/schema.rs @@ -281,6 +281,8 @@ fn get_data_type( LargeBinary(_) => (DataType::LargeBinary, IpcField::default()), Utf8(_) => (DataType::Utf8, IpcField::default()), LargeUtf8(_) => (DataType::LargeUtf8, IpcField::default()), + BinaryView(_) => (DataType::BinaryView, IpcField::default()), + Utf8View(_) => (DataType::Utf8View, IpcField::default()), FixedSizeBinary(fixed) => ( DataType::FixedSizeBinary( fixed @@ -354,8 +356,6 @@ fn get_data_type( Union(union_) => deserialize_union(union_, field)?, Map(map) => deserialize_map(map, field)?, RunEndEncoded(_) => todo!(), - BinaryView(_) => todo!(), - Utf8View(_) => todo!(), LargeListView(_) | ListView(_) => todo!(), }) } diff --git a/src/io/ipc/write/common.rs 
b/src/io/ipc/write/common.rs index 17bc3128f50..8618f007df4 100644 --- a/src/io/ipc/write/common.rs +++ b/src/io/ipc/write/common.rs @@ -39,7 +39,7 @@ fn encode_dictionary( use PhysicalType::*; match array.data_type().to_physical_type() { Utf8 | LargeUtf8 | Binary | LargeBinary | Primitive(_) | Boolean | Null - | FixedSizeBinary => Ok(()), + | FixedSizeBinary | BinaryView | Utf8View => Ok(()), Dictionary(key_type) => match_integer_type!(key_type, |$T| { let dict_id = field.dictionary_id .ok_or_else(|| Error::InvalidArgumentError("Dictionaries must have an associated id".to_string()))?; @@ -168,7 +168,6 @@ fn encode_dictionary( encoded_dictionaries, ) }, - Utf8View | BinaryView => todo!(), } } diff --git a/src/io/ipc/write/schema.rs b/src/io/ipc/write/schema.rs index 465fb8c38bf..c144c72189b 100644 --- a/src/io/ipc/write/schema.rs +++ b/src/io/ipc/write/schema.rs @@ -255,7 +255,8 @@ fn serialize_type(data_type: &DataType) -> arrow_format::ipc::Type { Struct(_) => ipc::Type::Struct(Box::new(ipc::Struct {})), Dictionary(_, v, _) => serialize_type(v), Extension(_, v, _) => serialize_type(v), - Utf8View | BinaryView => todo!(), + Utf8View => ipc::Type::Utf8View(Box::new(ipc::Utf8View {})), + BinaryView => ipc::Type::BinaryView(Box::new(ipc::BinaryView {})), } } diff --git a/src/io/ipc/write/serialize/binary.rs b/src/io/ipc/write/serialize/binary.rs new file mode 100644 index 00000000000..9642ded1f78 --- /dev/null +++ b/src/io/ipc/write/serialize/binary.rs @@ -0,0 +1,93 @@ +use super::*; + +#[allow(clippy::too_many_arguments)] +fn write_generic_binary( + validity: Option<&Bitmap>, + offsets: &OffsetsBuffer, + values: &[u8], + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + let offsets = offsets.buffer(); + write_bitmap( + validity, + offsets.len() - 1, + buffers, + arrow_data, + offset, + compression, + ); + + let first = *offsets.first().unwrap(); + let last = *offsets.last().unwrap(); + if first == O::default() { + write_buffer( + offsets, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } else { + write_buffer_from_iter( + offsets.iter().map(|x| *x - first), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } + + write_bytes( + &values[first.to_usize()..last.to_usize()], + buffers, + arrow_data, + offset, + compression, + ); +} + +pub(super) fn write_binary( + array: &BinaryArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_generic_binary( + array.validity(), + array.offsets(), + array.values(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); +} + +pub(super) fn write_utf8( + array: &Utf8Array, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_generic_binary( + array.validity(), + array.offsets(), + array.values(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); +} diff --git a/src/io/ipc/write/serialize/binview.rs b/src/io/ipc/write/serialize/binview.rs new file mode 100644 index 00000000000..bcf5d98970d --- /dev/null +++ b/src/io/ipc/write/serialize/binview.rs @@ -0,0 +1,44 @@ +use super::*; +use crate::array; + +#[allow(clippy::too_many_arguments)] +pub(super) fn write_binview( + array: &BinaryViewArrayGeneric, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_bitmap( + 
array.validity(), + array::Array::len(array), + buffers, + arrow_data, + offset, + compression, + ); + + write_buffer( + array.views(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + + let vbl = array.variadic_buffer_lengths(); + write_buffer( + &vbl, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + + for data in array.data_buffers() { + write_bytes(data, buffers, arrow_data, offset, compression); + } +} diff --git a/src/io/ipc/write/serialize/boolean.rs b/src/io/ipc/write/serialize/boolean.rs new file mode 100644 index 00000000000..f699860b89c --- /dev/null +++ b/src/io/ipc/write/serialize/boolean.rs @@ -0,0 +1,27 @@ +use super::*; + +pub(super) fn write_boolean( + array: &BooleanArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + _: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + write_bitmap( + Some(&array.values().clone()), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); +} diff --git a/src/io/ipc/write/serialize/dictionary.rs b/src/io/ipc/write/serialize/dictionary.rs new file mode 100644 index 00000000000..0d1eb96ea7e --- /dev/null +++ b/src/io/ipc/write/serialize/dictionary.rs @@ -0,0 +1,37 @@ +use super::*; + +// use `write_keys` to either write keys or values +#[allow(clippy::too_many_arguments)] +pub fn write_dictionary( + array: &DictionaryArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, + write_keys: bool, +) -> usize { + if write_keys { + write_primitive( + array.keys(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + array.keys().len() + } else { + write( + array.values().as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); + array.values().len() + } +} diff --git a/src/io/ipc/write/serialize/fixed_size_binary.rs b/src/io/ipc/write/serialize/fixed_size_binary.rs new file mode 100644 index 00000000000..dc1e973b4d4 --- /dev/null +++ b/src/io/ipc/write/serialize/fixed_size_binary.rs @@ -0,0 +1,20 @@ +use super::*; + +pub(super) fn write_fixed_size_binary( + array: &FixedSizeBinaryArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + _is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + write_bytes(array.values(), buffers, arrow_data, offset, compression); +} diff --git a/src/io/ipc/write/serialize/fixed_sized_list.rs b/src/io/ipc/write/serialize/fixed_sized_list.rs new file mode 100644 index 00000000000..da8fa7db962 --- /dev/null +++ b/src/io/ipc/write/serialize/fixed_sized_list.rs @@ -0,0 +1,29 @@ +use super::*; + +pub(super) fn write_fixed_size_list( + array: &FixedSizeListArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + write( + array.values().as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); +} diff --git a/src/io/ipc/write/serialize/list.rs b/src/io/ipc/write/serialize/list.rs new file mode 100644 index 00000000000..8cca7eba1b8 --- /dev/null +++ b/src/io/ipc/write/serialize/list.rs @@ -0,0 +1,58 @@ +use super::*; + +pub(super) fn write_list( + array: 
&ListArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + let offsets = array.offsets().buffer(); + let validity = array.validity(); + + write_bitmap( + validity, + offsets.len() - 1, + buffers, + arrow_data, + offset, + compression, + ); + + let first = *offsets.first().unwrap(); + let last = *offsets.last().unwrap(); + if first == O::zero() { + write_buffer( + offsets, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } else { + write_buffer_from_iter( + offsets.iter().map(|x| *x - first), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } + + write( + array + .values() + .sliced(first.to_usize(), last.to_usize() - first.to_usize()) + .as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); +} diff --git a/src/io/ipc/write/serialize/map.rs b/src/io/ipc/write/serialize/map.rs new file mode 100644 index 00000000000..19492679e41 --- /dev/null +++ b/src/io/ipc/write/serialize/map.rs @@ -0,0 +1,58 @@ +use super::*; + +pub(super) fn write_map( + array: &MapArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + let offsets = array.offsets().buffer(); + let validity = array.validity(); + + write_bitmap( + validity, + offsets.len() - 1, + buffers, + arrow_data, + offset, + compression, + ); + + let first = *offsets.first().unwrap(); + let last = *offsets.last().unwrap(); + if first == 0 { + write_buffer( + offsets, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } else { + write_buffer_from_iter( + offsets.iter().map(|x| *x - first), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } + + write( + array + .field() + .sliced(first as usize, last as usize - first as usize) + .as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); +} diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize/mod.rs similarity index 55% rename from src/io/ipc/write/serialize.rs rename to src/io/ipc/write/serialize/mod.rs index 3a9fb56a6af..f252ee619c0 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize/mod.rs @@ -13,419 +13,30 @@ use crate::{ use super::super::compression; use super::super::endianess::is_native_little_endian; use super::common::{pad_to_64, Compression}; - -fn write_primitive( - array: &PrimitiveArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - - write_buffer( - array.values(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ) -} - -fn write_boolean( - array: &BooleanArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - _: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - write_bitmap( - Some(&array.values().clone()), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); -} - -#[allow(clippy::too_many_arguments)] -fn write_generic_binary( - validity: Option<&Bitmap>, - offsets: &OffsetsBuffer, - values: &[u8], - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - let offsets = offsets.buffer(); - 
write_bitmap( - validity, - offsets.len() - 1, - buffers, - arrow_data, - offset, - compression, - ); - - let first = *offsets.first().unwrap(); - let last = *offsets.last().unwrap(); - if first == O::default() { - write_buffer( - offsets, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } else { - write_buffer_from_iter( - offsets.iter().map(|x| *x - first), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } - - write_bytes( - &values[first.to_usize()..last.to_usize()], - buffers, - arrow_data, - offset, - compression, - ); -} - -fn write_binary( - array: &BinaryArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_generic_binary( - array.validity(), - array.offsets(), - array.values(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); -} - -fn write_utf8( - array: &Utf8Array, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_generic_binary( - array.validity(), - array.offsets(), - array.values(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); -} - -fn write_fixed_size_binary( - array: &FixedSizeBinaryArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: &mut i64, - _is_little_endian: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - write_bytes(array.values(), buffers, arrow_data, offset, compression); -} - -fn write_list( - array: &ListArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - let offsets = array.offsets().buffer(); - let validity = array.validity(); - - write_bitmap( - validity, - offsets.len() - 1, - buffers, - arrow_data, - offset, - compression, - ); - - let first = *offsets.first().unwrap(); - let last = *offsets.last().unwrap(); - if first == O::zero() { - write_buffer( - offsets, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } else { - write_buffer_from_iter( - offsets.iter().map(|x| *x - first), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } - - write( - array - .values() - .sliced(first.to_usize(), last.to_usize() - first.to_usize()) - .as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); -} - -pub fn write_struct( - array: &StructArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - array.values().iter().for_each(|array| { - write( - array.as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); - }); -} - -pub fn write_union( - array: &UnionArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_buffer( - array.types(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - - if let Some(offsets) = array.offsets() { - write_buffer( - offsets, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } - array.fields().iter().for_each(|array| { - write( - array.as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - 
compression, - ) - }); -} - -fn write_map( - array: &MapArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - let offsets = array.offsets().buffer(); - let validity = array.validity(); - - write_bitmap( - validity, - offsets.len() - 1, - buffers, - arrow_data, - offset, - compression, - ); - - let first = *offsets.first().unwrap(); - let last = *offsets.last().unwrap(); - if first == 0 { - write_buffer( - offsets, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } else { - write_buffer_from_iter( - offsets.iter().map(|x| *x - first), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - } - - write( - array - .field() - .sliced(first as usize, last as usize - first as usize) - .as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); -} - -fn write_fixed_size_list( - array: &FixedSizeListArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, -) { - write_bitmap( - array.validity(), - array.len(), - buffers, - arrow_data, - offset, - compression, - ); - write( - array.values().as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); -} - -// use `write_keys` to either write keys or values -#[allow(clippy::too_many_arguments)] -pub(super) fn write_dictionary( - array: &DictionaryArray, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: &mut i64, - is_little_endian: bool, - compression: Option, - write_keys: bool, -) -> usize { - if write_keys { - write_primitive( - array.keys(), - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - array.keys().len() - } else { - write( - array.values().as_ref(), - buffers, - arrow_data, - nodes, - offset, - is_little_endian, - compression, - ); - array.values().len() - } -} +use crate::{match_integer_type, with_match_primitive_type_full}; +mod binary; +mod binview; +mod boolean; +mod dictionary; +mod fixed_size_binary; +mod fixed_sized_list; +mod list; +mod map; +mod primitive; +mod struct_; +mod union; + +use binary::*; +use binview::*; +use boolean::*; +pub(super) use dictionary::*; +use fixed_size_binary::*; +use fixed_sized_list::*; +use list::*; +use map::*; +use primitive::*; +use struct_::*; +use union::*; /// Writes an [`Array`] to `arrow_data` pub fn write( @@ -566,14 +177,31 @@ pub fn write( compression, ); }, - Utf8View | BinaryView => todo!(), + Utf8View => write_binview( + array.as_any().downcast_ref::().unwrap(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ), + BinaryView => write_binview( + array.as_any().downcast_ref::().unwrap(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ), } } #[inline] fn pad_buffer_to_64(buffer: &mut Vec, length: usize) { let pad_len = pad_to_64(length); - buffer.extend_from_slice(&vec![0u8; pad_len]); + for _ in 0..pad_len { + buffer.push(0u8); + } } /// writes `bytes` to `arrow_data` updating `buffers` and `offset` and guaranteeing a 8 byte boundary. 
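A minimal sketch of the buffer order that `write_binview` above emits for a view array: validity bitmap, the fixed-width views buffer, the per-buffer lengths added in this PR, then the raw data buffers. Only accessors that appear in this diff are used; the 16-byte (u128) view width is an assumption, not something stated in the patch.

use arrow2::array::Utf8ViewArray;

// Sizes of the three variable parts of a view array's IPC body.
fn ipc_body_sizes(array: &Utf8ViewArray) -> (usize, usize, usize) {
    // One 16-byte view per row; small values are inlined in the view itself.
    let views_bytes = array.views().len() * std::mem::size_of::<u128>();
    // One i64 length per variadic data buffer, written right after the views.
    let n_variadic_lengths = array.variadic_buffer_lengths().len();
    // The out-of-line bytes, written with one `write_bytes` call per buffer.
    let data_bytes: usize = array.data_buffers().iter().map(|b| b.len()).sum();
    (views_bytes, n_variadic_lengths, data_bytes)
}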
diff --git a/src/io/ipc/write/serialize/primitive.rs b/src/io/ipc/write/serialize/primitive.rs new file mode 100644 index 00000000000..acd3ad672f7 --- /dev/null +++ b/src/io/ipc/write/serialize/primitive.rs @@ -0,0 +1,28 @@ +use super::*; + +pub(super) fn write_primitive( + array: &PrimitiveArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + + write_buffer( + array.values(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ) +} diff --git a/src/io/ipc/write/serialize/struct_.rs b/src/io/ipc/write/serialize/struct_.rs new file mode 100644 index 00000000000..67353746d4c --- /dev/null +++ b/src/io/ipc/write/serialize/struct_.rs @@ -0,0 +1,31 @@ +use super::*; + +pub(super) fn write_struct( + array: &StructArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_bitmap( + array.validity(), + array.len(), + buffers, + arrow_data, + offset, + compression, + ); + array.values().iter().for_each(|array| { + write( + array.as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ); + }); +} diff --git a/src/io/ipc/write/serialize/union.rs b/src/io/ipc/write/serialize/union.rs new file mode 100644 index 00000000000..9f0e53fcf67 --- /dev/null +++ b/src/io/ipc/write/serialize/union.rs @@ -0,0 +1,42 @@ +use super::*; + +pub(super) fn write_union( + array: &UnionArray, + buffers: &mut Vec, + arrow_data: &mut Vec, + nodes: &mut Vec, + offset: &mut i64, + is_little_endian: bool, + compression: Option, +) { + write_buffer( + array.types(), + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + + if let Some(offsets) = array.offsets() { + write_buffer( + offsets, + buffers, + arrow_data, + offset, + is_little_endian, + compression, + ); + } + array.fields().iter().for_each(|array| { + write( + array.as_ref(), + buffers, + arrow_data, + nodes, + offset, + is_little_endian, + compression, + ) + }); +} diff --git a/src/io/ipc/write/writer.rs b/src/io/ipc/write/writer.rs index b92f1b2ba86..32ec07b8597 100644 --- a/src/io/ipc/write/writer.rs +++ b/src/io/ipc/write/writer.rs @@ -30,7 +30,7 @@ pub struct FileWriter { /// IPC write options pub(crate) options: WriteOptions, /// A reference to the schema, used in validating record batches - pub(crate) schema: Schema, + pub(crate) schema: SchemaRef, pub(crate) ipc_fields: Vec, /// The number of bytes between each block of bytes, as an offset for random access pub(crate) block_offsets: usize, @@ -50,7 +50,7 @@ impl FileWriter { /// Creates a new [`FileWriter`] and writes the header to `writer` pub fn try_new( writer: W, - schema: Schema, + schema: SchemaRef, ipc_fields: Option>, options: WriteOptions, ) -> Result { @@ -63,7 +63,7 @@ impl FileWriter { /// Creates a new [`FileWriter`]. 
pub fn new( writer: W, - schema: Schema, + schema: SchemaRef, ipc_fields: Option>, options: WriteOptions, ) -> Self { diff --git a/tests/it/io/ipc/mod.rs b/tests/it/io/ipc/mod.rs index 6d3e71c5db4..7ca9a9bf5cc 100644 --- a/tests/it/io/ipc/mod.rs +++ b/tests/it/io/ipc/mod.rs @@ -17,3 +17,83 @@ mod read_stream_async; mod read_file_async; mod mmap; +use std::io::Cursor; +use std::sync::Arc; + +use arrow2::array::*; +use arrow2::chunk::Chunk; +use arrow2::datatypes::{Schema, SchemaRef, Field}; +use arrow2::error::*; +use arrow2::io::ipc::read::{read_file_metadata, FileReader}; +use arrow2::io::ipc::write::*; +use arrow2::io::ipc::IpcField; + +pub(crate) fn write( + batches: &[Chunk>], + schema: &SchemaRef, + ipc_fields: Option>, + compression: Option, +) -> Result> { + let result = vec![]; + let options = WriteOptions { compression }; + let mut writer = FileWriter::try_new(result, schema.clone(), ipc_fields.clone(), options)?; + for batch in batches { + writer.write(batch, ipc_fields.as_ref().map(|x| x.as_ref()))?; + } + writer.finish()?; + Ok(writer.into_inner()) +} + +fn round_trip( + columns: Chunk>, + schema: SchemaRef, + ipc_fields: Option>, + compression: Option, +) -> Result<()> { + let (expected_schema, expected_batches) = (schema.clone(), vec![columns]); + + let result = write(&expected_batches, &schema, ipc_fields, compression)?; + let mut reader = Cursor::new(result); + let metadata = read_file_metadata(&mut reader)?; + let schema = metadata.schema.clone(); + + let reader = FileReader::new(reader, metadata, None, None); + + assert_eq!(schema, expected_schema); + + let batches = reader.collect::>>()?; + + assert_eq!(batches, expected_batches); + Ok(()) +} + +fn prep_schema(array: &dyn Array) -> SchemaRef { + let fields = vec![Field::new("a", array.data_type().clone(), true)]; + Arc::new(Schema::from(fields)) +} + +#[test] +fn write_boolean() -> Result<()> { + let array = BooleanArray::from([Some(true), Some(false), None, Some(true)]).boxed(); + let schema = prep_schema(array.as_ref()); + let columns = Chunk::try_new(vec![array])?; + round_trip(columns, schema, None, Some(Compression::ZSTD)) +} + +#[test] +fn write_sliced_utf8() -> Result<()> { + let array = Utf8Array::::from_slice(["aa", "bb"]) + .sliced(1, 1) + .boxed(); + let schema = prep_schema(array.as_ref()); + let columns = Chunk::try_new(vec![array])?; + round_trip(columns, schema, None, Some(Compression::ZSTD)) +} + +#[test] +fn write_binview() -> Result<()> { + let array = Utf8ViewArray::from([Some("foo"), Some("bar"), None, Some("hamlet")]).boxed(); + let schema = prep_schema(array.as_ref()); + let columns = Chunk::try_new(vec![array])?; + round_trip(columns, schema, None, Some(Compression::ZSTD)) +} From aabbad61915f336f5d7aee2e729d535b426bfd02 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 10 Jan 2024 16:35:03 +0100 Subject: [PATCH 04/25] feat(rust): add `BinaryView` to `parquet` writer/reader. 
(#13489) --- src/array/binview/ffi.rs | 4 +- src/array/binview/fmt.rs | 21 ++-- src/array/binview/mod.rs | 162 +++++++++++++++++++------- src/array/binview/mutable.rs | 73 +++++++++--- src/array/binview/view.rs | 36 +++++- src/array/growable/binview.rs | 19 ++- src/array/mod.rs | 8 +- src/buffer/immutable.rs | 16 ++- src/compute/cast/binview_to.rs | 27 +++++ src/compute/cast/mod.rs | 32 ++++- src/io/ipc/write/serialize/binview.rs | 2 +- 11 files changed, 315 insertions(+), 85 deletions(-) create mode 100644 src/compute/cast/binview_to.rs diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs index 697422d8f10..d5fffc9919a 100644 --- a/src/array/binview/ffi.rs +++ b/src/array/binview/ffi.rs @@ -48,6 +48,8 @@ unsafe impl ToFfi for BinaryViewArrayGeneric { buffers: self.buffers.clone(), raw_buffers: self.raw_buffers.clone(), phantom: Default::default(), + total_bytes_len: self.total_bytes_len, + total_buffer_len: self.total_buffer_len, } } } @@ -67,7 +69,7 @@ impl FromFfi for BinaryViewArray buffers.push(values); } - Ok(Self::new_unchecked( + Ok(Self::new_unchecked_unknown_md( data_type, views, Arc::from(buffers), diff --git a/src/array/binview/fmt.rs b/src/array/binview/fmt.rs index 1337588c61c..53a0f71dd4b 100644 --- a/src/array/binview/fmt.rs +++ b/src/array/binview/fmt.rs @@ -3,7 +3,7 @@ use std::fmt::{Debug, Formatter, Result, Write}; use super::super::fmt::write_vec; use super::BinaryViewArrayGeneric; use crate::array::binview::ViewType; -use crate::array::Array; +use crate::array::{Array, BinaryViewArray, Utf8ViewArray}; pub fn write_value<'a, T: ViewType + ?Sized, W: Write>( array: &'a BinaryViewArrayGeneric, @@ -19,19 +19,18 @@ where write_vec(f, writer, None, bytes.len(), "None", false) } -impl Debug for BinaryViewArrayGeneric -where - for<'a> &'a T: Debug, -{ +impl Debug for BinaryViewArray { fn fmt(&self, f: &mut Formatter<'_>) -> Result { let writer = |f: &mut Formatter, index| write_value(self, index, f); + write!(f, "BinaryViewArray")?; + write_vec(f, writer, self.validity(), self.len(), "None", false) + } +} - let head = if T::IS_UTF8 { - "Utf8ViewArray" - } else { - "BinaryViewArray" - }; - write!(f, "{head}")?; +impl Debug for Utf8ViewArray { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let writer = |f: &mut Formatter, index| write!(f, "{}", self.value(index)); + write!(f, "Utf8ViewArray")?; write_vec(f, writer, self.validity(), self.len(), "None", false) } } diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 9757c96e6fd..9544cdc1018 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -10,12 +10,10 @@ use std::fmt::Debug; use std::marker::PhantomData; use std::sync::Arc; -use polars_error::*; - use crate::array::Array; use crate::bitmap::Bitmap; use crate::buffer::Buffer; -use crate::datatypes::ArrowDataType; +use crate::datatypes::DataType; mod private { pub trait Sealed: Send + Sync {} @@ -23,20 +21,22 @@ mod private { impl Sealed for str {} impl Sealed for [u8] {} } +pub use mutable::MutableBinaryViewArray; use private::Sealed; use crate::array::binview::iterator::BinaryViewValueIter; -use crate::array::binview::mutable::MutableBinaryViewArray; -use crate::array::binview::view::{validate_binary_view, validate_utf8_view}; -use crate::array::iterator::NonNullValuesIter; +use crate::array::binview::view::{ + validate_binary_view, validate_utf8_only_view, validate_utf8_view, +}; use crate::bitmap::utils::{BitmapIter, ZipValidity}; +use crate::error::{Error, Result}; pub type BinaryViewArray = 
BinaryViewArrayGeneric<[u8]>; pub type Utf8ViewArray = BinaryViewArrayGeneric; -pub trait ViewType: Sealed + 'static + PartialEq { +pub trait ViewType: Sealed + 'static + PartialEq + AsRef { const IS_UTF8: bool; - const DATA_TYPE: ArrowDataType; + const DATA_TYPE: DataType; type Owned: Debug + Clone + Sync + Send + AsRef; /// # Safety @@ -51,7 +51,7 @@ pub trait ViewType: Sealed + 'static + PartialEq { impl ViewType for str { const IS_UTF8: bool = true; - const DATA_TYPE: ArrowDataType = ArrowDataType::Utf8View; + const DATA_TYPE: DataType = DataType::Utf8View; type Owned = String; #[inline(always)] @@ -71,7 +71,7 @@ impl ViewType for str { impl ViewType for [u8] { const IS_UTF8: bool = false; - const DATA_TYPE: ArrowDataType = ArrowDataType::BinaryView; + const DATA_TYPE: DataType = DataType::BinaryView; type Owned = Vec; #[inline(always)] @@ -90,13 +90,17 @@ impl ViewType for [u8] { } pub struct BinaryViewArrayGeneric { - data_type: ArrowDataType, + data_type: DataType, views: Buffer, buffers: Arc<[Buffer]>, // Raw buffer access. (pointer, len). raw_buffers: Arc<[(*const u8, usize)]>, validity: Option, phantom: PhantomData, + /// Total bytes length if we would concatenate them all. + total_bytes_len: usize, + /// Total bytes in the buffer (excluding remaining capacity) + total_buffer_len: usize, } impl Clone for BinaryViewArrayGeneric { @@ -108,6 +112,8 @@ impl Clone for BinaryViewArrayGeneric { raw_buffers: self.raw_buffers.clone(), validity: self.validity.clone(), phantom: Default::default(), + total_bytes_len: self.total_bytes_len, + total_buffer_len: self.total_buffer_len, } } } @@ -128,10 +134,12 @@ impl BinaryViewArrayGeneric { /// - the data is valid utf8 (if required) /// - The offsets match the buffers. pub unsafe fn new_unchecked( - data_type: ArrowDataType, + data_type: DataType, views: Buffer, buffers: Arc<[Buffer]>, validity: Option, + total_bytes_len: usize, + total_buffer_len: usize, ) -> Self { let raw_buffers = buffers_into_raw(&buffers); Self { @@ -141,11 +149,34 @@ impl BinaryViewArrayGeneric { raw_buffers, validity, phantom: Default::default(), + total_bytes_len, + total_buffer_len, } } - pub fn data_buffers(&self) -> &[Buffer] { - self.buffers.as_ref() + /// Create a new BinaryViewArray but initialize a statistics compute. 
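// Illustrative aside, not part of the patch: the two statistics added above
// (`total_bytes_len`, `total_buffer_len`) can be recomputed from the raw parts
// when they are unknown: the low 32 bits of every 16-byte view hold the value
// length, and the buffer total is just the sum of the variadic buffer lengths.
// Minimal sketch over plain slices; the helper name is illustrative only.
fn view_totals(views: &[u128], buffers: &[Vec<u8>]) -> (usize, usize) {
    let total_bytes_len: usize = views.iter().map(|v| (*v as u32) as usize).sum();
    let total_buffer_len: usize = buffers.iter().map(|b| b.len()).sum();
    (total_bytes_len, total_buffer_len)
}

fn main() {
    let views = [3u128, 20u128]; // lengths 3 (inlined) and 20 (buffer-backed)
    let buffers = vec![vec![0u8; 20]];
    assert_eq!(view_totals(&views, &buffers), (23, 20));
}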
+ /// # Safety + /// The caller must ensure the invariants + pub unsafe fn new_unchecked_unknown_md( + data_type: DataType, + views: Buffer, + buffers: Arc<[Buffer]>, + validity: Option, + ) -> Self { + let total_bytes_len = views.iter().map(|v| (*v as u32) as usize).sum(); + let total_buffer_len = buffers.iter().map(|b| b.len()).sum(); + Self::new_unchecked( + data_type, + views, + buffers, + validity, + total_bytes_len, + total_buffer_len, + ) + } + + pub fn data_buffers(&self) -> &Arc<[Buffer]> { + &self.buffers } pub fn variadic_buffer_lengths(&self) -> Vec { @@ -157,11 +188,11 @@ impl BinaryViewArrayGeneric { } pub fn try_new( - data_type: ArrowDataType, + data_type: DataType, views: Buffer, buffers: Arc<[Buffer]>, validity: Option, - ) -> PolarsResult { + ) -> Result { if T::IS_UTF8 { validate_utf8_view(views.as_ref(), buffers.as_ref())?; } else { @@ -172,28 +203,33 @@ impl BinaryViewArrayGeneric { polars_ensure!(validity.len()== views.len(), ComputeError: "validity mask length must match the number of values" ) } - let raw_buffers = buffers_into_raw(&buffers); - Ok(Self { - data_type, - views, - buffers, - raw_buffers, - validity, - phantom: Default::default(), - }) + unsafe { + Ok(Self::new_unchecked_unknown_md( + data_type, views, buffers, validity, + )) + } } /// Creates an empty [`BinaryViewArrayGeneric`], i.e. whose `.len` is zero. #[inline] - pub fn new_empty(data_type: ArrowDataType) -> Self { - unsafe { Self::new_unchecked(data_type, Buffer::new(), Arc::from([]), None) } + pub fn new_empty(data_type: DataType) -> Self { + unsafe { Self::new_unchecked(data_type, Buffer::new(), Arc::from([]), None, 0, 0) } } /// Returns a new null [`BinaryViewArrayGeneric`] of `length`. #[inline] - pub fn new_null(data_type: ArrowDataType, length: usize) -> Self { + pub fn new_null(data_type: DataType, length: usize) -> Self { let validity = Some(Bitmap::new_zeroed(length)); - unsafe { Self::new_unchecked(data_type, Buffer::zeroed(length), Arc::from([]), validity) } + unsafe { + Self::new_unchecked( + data_type, + Buffer::zeroed(length), + Arc::from([]), + validity, + 0, + 0, + ) + } } /// Returns the element at index `i` @@ -247,21 +283,70 @@ impl BinaryViewArrayGeneric { BinaryViewValueIter::new(self) } - /// Returns an iterator of the non-null values. - #[inline] - pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric> { - NonNullValuesIter::new(self, self.validity()) - } - impl_sliced!(); impl_mut_validity!(); impl_into_array!(); pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { - let mutable = - MutableBinaryViewArray::from_iter(slice.as_ref().iter().map(|opt_v| opt_v.as_ref())); + let mutable = MutableBinaryViewArray::from_iterator( + slice.as_ref().iter().map(|opt_v| opt_v.as_ref()), + ); mutable.into() } + + /// Get the total length of bytes that it would take to concatenate all binary/str values in this array. + pub fn total_bytes_len(&self) -> usize { + self.total_bytes_len + } + + /// Get the length of bytes that are stored in the variadic buffers. + pub fn total_buffer_len(&self) -> usize { + self.total_buffer_len + } +} + +impl BinaryViewArray { + /// Validate the underlying bytes on UTF-8. + pub fn validate_utf8(&self) -> Result<()> { + validate_utf8_only_view(&self.views, &self.buffers) + } + + /// Convert [`BinaryViewArray`] to [`Utf8ViewArray`]. + pub fn to_utf8view(&self) -> Result { + self.validate_utf8()?; + unsafe { Ok(self.to_utf8view_unchecked()) } + } + + /// Convert [`BinaryViewArray`] to [`Utf8ViewArray`] without checking UTF-8. 
+ /// + /// # Safety + /// The caller must ensure the underlying data is valid UTF-8. + pub unsafe fn to_utf8view_unchecked(&self) -> Utf8ViewArray { + Utf8ViewArray::new_unchecked( + DataType::Utf8View, + self.views.clone(), + self.buffers.clone(), + self.validity.clone(), + self.total_bytes_len, + self.total_buffer_len, + ) + } +} + +impl Utf8ViewArray { + pub fn to_binview(&self) -> BinaryViewArray { + // SAFETY: same invariants. + unsafe { + BinaryViewArray::new_unchecked( + DataType::BinaryView, + self.views.clone(), + self.buffers.clone(), + self.validity.clone(), + self.total_bytes_len, + self.total_buffer_len, + ) + } + } } impl Array for BinaryViewArrayGeneric { @@ -277,7 +362,7 @@ impl Array for BinaryViewArrayGeneric { self.views.len() } - fn data_type(&self) -> &ArrowDataType { + fn data_type(&self) -> &DataType { &self.data_type } @@ -291,7 +376,6 @@ impl Array for BinaryViewArrayGeneric { "the offset of the new Buffer cannot exceed the existing length" ); unsafe { self.slice_unchecked(offset, length) } - todo!() } unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index b6fc86f7c5b..d863c42c456 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -1,10 +1,10 @@ use std::sync::Arc; -use polars_utils::slice::GetSaferUnchecked; - +use crate::array::binview::view::validate_utf8_only_view; use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; use crate::bitmap::MutableBitmap; use crate::buffer::Buffer; +use crate::error::Result; const DEFAULT_BLOCK_SIZE: usize = 8 * 1024; @@ -15,6 +15,10 @@ pub struct MutableBinaryViewArray { in_progress_buffer: Vec, validity: Option, phantom: std::marker::PhantomData, + /// Total bytes length if we would concatenate them all. + total_bytes_len: usize, + /// Total bytes in the buffer (excluding remaining capacity) + total_buffer_len: usize, } impl Default for MutableBinaryViewArray { @@ -25,16 +29,15 @@ impl Default for MutableBinaryViewArray { impl From> for BinaryViewArrayGeneric { fn from(mut value: MutableBinaryViewArray) -> Self { - value - .completed_buffers - .push(std::mem::take(&mut value.in_progress_buffer).into()); - + value.finish_in_progress(); unsafe { Self::new_unchecked( T::DATA_TYPE, value.views.into(), Arc::from(value.completed_buffers), value.validity.map(|b| b.into()), + value.total_bytes_len, + value.total_buffer_len, ) } } @@ -52,9 +55,19 @@ impl MutableBinaryViewArray { in_progress_buffer: vec![], validity: None, phantom: Default::default(), + total_buffer_len: 0, + total_bytes_len: 0, } } + pub fn views(&mut self) -> &mut Vec { + &mut self.views + } + + pub fn validity(&mut self) -> Option<&mut MutableBitmap> { + self.validity.as_mut() + } + /// Reserves `additional` elements and `additional_buffer` on the buffer. 
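// Illustrative aside, not part of the patch: the binary <-> utf8 view conversions
// above reuse the same views and buffers, so the only real cost is the UTF-8
// check. The same "validate once, then reinterpret for free" pattern with plain
// std types; the function name is illustrative only.
fn bytes_to_str(bytes: &[u8]) -> Result<&str, std::str::Utf8Error> {
    // validate the whole slice once ...
    std::str::from_utf8(bytes)?;
    // ... after which the unchecked reinterpretation is a no-op
    Ok(unsafe { std::str::from_utf8_unchecked(bytes) })
}

fn main() {
    assert_eq!(bytes_to_str(b"hamlet").unwrap(), "hamlet");
    assert!(bytes_to_str(&[0xff, 0xfe]).is_err());
}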
pub fn reserve(&mut self, additional: usize) { self.views.reserve(additional); @@ -71,13 +84,10 @@ impl MutableBinaryViewArray { self.validity = Some(validity); } - pub fn push_value>(&mut self, value: V) { - if let Some(validity) = &mut self.validity { - validity.push(true) - } - + pub fn push_value_ignore_validity>(&mut self, value: V) { let value = value.as_ref(); let bytes = value.to_bytes(); + self.total_bytes_len += bytes.len(); let len: u32 = bytes.len().try_into().unwrap(); let mut payload = [0; 16]; payload[0..4].copy_from_slice(&len.to_le_bytes()); @@ -85,6 +95,7 @@ impl MutableBinaryViewArray { if len <= 12 { payload[4..4 + bytes.len()].copy_from_slice(bytes); } else { + self.total_buffer_len += bytes.len(); let required_cap = self.in_progress_buffer.len() + bytes.len(); if self.in_progress_buffer.capacity() < required_cap { let new_capacity = (self.in_progress_buffer.capacity() * 2) @@ -108,15 +119,26 @@ impl MutableBinaryViewArray { self.views.push(value); } + pub fn push_value>(&mut self, value: V) { + if let Some(validity) = &mut self.validity { + validity.push(true) + } + self.push_value_ignore_validity(value) + } + pub fn push>(&mut self, value: Option) { if let Some(value) = value { self.push_value(value) } else { - self.views.push(0); - match &mut self.validity { - Some(validity) => validity.push(false), - None => self.init_validity(), - } + self.push_null() + } + } + + pub fn push_null(&mut self) { + self.views.push(0); + match &mut self.validity { + Some(validity) => validity.push(false), + None => self.init_validity(), } } @@ -146,7 +168,7 @@ impl MutableBinaryViewArray { } } - pub fn from_iter(iterator: I) -> Self + pub fn from_iterator(iterator: I) -> Self where I: Iterator>, P: AsRef, @@ -167,7 +189,20 @@ impl MutableBinaryViewArray { } pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { - Self::from_iter(slice.as_ref().iter().map(|opt_v| opt_v.as_ref())) + Self::from_iterator(slice.as_ref().iter().map(|opt_v| opt_v.as_ref())) + } + + fn finish_in_progress(&mut self) { + if !self.in_progress_buffer.is_empty() { + self.completed_buffers + .push(std::mem::take(&mut self.in_progress_buffer).into()); + } + } +} + +impl MutableBinaryViewArray<[u8]> { + pub fn validate_utf8(&mut self) -> Result<()> { + validate_utf8_only_view(&self.views, &self.completed_buffers) } } @@ -181,6 +216,6 @@ impl> Extend> for MutableBinaryViewA impl> FromIterator> for MutableBinaryViewArray { #[inline] fn from_iter>>(iter: I) -> Self { - Self::from_iter(iter.into_iter()) + Self::from_iterator(iter.into_iter()) } } diff --git a/src/array/binview/view.rs b/src/array/binview/view.rs index a24b1f1daec..49d855612a5 100644 --- a/src/array/binview/view.rs +++ b/src/array/binview/view.rs @@ -1,6 +1,5 @@ -use polars_error::*; - use crate::buffer::Buffer; +use crate::error::{Error, Result}; pub struct View { /// The length of the string/bytes. 
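// Illustrative aside, not part of the patch: the 16-byte "view" encoding used by
// the push path above. Bits 0..32 hold the length; values of at most 12 bytes are
// inlined in the remaining bytes, longer values store a 4-byte prefix plus a
// (buffer index, offset) pair pointing into the variadic buffers. Minimal sketch
// of packing such a view; `pack_view` is an illustrative name, not a crate API.
fn pack_view(bytes: &[u8], buffer_idx: u32, offset: u32) -> u128 {
    let len = bytes.len() as u32;
    let mut payload = [0u8; 16];
    payload[0..4].copy_from_slice(&len.to_le_bytes());
    if bytes.len() <= 12 {
        // short value: the data lives inline in the view itself
        payload[4..4 + bytes.len()].copy_from_slice(bytes);
    } else {
        // long value: store a prefix plus its location in the variadic buffers
        payload[4..8].copy_from_slice(&bytes[0..4]);
        payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes());
        payload[12..16].copy_from_slice(&offset.to_le_bytes());
    }
    u128::from_le_bytes(payload)
}

fn main() {
    let short = pack_view(b"abc", 0, 0);
    assert_eq!(short as u32, 3);

    let long = pack_view(b"a value longer than twelve bytes", 1, 64);
    assert_eq!(long as u32, 32); // length
    assert_eq!((long >> 64) as u32, 1); // buffer index
    assert_eq!((long >> 96) as u32, 64); // offset
}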
@@ -73,9 +72,34 @@ pub(super) fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Po validate_view(views, buffers, |_| Ok(())) } -pub(super) fn validate_utf8_view(views: &[u128], buffers: &[Buffer]) -> PolarsResult<()> { - validate_view(views, buffers, |b| match simdutf8::basic::from_utf8(b) { +fn validate_utf8(b: &[u8]) -> Result<()> { + match simdutf8::basic::from_utf8(b) { Ok(_) => Ok(()), - Err(_) => Err(polars_err!(ComputeError: "invalid utf8")), - }) + Err(e) => Err(Error::InvalidArgumentError(format!( + "Encountered non-UTF-8 data {e}" + ))) + } +} + +pub(super) fn validate_utf8_view(views: &[u128], buffers: &[Buffer]) -> Result<()> { + validate_view(views, buffers, validate_utf8) +} + +pub(super) fn validate_utf8_only_view(views: &[u128], buffers: &[Buffer]) -> Result<()> { + for view in views { + let len = *view as u32; + if len <= 12 { + validate_utf8(&view.to_le_bytes()[4..4 + len as usize])?; + } else { + let view = View::from(*view); + let data = &buffers[view.buffer_idx as usize]; + + let start = view.offset as usize; + let end = start + len as usize; + let b = &data.as_slice()[start..end]; + validate_utf8(b)?; + }; + } + + Ok(()) } diff --git a/src/array/growable/binview.rs b/src/array/growable/binview.rs index 40793b1b3a4..d13474d99cd 100644 --- a/src/array/growable/binview.rs +++ b/src/array/growable/binview.rs @@ -15,6 +15,8 @@ pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> { validity: Option, views: Vec, buffers: Vec>, + total_bytes_len: usize, + total_buffer_len: usize, } impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { @@ -45,6 +47,8 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { validity: prepare_validity(use_validity, capacity), views: Vec::with_capacity(capacity), buffers: Vec::with_capacity(n_buffers), + total_bytes_len: 0, + total_buffer_len: 0, } } @@ -58,6 +62,8 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { views.into(), Arc::from(buffers), validity.map(|v| v.into()), + self.total_bytes_len, + self.total_buffer_len, ) } } @@ -72,9 +78,16 @@ impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { let buffer_offset = (buffer_offset as u128) << 64; let range = start..start + len; - self.buffers - .extend_from_slice(&array.data_buffers()[range.clone()]); + let buffers_range = &array.data_buffers()[range.clone()]; + self.buffers.extend_from_slice(buffers_range); + + for b in buffers_range { + self.total_buffer_len += b.len(); + } + self.views.extend(array.views()[range].iter().map(|&view| { + self.total_bytes_len += (view as u32) as usize; + // If null the buffer index is ignored because the length is 0, // so we can just do this view + buffer_offset @@ -110,6 +123,8 @@ impl<'a, T: ViewType + ?Sized> From> for BinaryVi val.views.into(), Arc::from(val.buffers), val.validity.map(|v| v.into()), + val.total_bytes_len, + val.total_buffer_len, ) } } diff --git a/src/array/mod.rs b/src/array/mod.rs index 16518e6514c..6a31ed5efc0 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -320,8 +320,8 @@ impl std::fmt::Debug for dyn Array + '_ { Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { fmt_dyn!(self, PrimitiveArray<$T>, f) }), - BinaryView => fmt_dyn!(self, Utf8ViewArray, f), - Utf8View => fmt_dyn!(self, BinaryViewArray, f), + BinaryView => fmt_dyn!(self, BinaryViewArray, f), + Utf8View => fmt_dyn!(self, Utf8ViewArray, f), Binary => fmt_dyn!(self, BinaryArray, f), LargeBinary => fmt_dyn!(self, BinaryArray, f), FixedSizeBinary => fmt_dyn!(self, 
FixedSizeBinaryArray, f), @@ -745,7 +745,9 @@ pub use equal::equal; pub use fmt::{get_display, get_value_display}; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; -pub use binview::{BinaryViewArray, BinaryViewArrayGeneric, Utf8ViewArray, ViewType}; +pub use binview::{ + BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, Utf8ViewArray, ViewType, +}; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; pub use fixed_size_binary::{FixedSizeBinaryArray, MutableFixedSizeBinaryArray}; diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 3736b91a5a0..917dcc383ae 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -3,8 +3,8 @@ use std::{iter::FromIterator, ops::Deref, sync::Arc, usize}; use either::Either; use num_traits::Zero; -use super::Bytes; -use super::IntoIter; +use super::{Bytes, IntoIter}; +use crate::array::ArrayAccessor; /// [`Buffer`] is a contiguous memory region that can be shared across /// thread boundaries. @@ -331,3 +331,15 @@ impl From> for arrow_buffer::Buffer { ) } } + +unsafe impl<'a, T: 'a> ArrayAccessor<'a> for Buffer { + type Item = &'a T; + + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.as_slice().get_unchecked(index) + } + + fn len(&self) -> usize { + Buffer::len(self) + } +} diff --git a/src/compute/cast/binview_to.rs b/src/compute/cast/binview_to.rs new file mode 100644 index 00000000000..cf1759669f5 --- /dev/null +++ b/src/compute/cast/binview_to.rs @@ -0,0 +1,27 @@ +use crate::array::*; +use crate::offset::Offset; + +pub(super) fn view_to_binary(array: &BinaryViewArray) -> BinaryArray { + let len: usize = Array::len(array); + let mut mutable = MutableBinaryValuesArray::::with_capacities(len, len * 12); + for slice in array.values_iter() { + mutable.push(slice) + } + let out: BinaryArray = mutable.into(); + out.with_validity(array.validity().cloned()) +} + +pub(super) fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array { + let array = array.to_binview(); + let out = view_to_binary::(&array); + + let dtype = Utf8Array::::default_data_type(); + unsafe { + Utf8Array::new_unchecked( + dtype, + out.offsets().clone(), + out.values().clone(), + out.validity().cloned(), + ) + } +} diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 688291dd12b..7c66d674604 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -1,6 +1,7 @@ //! Defines different casting operators such as [`cast`] or [`primitive_to_binary`]. 
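// Illustrative aside, not part of the patch: `view_to_binary` above materializes a
// view array into the classic offsets + values layout of `BinaryArray`. The same
// idea over plain byte slices: offsets are a running sum of lengths and values are
// the concatenated bytes. The helper name is illustrative only.
fn to_offsets_and_values(items: &[&[u8]]) -> (Vec<i64>, Vec<u8>) {
    let mut offsets = Vec::with_capacity(items.len() + 1);
    let mut values = Vec::new();
    offsets.push(0i64);
    for item in items {
        values.extend_from_slice(item);
        offsets.push(values.len() as i64);
    }
    (offsets, values)
}

fn main() {
    let items: [&[u8]; 3] = [b"aa", b"", b"hamlet"];
    let (offsets, values) = to_offsets_and_values(&items);
    assert_eq!(offsets, vec![0, 2, 2, 8]);
    assert_eq!(values, b"aahamlet".to_vec());
}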
mod binary_to; +mod binview_to; mod boolean_to; mod decimal_to; mod dictionary_to; @@ -565,7 +566,36 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - + (Utf8View, _) => match to_type { + BinaryView => Ok(array + .as_any() + .downcast_ref::() + .unwrap() + .to_binview() + .boxed()), + LargeUtf8 => Ok(binview_to::utf8view_to_utf8::( + array.as_any().downcast_ref().unwrap(), + ) + .boxed()), + _ => Err(Error::NotYetImplemented(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), + }, + (BinaryView, _) => match to_type { + BinaryView => array + .as_any() + .downcast_ref::() + .unwrap() + .to_utf8view() + .map(|arr| arr.boxed()), + LargeBinary => Ok(binview_to::view_to_binary::( + array.as_any().downcast_ref().unwrap(), + ) + .boxed()), + _ => Err(Error::NotYetImplemented(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), + }, (Utf8, _) => match to_type { UInt8 => utf8_to_primitive_dyn::(array, to_type, options), UInt16 => utf8_to_primitive_dyn::(array, to_type, options), diff --git a/src/io/ipc/write/serialize/binview.rs b/src/io/ipc/write/serialize/binview.rs index bcf5d98970d..a9ebff6dd17 100644 --- a/src/io/ipc/write/serialize/binview.rs +++ b/src/io/ipc/write/serialize/binview.rs @@ -38,7 +38,7 @@ pub(super) fn write_binview( compression, ); - for data in array.data_buffers() { + for data in array.data_buffers().as_ref() { write_bytes(data, buffers, arrow_data, offset, compression); } } From fec11f0989c582c0483455de865632037eb068e4 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 14 Jan 2024 10:31:02 +0100 Subject: [PATCH 05/25] feat: implement binview comparison kernels (#13715) --- src/array/binview/mod.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 9544cdc1018..7a7904ac43f 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -303,6 +303,11 @@ impl BinaryViewArrayGeneric { pub fn total_buffer_len(&self) -> usize { self.total_buffer_len } + + #[inline(always)] + pub fn len(&self) -> usize { + self.views.len() + } } impl BinaryViewArray { @@ -358,8 +363,9 @@ impl Array for BinaryViewArrayGeneric { self } + #[inline(always)] fn len(&self) -> usize { - self.views.len() + BinaryViewArrayGeneric::len(self) } fn data_type(&self) -> &DataType { From 733e1cdfb7d0280c312590753fbf9d73758b0cc5 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Mon, 15 Jan 2024 16:19:16 +0100 Subject: [PATCH 06/25] perf: directly embed data ptr in Buffer (#13744) --- src/array/binary/ffi.rs | 4 +- src/array/binview/ffi.rs | 4 +- src/array/binview/mod.rs | 4 +- src/array/fixed_size_binary/ffi.rs | 2 +- src/array/list/ffi.rs | 2 +- src/array/map/ffi.rs | 2 +- src/array/primitive/ffi.rs | 2 +- src/array/union/ffi.rs | 6 +- src/array/utf8/ffi.rs | 4 +- src/buffer/immutable.rs | 123 +++++++++++------------------ 10 files changed, 63 insertions(+), 90 deletions(-) diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 9537e63249c..598aa87074d 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -13,8 +13,8 @@ unsafe impl ToFfi for BinaryArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.buffer().as_ptr().cast::()), - Some(self.values.as_ptr().cast::()), + Some(self.offsets.buffer().storage_ptr().cast::()), + Some(self.values.storage_ptr().cast::()), ] } diff --git 
a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs index d5fffc9919a..71e1e56abf5 100644 --- a/src/array/binview/ffi.rs +++ b/src/array/binview/ffi.rs @@ -12,8 +12,8 @@ unsafe impl ToFfi for BinaryViewArrayGeneric { fn buffers(&self) -> Vec> { let mut buffers = Vec::with_capacity(self.buffers.len() + 2); buffers.push(self.validity.as_ref().map(|x| x.as_ptr())); - buffers.push(Some(self.views.as_ptr().cast::())); - buffers.extend(self.buffers.iter().map(|b| Some(b.as_ptr()))); + buffers.push(Some(self.views.storage_ptr().cast::())); + buffers.extend(self.buffers.iter().map(|b| Some(b.storage_ptr()))); buffers } diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 7a7904ac43f..e29e094a636 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -124,7 +124,7 @@ unsafe impl Sync for BinaryViewArrayGeneric {} fn buffers_into_raw(buffers: &[Buffer]) -> Arc<[(*const T, usize)]> { buffers .iter() - .map(|buf| (buf.as_ptr(), buf.len())) + .map(|buf| (buf.storage_ptr(), buf.len())) .collect() } @@ -260,7 +260,7 @@ impl BinaryViewArrayGeneric { // data: 12 bytes let bytes = if len <= 12 { - let ptr = self.views.as_ptr() as *const u8; + let ptr = self.views.storage_ptr() as *const u8; std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) } else { let buffer_idx = (v >> 64) as u32; diff --git a/src/array/fixed_size_binary/ffi.rs b/src/array/fixed_size_binary/ffi.rs index 444f3c3996e..d749944e020 100644 --- a/src/array/fixed_size_binary/ffi.rs +++ b/src/array/fixed_size_binary/ffi.rs @@ -11,7 +11,7 @@ unsafe impl ToFfi for FixedSizeBinaryArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.values.as_ptr().cast::()), + Some(self.values.storage_ptr().cast::()), ] } diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 67bd25f1a9b..8edcb1ef4c9 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -9,7 +9,7 @@ unsafe impl ToFfi for ListArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.buffer().as_ptr().cast::()), + Some(self.offsets.buffer().storage_ptr().cast::()), ] } diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index 09920419c21..7da6c73b0cb 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -7,7 +7,7 @@ unsafe impl ToFfi for MapArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.buffer().as_ptr().cast::()), + Some(self.offsets.buffer().storage_ptr().cast::()), ] } diff --git a/src/array/primitive/ffi.rs b/src/array/primitive/ffi.rs index de5d6a70584..93c3939938d 100644 --- a/src/array/primitive/ffi.rs +++ b/src/array/primitive/ffi.rs @@ -13,7 +13,7 @@ unsafe impl ToFfi for PrimitiveArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.values.as_ptr().cast::()), + Some(self.values.storage_ptr().cast::()), ] } diff --git a/src/array/union/ffi.rs b/src/array/union/ffi.rs index 89cee93e4d3..87403cf1a3c 100644 --- a/src/array/union/ffi.rs +++ b/src/array/union/ffi.rs @@ -7,11 +7,11 @@ unsafe impl ToFfi for UnionArray { fn buffers(&self) -> Vec> { if let Some(offsets) = &self.offsets { vec![ - Some(self.types.as_ptr().cast::()), - Some(offsets.as_ptr().cast::()), + Some(self.types.storage_ptr().cast::()), + Some(offsets.storage_ptr().cast::()), ] } else { - vec![Some(self.types.as_ptr().cast::())] + vec![Some(self.types.storage_ptr().cast::())] } } diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 
3611678da57..99ffe75faf2 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -12,8 +12,8 @@ unsafe impl ToFfi for Utf8Array { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.buffer().as_ptr().cast::()), - Some(self.values.as_ptr().cast::()), + Some(self.offsets.buffer().storage_ptr().cast::()), + Some(self.values.storage_ptr().cast::()), ] } diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 917dcc383ae..e9268a87854 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -37,17 +37,19 @@ use crate::array::ArrayAccessor; /// ``` #[derive(Clone)] pub struct Buffer { - /// the internal byte buffer. - data: Arc>, + /// The internal byte buffer. + storage: Arc>, - /// The offset into the buffer. - offset: usize, + /// A pointer into the buffer where our data starts. + ptr: *const T, - // the length of the buffer. Given a region `data` of N bytes, [offset..offset+length] is visible - // to this buffer. + // The length of the buffer. length: usize, } +unsafe impl Sync for Buffer {} +unsafe impl Send for Buffer {} + impl PartialEq for Buffer { #[inline] fn eq(&self, other: &Self) -> bool { @@ -77,10 +79,11 @@ impl Buffer { /// Auxiliary method to create a new Buffer pub(crate) fn from_bytes(bytes: Bytes) -> Self { + let ptr = bytes.as_ptr(); let length = bytes.len(); Buffer { - data: Arc::new(bytes), - offset: 0, + storage: Arc::new(bytes), + ptr, length, } } @@ -94,14 +97,14 @@ impl Buffer { /// Returns whether the buffer is empty. #[inline] pub fn is_empty(&self) -> bool { - self.len() == 0 + self.length == 0 } /// Returns whether underlying data is sliced. /// If sliced the [`Buffer`] is backed by /// more data than the length of `Self`. pub fn is_sliced(&self) -> bool { - self.data.len() != self.length + self.storage.len() != self.length } /// Returns the byte slice stored in this buffer @@ -109,11 +112,8 @@ impl Buffer { pub fn as_slice(&self) -> &[T] { // Safety: // invariant of this struct `offset + length <= data.len()` - debug_assert!(self.offset + self.length <= self.data.len()); - unsafe { - self.data - .get_unchecked(self.offset..self.offset + self.length) - } + debug_assert!(self.offset() + self.length <= self.storage.len()); + unsafe { std::slice::from_raw_parts(self.ptr, self.length) } } /// Returns the byte slice stored in this buffer @@ -124,7 +124,7 @@ impl Buffer { // Safety: // invariant of this function debug_assert!(index < self.length); - unsafe { self.data.get_unchecked(self.offset + index) } + unsafe { &*self.ptr.add(index) } } /// Returns a new [`Buffer`] that is a slice of this buffer starting at `offset`. @@ -170,20 +170,24 @@ impl Buffer { /// The caller must ensure `offset + length <= self.len()` #[inline] pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { - self.offset += offset; + self.ptr = self.ptr.add(offset); self.length = length; } - /// Returns a pointer to the start of this buffer. + /// Returns a pointer to the start of the storage underlying this buffer. #[inline] - pub(crate) fn as_ptr(&self) -> *const T { - self.data.deref().as_ptr() + pub(crate) fn storage_ptr(&self) -> *const T { + self.storage.as_ptr() } - /// Returns the offset of this buffer. + /// Returns the start offset of this buffer within the underlying storage. 
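// Illustrative aside, not part of the patch: a stripped-down model of the change
// above, where the slice start is kept as a raw pointer into the shared storage
// instead of an integer offset, and the offset is recomputed on demand via
// `offset_from`. All names here are stand-ins, not the crate's `Buffer` API.
use std::sync::Arc;

struct SharedBytes {
    storage: Arc<Vec<u8>>,
    ptr: *const u8,
    length: usize,
}

impl SharedBytes {
    fn new(data: Vec<u8>) -> Self {
        let storage = Arc::new(data);
        let ptr = storage.as_ptr();
        let length = storage.len();
        Self { storage, ptr, length }
    }

    fn as_slice(&self) -> &[u8] {
        unsafe { std::slice::from_raw_parts(self.ptr, self.length) }
    }

    // offset of this slice within the underlying storage
    fn offset(&self) -> usize {
        unsafe { self.ptr.offset_from(self.storage.as_ptr()) as usize }
    }

    /// # Safety
    /// `offset + length` must not exceed the current length.
    unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
        self.ptr = unsafe { self.ptr.add(offset) };
        self.length = length;
    }
}

fn main() {
    let mut buf = SharedBytes::new((0u8..16).collect());
    unsafe { buf.slice_unchecked(4, 8) };
    assert_eq!(buf.offset(), 4);
    assert_eq!(buf.as_slice(), &[4, 5, 6, 7, 8, 9, 10, 11]);
}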
#[inline] pub fn offset(&self) -> usize { - self.offset + unsafe { + let ret = self.ptr.offset_from(self.storage.as_ptr()) as usize; + debug_assert!(ret <= self.storage.len()); + ret + } } /// # Safety @@ -197,10 +201,14 @@ impl Buffer { /// /// This operation returns [`Either::Right`] iff this [`Buffer`]: /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) - /// * has not been imported from the c data interface (FFI) + /// * has not been imported from the C data interface (FFI) #[inline] pub fn into_mut(mut self) -> Either> { - match Arc::get_mut(&mut self.data) + // We lose information if the data is sliced. + if self.is_sliced() { + return Either::Left(self); + } + match Arc::get_mut(&mut self.storage) .and_then(|b| b.get_vec()) .map(std::mem::take) { @@ -209,65 +217,27 @@ impl Buffer { } } - /// Returns a mutable reference to its underlying `Vec`, if possible. - /// Note that only `[self.offset(), self.offset() + self.len()[` in this vector is visible - /// by this buffer. - /// - /// This operation returns [`Some`] iff this [`Buffer`]: - /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) - /// * has not been imported from the c data interface (FFI) - /// # Safety - /// The caller must ensure that the vector in the mutable reference keeps a length of at least `self.offset() + self.len() - 1`. - #[inline] - pub unsafe fn get_mut(&mut self) -> Option<&mut Vec> { - Arc::get_mut(&mut self.data).and_then(|b| b.get_vec()) - } - /// Returns a mutable reference to its slice, if possible. /// /// This operation returns [`Some`] iff this [`Buffer`]: /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) - /// * has not been imported from the c data interface (FFI) + /// * has not been imported from the C data interface (FFI) #[inline] pub fn get_mut_slice(&mut self) -> Option<&mut [T]> { - Arc::get_mut(&mut self.data) - .and_then(|b| b.get_vec()) - // Safety: the invariant of this struct - .map(|x| unsafe { x.get_unchecked_mut(self.offset..self.offset + self.length) }) + let offset = self.offset(); + let unique = Arc::get_mut(&mut self.storage)?; + let vec = unique.get_vec()?; + Some(unsafe { vec.get_unchecked_mut(offset..offset + self.length) }) } /// Get the strong count of underlying `Arc` data buffer. pub fn shared_count_strong(&self) -> usize { - Arc::strong_count(&self.data) + Arc::strong_count(&self.storage) } /// Get the weak count of underlying `Arc` data buffer. pub fn shared_count_weak(&self) -> usize { - Arc::weak_count(&self.data) - } - - /// Returns its internal representation - #[must_use] - pub fn into_inner(self) -> (Arc>, usize, usize) { - let Self { - data, - offset, - length, - } = self; - (data, offset, length) - } - - /// Creates a `[Bitmap]` from its internal representation. - /// This is the inverted from `[Bitmap::into_inner]` - /// - /// # Safety - /// Callers must ensure all invariants of this struct are upheld. 
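// Illustrative aside, not part of the patch: `into_mut` above only hands back the
// inner vector when the storage is uniquely owned (and, after this change, not
// sliced). The same ownership check with plain std: `Arc::try_unwrap` succeeds
// only when the refcount is exactly one.
use std::sync::Arc;

fn reclaim(shared: Arc<Vec<u8>>) -> Result<Vec<u8>, Arc<Vec<u8>>> {
    Arc::try_unwrap(shared)
}

fn main() {
    let unique = Arc::new(vec![1u8, 2, 3]);
    assert!(reclaim(unique).is_ok()); // sole owner: the Vec is handed back

    let shared = Arc::new(vec![1u8, 2, 3]);
    let _other = shared.clone();
    assert!(reclaim(shared).is_err()); // still shared: caller keeps the Arc
}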
- pub unsafe fn from_inner_unchecked(data: Arc>, offset: usize, length: usize) -> Self { - Self { - data, - offset, - length, - } + Arc::weak_count(&self.storage) } } @@ -281,10 +251,12 @@ impl From> for Buffer { #[inline] fn from(p: Vec) -> Self { let bytes: Bytes = p.into(); + let ptr = bytes.as_ptr(); + let length = bytes.len(); Self { - offset: 0, - length: bytes.len(), - data: Arc::new(bytes), + storage: Arc::new(bytes), + ptr, + length, } } } @@ -325,8 +297,9 @@ impl From for Buffer { #[cfg(feature = "arrow")] impl From> for arrow_buffer::Buffer { fn from(value: Buffer) -> Self { - crate::buffer::to_buffer(value.data).slice_with_length( - value.offset * std::mem::size_of::(), + let offset = value.offset(); + crate::buffer::to_buffer(value.storage).slice_with_length( + offset * std::mem::size_of::(), value.length * std::mem::size_of::(), ) } @@ -336,7 +309,7 @@ unsafe impl<'a, T: 'a> ArrayAccessor<'a> for Buffer { type Item = &'a T; unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { - self.as_slice().get_unchecked(index) + unsafe { &*self.ptr.add(index) } } fn len(&self) -> usize { From 0f5b93a4b839b3e443601dd3e56090867163206c Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 19 Jan 2024 09:48:14 +0100 Subject: [PATCH 07/25] feat: new implementation for `String/Binary` type. (#13748) --- src/array/binview/mod.rs | 73 +++++++++++- src/array/binview/mutable.rs | 163 ++++++++++++++++++++++++++- src/array/binview/view.rs | 5 +- src/array/growable/binview.rs | 70 ++++++++---- src/array/growable/mod.rs | 1 + src/array/mod.rs | 3 +- src/array/primitive/mod.rs | 14 +++ src/array/primitive/mutable.rs | 4 + src/compute/arithmetics/basic/mod.rs | 1 + src/compute/cast/binary_to.rs | 5 + src/compute/cast/binview_to.rs | 89 ++++++++++++++- src/compute/cast/mod.rs | 162 +++++++++++++++++++------- src/compute/cast/primitive_to.rs | 24 ++++ src/compute/cast/utf8_to.rs | 54 ++++++++- src/compute/filter.rs | 16 +++ src/io/ipc/write/common.rs | 37 ++++++ src/temporal_conversions.rs | 44 +++++--- src/trusted_len.rs | 69 +++++++++++- tests/it/io/ipc/mod.rs | 2 +- 19 files changed, 734 insertions(+), 102 deletions(-) diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index e29e094a636..2b54b8b9cb5 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -21,12 +21,12 @@ mod private { impl Sealed for str {} impl Sealed for [u8] {} } +pub use iterator::BinaryViewValueIter; pub use mutable::MutableBinaryViewArray; use private::Sealed; -use crate::array::binview::iterator::BinaryViewValueIter; use crate::array::binview::view::{ - validate_binary_view, validate_utf8_only_view, validate_utf8_view, + validate_binary_view, validate_utf8_only, validate_utf8_view, }; use crate::bitmap::utils::{BitmapIter, ZipValidity}; use crate::error::{Error, Result}; @@ -34,6 +34,12 @@ use crate::error::{Error, Result}; pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>; pub type Utf8ViewArray = BinaryViewArrayGeneric; +pub type MutablePlString = MutableBinaryViewArray; +pub type MutablePlBinary = MutableBinaryViewArray<[u8]>; + +static BIN_VIEW_TYPE: DataType = DataType::BinaryView; +static UTF8_VIEW_TYPE: DataType = DataType::Utf8View; + pub trait ViewType: Sealed + 'static + PartialEq + AsRef { const IS_UTF8: bool; const DATA_TYPE: DataType; @@ -47,6 +53,8 @@ pub trait ViewType: Sealed + 'static + PartialEq + AsRef { #[allow(clippy::wrong_self_convention)] fn into_owned(&self) -> Self::Owned; + + fn dtype() -> &'static DataType; } impl ViewType for str { @@ -67,6 +75,9 @@ 
impl ViewType for str { fn into_owned(&self) -> Self::Owned { self.to_string() } + fn dtype() -> &'static DataType { + &UTF8_VIEW_TYPE + } } impl ViewType for [u8] { @@ -87,6 +98,10 @@ impl ViewType for [u8] { fn into_owned(&self) -> Self::Owned { self.to_vec() } + + fn dtype() -> &'static DataType { + &BIN_VIEW_TYPE + } } pub struct BinaryViewArrayGeneric { @@ -103,6 +118,12 @@ pub struct BinaryViewArrayGeneric { total_buffer_len: usize, } +impl PartialEq for BinaryViewArrayGeneric { + fn eq(&self, other: &Self) -> bool { + self.into_iter().zip(other).all(|(l, r)| l == r) + } +} + impl Clone for BinaryViewArrayGeneric { fn clone(&self) -> Self { Self { @@ -260,7 +281,7 @@ impl BinaryViewArrayGeneric { // data: 12 bytes let bytes = if len <= 12 { - let ptr = self.views.storage_ptr() as *const u8; + let ptr = self.views.as_ptr() as *const u8; std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) } else { let buffer_idx = (v >> 64) as u32; @@ -283,17 +304,27 @@ impl BinaryViewArrayGeneric { BinaryViewValueIter::new(self) } + pub fn len_iter(&self) -> impl Iterator + '_ { + self.views.iter().map(|v| *v as u32) + } + impl_sliced!(); impl_mut_validity!(); impl_into_array!(); - pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + pub fn from_slice, P: AsRef<[Option]>>(slice: P) -> Self { let mutable = MutableBinaryViewArray::from_iterator( slice.as_ref().iter().map(|opt_v| opt_v.as_ref()), ); mutable.into() } + pub fn from_slice_values, P: AsRef<[S]>>(slice: P) -> Self { + let mutable = + MutableBinaryViewArray::from_values_iter(slice.as_ref().iter().map(|v| v.as_ref())); + mutable.into() + } + /// Get the total length of bytes that it would take to concatenate all binary/str values in this array. pub fn total_bytes_len(&self) -> usize { self.total_bytes_len @@ -308,12 +339,40 @@ impl BinaryViewArrayGeneric { pub fn len(&self) -> usize { self.views.len() } + + /// Garbage collect + pub fn gc(self) -> Self { + if self.buffers.is_empty() { + return self; + } + let mut mutable = MutableBinaryViewArray::with_capacity(self.len()); + let buffers = self.raw_buffers.as_ref(); + + for view in self.views.as_ref() { + unsafe { mutable.push_view(*view, buffers) } + } + mutable.freeze().with_validity(self.validity) + } + + pub fn maybe_gc(self) -> Self { + if self.total_buffer_len == 0 { + return self; + } + // Subtract the maximum amount of inlined strings. + let min_in_buffer = self.total_bytes_len.saturating_sub(self.len() * 12); + let frac = (min_in_buffer as f64) / ((self.total_buffer_len() + 1) as f64); + + if frac < 0.25 { + return self.gc(); + } + self + } } impl BinaryViewArray { /// Validate the underlying bytes on UTF-8. pub fn validate_utf8(&self) -> Result<()> { - validate_utf8_only_view(&self.views, &self.buffers) + validate_utf8_only(&self.views, &self.buffers) } /// Convert [`BinaryViewArray`] to [`Utf8ViewArray`]. 
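// Illustrative aside, not part of the patch: the `maybe_gc` heuristic above only
// compacts when little of the variadic buffer data is actually reachable. Sketch
// of the decision using the two tracked statistics: values of at most 12 bytes
// are inlined, so at least `total_bytes_len - len * 12` bytes must live in the
// buffers; if that is under roughly 25% of the buffer bytes, rewriting pays off.
// `should_gc` is an illustrative name, not a crate API.
fn should_gc(len: usize, total_bytes_len: usize, total_buffer_len: usize) -> bool {
    if total_buffer_len == 0 {
        return false;
    }
    let min_in_buffer = total_bytes_len.saturating_sub(len * 12);
    let frac = min_in_buffer as f64 / (total_buffer_len + 1) as f64;
    frac < 0.25
}

fn main() {
    // 100 values referencing ~2 KB, but keeping 100 KB of buffers alive
    // (e.g. after a slice): worth rewriting into fresh, tight buffers.
    assert!(should_gc(100, 2_000, 100_000));
    // Dense array: most buffer bytes are referenced, leave it alone.
    assert!(!should_gc(100, 90_000, 100_000));
}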
@@ -369,7 +428,7 @@ impl Array for BinaryViewArrayGeneric { } fn data_type(&self) -> &DataType { - &self.data_type + T::dtype() } fn validity(&self) -> Option<&Bitmap> { @@ -385,12 +444,14 @@ impl Array for BinaryViewArrayGeneric { } unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { + debug_assert!(offset + length <= self.len()); self.validity = self .validity .take() .map(|bitmap| bitmap.sliced_unchecked(offset, length)) .filter(|bitmap| bitmap.unset_bits() > 0); self.views.slice_unchecked(offset, length); + self.total_bytes_len = self.len_iter().map(|v| v as usize).sum::(); } fn with_validity(&self, validity: Option) -> Box { diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index d863c42c456..9667b278ba8 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -1,14 +1,18 @@ +use std::any::Any; +use std::fmt::{Debug, Formatter}; use std::sync::Arc; -use crate::array::binview::view::validate_utf8_only_view; +use crate::array::binview::view::validate_utf8_only; use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; +use crate::array::{Array, MutableArray}; use crate::bitmap::MutableBitmap; use crate::buffer::Buffer; use crate::error::Result; +use crate::datatypes::DataType; +use crate::trusted_len::TrustedLen; const DEFAULT_BLOCK_SIZE: usize = 8 * 1024; -#[derive(Debug, Clone)] pub struct MutableBinaryViewArray { views: Vec, completed_buffers: Vec>, @@ -21,6 +25,26 @@ pub struct MutableBinaryViewArray { total_buffer_len: usize, } +impl Clone for MutableBinaryViewArray { + fn clone(&self) -> Self { + Self { + views: self.views.clone(), + completed_buffers: self.completed_buffers.clone(), + in_progress_buffer: self.in_progress_buffer.clone(), + validity: self.validity.clone(), + phantom: Default::default(), + total_bytes_len: self.total_bytes_len, + total_buffer_len: self.total_buffer_len, + } + } +} + +impl Debug for MutableBinaryViewArray { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "mutable-binview{:?}", T::DATA_TYPE) + } +} + impl Default for MutableBinaryViewArray { fn default() -> Self { Self::with_capacity(0) @@ -73,17 +97,48 @@ impl MutableBinaryViewArray { self.views.reserve(additional); } + #[inline] pub fn len(&self) -> usize { self.views.len() } - fn init_validity(&mut self) { + #[inline] + pub fn capacity(&self) -> usize { + self.views.capacity() + } + + fn init_validity(&mut self, unset_last: bool) { let mut validity = MutableBitmap::with_capacity(self.views.capacity()); validity.extend_constant(self.len(), true); - validity.set(self.len() - 1, false); + if unset_last { + validity.set(self.len() - 1, false); + } self.validity = Some(validity); } + /// # Safety + /// - caller must allocate enough capacity + /// - caller must ensure the view and buffers match. 
+ #[inline] + pub unsafe fn push_view(&mut self, v: u128, buffers: &[(*const u8, usize)]) { + let len = v as u32; + self.total_bytes_len += len as usize; + if len <= 12 { + debug_assert!(self.views.capacity() > self.views.len()); + self.views.push(v) + } else { + self.total_buffer_len += len as usize; + let buffer_idx = (v >> 64) as u32; + let offset = (v >> 96) as u32; + let (data_ptr, data_len) = *buffers.get_unchecked(buffer_idx as usize); + let data = std::slice::from_raw_parts(data_ptr, data_len); + let offset = offset as usize; + let bytes = data.get_unchecked(offset..offset + len as usize); + let t = T::from_bytes_unchecked(bytes); + self.push_value_ignore_validity(t) + } + } + pub fn push_value_ignore_validity>(&mut self, value: V) { let value = value.as_ref(); let bytes = value.to_bytes(); @@ -138,10 +193,42 @@ impl MutableBinaryViewArray { self.views.push(0); match &mut self.validity { Some(validity) => validity.push(false), - None => self.init_validity(), + None => self.init_validity(true), } } + pub fn extend_null(&mut self, additional: usize) { + if self.validity.is_none() && additional > 0 { + self.init_validity(false); + } + self.views.extend(std::iter::repeat(0).take(additional)); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } + } + + pub fn extend_constant>(&mut self, additional: usize, value: Option) { + if value.is_none() && self.validity.is_none() { + self.init_validity(false); + } + + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, value.is_some()) + } + + // Push and pop to get the properly encoded value. + // For long string this leads to a dictionary encoding, + // as we push the string only once in the buffers + let view_value = value + .map(|v| { + self.push_value_ignore_validity(v); + self.views.pop().unwrap() + }) + .unwrap_or(0); + self.views + .extend(std::iter::repeat(view_value).take(additional)); + } + impl_mutable_array_mut_validity!(); #[inline] @@ -156,6 +243,15 @@ impl MutableBinaryViewArray { } } + #[inline] + pub fn extend_trusted_len_values(&mut self, iterator: I) + where + I: TrustedLen, + P: AsRef, + { + self.extend_values(iterator) + } + #[inline] pub fn extend(&mut self, iterator: I) where @@ -168,6 +264,16 @@ impl MutableBinaryViewArray { } } + #[inline] + pub fn extend_trusted_len(&mut self, iterator: I) + where + I: TrustedLen>, + P: AsRef, + { + self.extend(iterator) + } + + #[inline] pub fn from_iterator(iterator: I) -> Self where I: Iterator>, @@ -198,11 +304,16 @@ impl MutableBinaryViewArray { .push(std::mem::take(&mut self.in_progress_buffer).into()); } } + + #[inline] + pub fn freeze(self) -> BinaryViewArrayGeneric { + self.into() + } } impl MutableBinaryViewArray<[u8]> { pub fn validate_utf8(&mut self) -> Result<()> { - validate_utf8_only_view(&self.views, &self.completed_buffers) + validate_utf8_only(&self.views, &self.completed_buffers) } } @@ -219,3 +330,43 @@ impl> FromIterator> for MutableBinar Self::from_iterator(iter.into_iter()) } } + +impl MutableArray for MutableBinaryViewArray { + fn data_type(&self) -> &DataType { + T::dtype() + } + + fn len(&self) -> usize { + MutableBinaryViewArray::len(self) + } + + fn validity(&self) -> Option<&MutableBitmap> { + self.validity.as_ref() + } + + fn as_box(&mut self) -> Box { + let mutable = std::mem::take(self); + let arr: BinaryViewArrayGeneric = mutable.into(); + arr.boxed() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn push_null(&mut 
self) { + MutableBinaryViewArray::push_null(self) + } + + fn reserve(&mut self, additional: usize) { + MutableBinaryViewArray::reserve(self, additional) + } + + fn shrink_to_fit(&mut self) { + self.views.shrink_to_fit() + } +} diff --git a/src/array/binview/view.rs b/src/array/binview/view.rs index 49d855612a5..d0ba31f464f 100644 --- a/src/array/binview/view.rs +++ b/src/array/binview/view.rs @@ -1,6 +1,7 @@ use crate::buffer::Buffer; use crate::error::{Error, Result}; +#[derive(Debug)] pub struct View { /// The length of the string/bytes. pub length: u32, @@ -17,7 +18,7 @@ impl From for View { fn from(value: u128) -> Self { Self { length: value as u32, - prefix: (value >> 64) as u32, + prefix: (value >> 32) as u32, buffer_idx: (value >> 64) as u32, offset: (value >> 96) as u32, } @@ -85,7 +86,7 @@ pub(super) fn validate_utf8_view(views: &[u128], buffers: &[Buffer]) -> Resu validate_view(views, buffers, validate_utf8) } -pub(super) fn validate_utf8_only_view(views: &[u128], buffers: &[Buffer]) -> Result<()> { +pub(super) fn validate_utf8_only(views: &[u128], buffers: &[Buffer]) -> Result<()> { for view in views { let len = *view as u32; if len <= 12 { diff --git a/src/array/growable/binview.rs b/src/array/growable/binview.rs index d13474d99cd..a4f4b1099ed 100644 --- a/src/array/growable/binview.rs +++ b/src/array/growable/binview.rs @@ -15,6 +15,7 @@ pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> { validity: Option, views: Vec, buffers: Vec>, + buffers_offsets: Vec, total_bytes_len: usize, total_buffer_len: usize, } @@ -36,9 +37,24 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { use_validity = true; }; - let n_buffers = arrays + let mut cum_sum = 0; + let cum_offset = arrays .iter() - .map(|binview| binview.data_buffers().len()) + .map(|binview| { + let out = cum_sum; + cum_sum += binview.data_buffers().len() as u32; + out + }) + .collect::>(); + + let buffers = arrays + .iter() + .flat_map(|array| array.data_buffers().as_ref()) + .cloned() + .collect::>(); + let total_buffer_len = arrays + .iter() + .map(|arr| arr.data_buffers().len()) .sum::(); Self { @@ -46,9 +62,10 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { data_type, validity: prepare_validity(use_validity, capacity), views: Vec::with_capacity(capacity), - buffers: Vec::with_capacity(n_buffers), + buffers, + buffers_offsets: cum_offset, total_bytes_len: 0, - total_buffer_len: 0, + total_buffer_len, } } @@ -65,33 +82,39 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { self.total_bytes_len, self.total_buffer_len, ) + .maybe_gc() } } -} -impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { - fn extend(&mut self, index: usize, start: usize, len: usize) { - let array = self.arrays[index]; - extend_validity(&mut self.validity, array, start, len); + /// # Safety + /// doesn't check bounds + pub unsafe fn extend_unchecked(&mut self, index: usize, start: usize, len: usize) { + let array = *self.arrays.get_unchecked(index); - let buffer_offset: u32 = self.buffers.len().try_into().expect("unsupported"); - let buffer_offset = (buffer_offset as u128) << 64; + extend_validity(&mut self.validity, array, start, len); let range = start..start + len; - let buffers_range = &array.data_buffers()[range.clone()]; - self.buffers.extend_from_slice(buffers_range); - for b in buffers_range { - self.total_buffer_len += b.len(); - } - - self.views.extend(array.views()[range].iter().map(|&view| { - self.total_bytes_len += (view as u32) as usize; + self.views + 
.extend(array.views().get_unchecked(range).iter().map(|&view| { + let len = (view as u32) as usize; + self.total_bytes_len += len; + + if len > 12 { + let buffer_offset = *self.buffers_offsets.get_unchecked(index); + let mask = (u32::MAX as u128) << 64; + (view & !mask) | ((buffer_offset as u128) << 64) + } else { + view + } + })); + } +} - // If null the buffer index is ignored because the length is 0, - // so we can just do this - view + buffer_offset - })); +impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { + fn extend(&mut self, index: usize, start: usize, len: usize) { + assert!(index < self.arrays.len()); + unsafe { self.extend_unchecked(index, start, len) } } fn extend_validity(&mut self, additional: usize) { @@ -126,6 +149,7 @@ impl<'a, T: ViewType + ?Sized> From> for BinaryVi val.total_bytes_len, val.total_buffer_len, ) + .maybe_gc() } } } diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index 89706a77b35..2da64e4f5b4 100644 --- a/src/array/growable/mod.rs +++ b/src/array/growable/mod.rs @@ -31,6 +31,7 @@ mod dictionary; pub use dictionary::GrowableDictionary; mod binview; +pub use binview::GrowableBinaryViewArray; mod utils; /// Describes a struct that can be extended from slices of other pre-existing [`Array`]s. diff --git a/src/array/mod.rs b/src/array/mod.rs index 6a31ed5efc0..5e5a7e2fb26 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -746,7 +746,8 @@ pub use fmt::{get_display, get_value_display}; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; pub use binview::{ - BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, Utf8ViewArray, ViewType, + BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, MutablePlBinary, + MutablePlString, Utf8ViewArray, ViewType, }; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 04b74a3529b..a5bfc3b3b86 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -96,6 +96,20 @@ impl PrimitiveArray { }) } + /// # Safety + /// Doesn't check invariants + pub unsafe fn new_unchecked( + data_type: DataType, + values: Buffer, + validity: Option, + ) -> Self { + Self { + data_type, + values, + validity, + } + } + /// Returns a new [`PrimitiveArray`] with a different logical type. /// /// This function is useful to assign a different [`DataType`] to the array. 
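// Illustrative aside, not part of the patch: when several view arrays are grown
// into one, their variadic buffers are concatenated up front, so copied views that
// reference a buffer (length > 12) need their buffer-index field (bits 64..96)
// rewritten against the combined buffer list. A sketch of that field surgery on
// the packed u128 representation; `with_buffer_idx` is an illustrative name.
fn with_buffer_idx(view: u128, buffer_idx: u32) -> u128 {
    let mask = (u32::MAX as u128) << 64;
    (view & !mask) | ((buffer_idx as u128) << 64)
}

fn main() {
    // a view with length 20, buffer index 0, offset 128
    let view = 20u128 | (0u128 << 64) | (128u128 << 96);
    // after being appended behind an array that already contributed 3 buffers
    let moved = with_buffer_idx(view, 3);
    assert_eq!(moved as u32, 20); // length untouched
    assert_eq!((moved >> 64) as u32, 3); // buffer index rewritten
    assert_eq!((moved >> 96) as u32, 128); // offset untouched
}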
diff --git a/src/array/primitive/mutable.rs b/src/array/primitive/mutable.rs index 4432ab2e33f..e2f3a4c2f52 100644 --- a/src/array/primitive/mutable.rs +++ b/src/array/primitive/mutable.rs @@ -286,6 +286,10 @@ impl MutablePrimitiveArray { pub fn capacity(&self) -> usize { self.values.capacity() } + + pub fn freeze(self) -> PrimitiveArray { + self.into() + } } /// Accessors diff --git a/src/compute/arithmetics/basic/mod.rs b/src/compute/arithmetics/basic/mod.rs index 22ed09baf6e..e78531fba7a 100644 --- a/src/compute/arithmetics/basic/mod.rs +++ b/src/compute/arithmetics/basic/mod.rs @@ -39,6 +39,7 @@ impl NativeArithmetics for i8 {} impl NativeArithmetics for i16 {} impl NativeArithmetics for i32 {} impl NativeArithmetics for i64 {} +impl NativeArithmetics for i128 {} impl NativeArithmetics for f32 {} impl NativeArithmetics for f64 {} diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index 82f827e3f61..e171a0c9098 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -145,6 +145,11 @@ pub fn fixed_size_binary_binary( ) } +pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray { + let mutable = MutableBinaryViewArray::from_values_iter(from.values_iter()); + mutable.freeze().with_validity(from.validity().cloned()) +} + /// Conversion of binary pub fn binary_to_list(from: &BinaryArray, to_data_type: DataType) -> ListArray { let values = from.values().clone(); diff --git a/src/compute/cast/binview_to.rs b/src/compute/cast/binview_to.rs index cf1759669f5..f3c0a7de2b7 100644 --- a/src/compute/cast/binview_to.rs +++ b/src/compute/cast/binview_to.rs @@ -1,9 +1,21 @@ +use chrono::Datelike; +use polars_error::PolarsResult; + use crate::array::*; +use crate::compute::cast::binary_to::Parse; +use crate::compute::cast::CastOptions; +use crate::datatypes::{ArrowDataType, TimeUnit}; +#[cfg(feature = "dtype-decimal")] +use crate::legacy::compute::decimal::deserialize_decimal; use crate::offset::Offset; +use crate::temporal_conversions::EPOCH_DAYS_FROM_CE; +use crate::types::NativeType; + +pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; pub(super) fn view_to_binary(array: &BinaryViewArray) -> BinaryArray { let len: usize = Array::len(array); - let mut mutable = MutableBinaryValuesArray::::with_capacities(len, len * 12); + let mut mutable = MutableBinaryValuesArray::::with_capacities(len, array.total_bytes_len()); for slice in array.values_iter() { mutable.push(slice) } @@ -11,7 +23,7 @@ pub(super) fn view_to_binary(array: &BinaryViewArray) -> BinaryArray< out.with_validity(array.validity().cloned()) } -pub(super) fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array { +pub fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array { let array = array.to_binview(); let out = view_to_binary::(&array); @@ -25,3 +37,76 @@ pub(super) fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array ) } } +/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. 
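// Illustrative aside, not part of the patch: the casts added below parse each
// string value and turn anything unparsable into a null instead of failing the
// whole cast. The same "parse or null" pattern over plain options; names here
// are illustrative only.
fn parse_or_null(values: &[Option<&str>]) -> Vec<Option<i64>> {
    values
        .iter()
        .map(|opt| opt.and_then(|s| s.parse::<i64>().ok()))
        .collect()
}

fn main() {
    let parsed = parse_or_null(&[Some("42"), Some("not a number"), None]);
    assert_eq!(parsed, vec![Some(42), None, None]);
}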
+pub(super) fn binview_to_primitive( + from: &BinaryViewArray, + to: &ArrowDataType, +) -> PrimitiveArray +where + T: NativeType + Parse, +{ + let iter = from.iter().map(|x| x.and_then::(|x| T::parse(x))); + + PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) +} + +pub(super) fn binview_to_primitive_dyn( + from: &dyn Array, + to: &ArrowDataType, + options: CastOptions, +) -> PolarsResult> +where + T: NativeType + Parse, +{ + let from = from.as_any().downcast_ref().unwrap(); + if options.partial { + unimplemented!() + } else { + Ok(Box::new(binview_to_primitive::(from, to))) + } +} + +#[cfg(feature = "dtype-decimal")] +pub fn binview_to_decimal( + array: &BinaryViewArray, + precision: Option, + scale: usize, +) -> PrimitiveArray { + let precision = precision.map(|p| p as u8); + array + .iter() + .map(|val| val.and_then(|val| deserialize_decimal(val, precision, scale as u8))) + .collect() +} + +pub(super) fn utf8view_to_naive_timestamp_dyn( + from: &dyn Array, + time_unit: TimeUnit, +) -> PolarsResult> { + let from = from.as_any().downcast_ref().unwrap(); + Ok(Box::new(utf8view_to_naive_timestamp(from, time_unit))) +} + +/// [`crate::temporal_conversions::utf8view_to_timestamp`] applied for RFC3339 formatting +pub fn utf8view_to_naive_timestamp( + from: &Utf8ViewArray, + time_unit: TimeUnit, +) -> PrimitiveArray { + crate::temporal_conversions::utf8view_to_naive_timestamp(from, RFC3339, time_unit) +} + +pub(super) fn utf8view_to_date32(from: &Utf8ViewArray) -> PrimitiveArray { + let iter = from.iter().map(|x| { + x.and_then(|x| { + x.parse::() + .ok() + .map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + }) + }); + PrimitiveArray::::from_trusted_len_iter(iter).to(ArrowDataType::Date32) +} + +pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> PolarsResult> { + let from = from.as_any().downcast_ref().unwrap(); + Ok(Box::new(utf8view_to_date32(from))) +} diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 7c66d674604..8e05520f0a6 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -9,6 +9,10 @@ mod primitive_to; mod utf8_to; pub use binary_to::*; +#[cfg(feature = "dtype-decimal")] +pub use binview_to::binview_to_decimal; +use binview_to::binview_to_primitive_dyn; +pub use binview_to::utf8view_to_utf8; pub use boolean_to::*; pub use decimal_to::*; pub use dictionary_to::*; @@ -21,6 +25,10 @@ use crate::{ error::{Error, Result}, offset::{Offset, Offsets}, }; +use crate::compute::cast::binview_to::{ + utf8view_to_date32_dyn, utf8view_to_naive_timestamp_dyn, view_to_binary, +}; +use crate::temporal_conversions::utf8view_to_timestamp; /// options defining how Cast kernels behave #[derive(Clone, Copy, Debug, Default)] @@ -34,6 +42,15 @@ pub struct CastOptions { pub partial: bool, } +impl CastOptions { + pub fn unchecked() -> Self { + Self { + wrapped: true, + partial: false, + } + } +} + impl CastOptions { fn with_wrapped(&self, v: bool) -> Self { let mut option = *self; @@ -418,6 +435,14 @@ fn cast_list_to_fixed_size_list( } } +pub fn cast_default(array: &dyn Array, to_type: &DataType) -> Result> { + cast(array, to_type, Default::default()) +} + +pub fn cast_unchecked(array: &dyn Array, to_type: &DataType) -> Result> { + cast(array, to_type, CastOptions::unchecked()) +} + /// Cast `array` to the provided data type and return a new [`Array`] with /// type `to_type`, if possible. 
/// @@ -488,7 +513,36 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu (List(_), List(_)) => { cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) .map(|x| x.boxed()) - } + }, + (BinaryView, _) => match to_type { + Utf8View => array + .as_any() + .downcast_ref::() + .unwrap() + .to_utf8view() + .map(|arr| arr.boxed()), + LargeBinary => Ok(binview_to::view_to_binary::( + array.as_any().downcast_ref().unwrap(), + ) + .boxed()), + UInt8 => binview_to_primitive_dyn::(array, to_type, options), + UInt16 => binview_to_primitive_dyn::(array, to_type, options), + UInt32 => binview_to_primitive_dyn::(array, to_type, options), + UInt64 => binview_to_primitive_dyn::(array, to_type, options), + Int8 => binview_to_primitive_dyn::(array, to_type, options), + Int16 => binview_to_primitive_dyn::(array, to_type, options), + Int32 => binview_to_primitive_dyn::(array, to_type, options), + Int64 => binview_to_primitive_dyn::(array, to_type, options), + Float32 => binview_to_primitive_dyn::(array, to_type, options), + Float64 => binview_to_primitive_dyn::(array, to_type, options), + LargeList(inner) if matches!(inner.data_type, DataType::UInt8) => { + let bin_array = view_to_binary::(array.as_any().downcast_ref().unwrap()); + Ok(binary_to_list(&bin_array, to_type.clone()).boxed()) + }, + _ => Err(Error::NotYetImplemented(format!( + "Unsupported casting from {from_type:?} to {to_type:?}" + ))), + }, (LargeList(_), LargeList(_)) => { cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) .map(|x| x.boxed()) @@ -526,6 +580,39 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu Ok(Box::new(list_array)) } + (Utf8View, _) => { + let arr = array.as_any().downcast_ref::().unwrap(); + + match to_type { + BinaryView => Ok(arr.to_binview().boxed()), + LargeUtf8 => Ok(binview_to::utf8view_to_utf8::(arr).boxed()), + UInt8 + | UInt16 + | UInt32 + | UInt64 + | Int8 + | Int16 + | Int32 + | Int64 + | Float32 + | Float64 + | Decimal(_, _) => cast(&arr.to_binview(), to_type, options), + Timestamp(time_unit, None) => { + utf8view_to_naive_timestamp_dyn(array, time_unit.to_owned()) + }, + Timestamp(time_unit, Some(time_zone)) => utf8view_to_timestamp( + array.as_any().downcast_ref().unwrap(), + RFC3339, + time_zone.clone(), + ) + .map(|arr| arr.boxed()), + Date32 => utf8view_to_date32_dyn(array), + _ => Err(Error::NotYetImplemented(format!( + "Unsupported casting from {from_type:?} to {to_type:?}" + ))), + } + }, + (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { dictionary_cast_dyn::<$T>(array, to_type, options) }), @@ -566,35 +653,17 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (Utf8View, _) => match to_type { - BinaryView => Ok(array - .as_any() - .downcast_ref::() - .unwrap() - .to_binview() - .boxed()), - LargeUtf8 => Ok(binview_to::utf8view_to_utf8::( - array.as_any().downcast_ref().unwrap(), - ) - .boxed()), - _ => Err(Error::NotYetImplemented(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), - }, - (BinaryView, _) => match to_type { - BinaryView => array - .as_any() - .downcast_ref::() - .unwrap() - .to_utf8view() - .map(|arr| arr.boxed()), - LargeBinary => Ok(binview_to::view_to_binary::( - array.as_any().downcast_ref().unwrap(), + (_, BinaryView) => from_to_binview(array, from_type, to_type).map(|arr| arr.boxed()), + (_, Utf8View) => match from_type { + LargeUtf8 => 
Ok(utf8_to_utf8view( + array.as_any().downcast_ref::>().unwrap(), ) .boxed()), - _ => Err(Error::NotYetImplemented(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), + Utf8 => Ok( + utf8_to_utf8view(array.as_any().downcast_ref::>().unwrap()).boxed(), + ), + _ => from_to_binview(array, from_type, to_type) + .map(|arr| unsafe { arr.to_utf8view_unchecked() }.boxed()), }, (Utf8, _) => match to_type { UInt8 => utf8_to_primitive_dyn::(array, to_type, options), @@ -719,16 +788,6 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu }, (Binary, _) => match to_type { - UInt8 => binary_to_primitive_dyn::(array, to_type, options), - UInt16 => binary_to_primitive_dyn::(array, to_type, options), - UInt32 => binary_to_primitive_dyn::(array, to_type, options), - UInt64 => binary_to_primitive_dyn::(array, to_type, options), - Int8 => binary_to_primitive_dyn::(array, to_type, options), - Int16 => binary_to_primitive_dyn::(array, to_type, options), - Int32 => binary_to_primitive_dyn::(array, to_type, options), - Int64 => binary_to_primitive_dyn::(array, to_type, options), - Float32 => binary_to_primitive_dyn::(array, to_type, options), - Float64 => binary_to_primitive_dyn::(array, to_type, options), LargeBinary => Ok(Box::new(binary_to_large_binary( array.as_any().downcast_ref().unwrap(), to_type.clone(), @@ -1042,3 +1101,30 @@ fn cast_to_dictionary( ))), } } + +fn from_to_binview( + array: &dyn Array, + from_type: &DataType, + to_type: &DataType, +) -> Result { + use DataType::*; + let binview = match from_type { + UInt8 => primitive_to_binview_dyn::(array), + UInt16 => primitive_to_binview_dyn::(array), + UInt32 => primitive_to_binview_dyn::(array), + UInt64 => primitive_to_binview_dyn::(array), + Int8 => primitive_to_binview_dyn::(array), + Int16 => primitive_to_binview_dyn::(array), + Int32 => primitive_to_binview_dyn::(array), + Int64 => primitive_to_binview_dyn::(array), + Float32 => primitive_to_binview_dyn::(array), + Float64 => primitive_to_binview_dyn::(array), + Binary => binary_to_binview::(array.as_any().downcast_ref().unwrap()), + FixedSizeBinary(_) => fixed_size_binary_to_binview(array.as_any().downcast_ref().unwrap()), + LargeBinary => binary_to_binview::(array.as_any().downcast_ref().unwrap()), + _ => Err(Error::NotYetImplemented(format!( + "Unsupported casting from {from_type:?} to {to_type:?}" + ))), + }; + Ok(binview) +} diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 110288817a7..e2d7b63632c 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -587,3 +587,27 @@ pub fn months_to_months_days_ns(from: &PrimitiveArray) -> PrimitiveArray) -> PrimitiveArray { unary(from, |x| x.to_f32(), DataType::Float32) } + +/// Returns a [`Utf8Array`] where every element is the utf8 representation of the number. 
+pub(super) fn primitive_to_binview( + from: &PrimitiveArray, +) -> BinaryViewArray { + let mut mutable = MutableBinaryViewArray::with_capacity(from.len()); + + let mut scratch = vec![]; + for &x in from.values().iter() { + unsafe { scratch.set_len(0) }; + T::write(&mut scratch, x); + mutable.push_value_ignore_validity(&scratch) + } + + mutable.freeze().with_validity(from.validity().cloned()) +} + +pub(super) fn primitive_to_binview_dyn(from: &dyn Array) -> BinaryViewArray +where + T: NativeType, +{ + let from = from.as_any().downcast_ref().unwrap(); + primitive_to_binview::(from) +} diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 6ee38588696..8c408dfb2c0 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -1,3 +1,4 @@ +use std::sync::Arc; use chrono::Datelike; use crate::{ @@ -6,8 +7,8 @@ use crate::{ error::Result, offset::Offset, temporal_conversions::{ - utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_, - utf8_to_timestamp_ns as utf8_to_timestamp_ns_, EPOCH_DAYS_FROM_CE, + utf8view_to_naive_timestamp as utf8_to_naive_timestamp_, + utf8view_to_timestamp as utf8_to_timestamp_, EPOCH_DAYS_FROM_CE, }, types::NativeType, }; @@ -122,7 +123,7 @@ pub(super) fn utf8_to_naive_timestamp_ns_dyn( /// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting pub fn utf8_to_naive_timestamp_ns(from: &Utf8Array) -> PrimitiveArray { - utf8_to_naive_timestamp_ns_(from, RFC3339) + utf8_to_naive_timestamp_(from, RFC3339) } pub(super) fn utf8_to_timestamp_ns_dyn( @@ -140,7 +141,7 @@ pub fn utf8_to_timestamp_ns( from: &Utf8Array, timezone: String, ) -> Result> { - utf8_to_timestamp_ns_(from, RFC3339, timezone) + utf8_to_timestamp_(from, RFC3339, timezone) } /// Conversion of utf8 @@ -177,3 +178,48 @@ pub fn utf8_to_binary(from: &Utf8Array, to_data_type: DataType) -> ) } } + +pub fn binary_to_binview(arr: &BinaryArray) -> BinaryViewArray { + let buffer_idx = 0_u32; + let base_ptr = arr.values().as_ptr() as usize; + + let mut views = Vec::with_capacity(arr.len()); + let mut uses_buffer = false; + for bytes in arr.values_iter() { + let len: u32 = bytes.len().try_into().unwrap(); + + let mut payload = [0; 16]; + payload[0..4].copy_from_slice(&len.to_le_bytes()); + + if len <= 12 { + payload[4..4 + bytes.len()].copy_from_slice(bytes); + } else { + uses_buffer = true; + unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked_release(0..4)) }; + let offset = (bytes.as_ptr() as usize - base_ptr) as u32; + payload[0..4].copy_from_slice(&len.to_le_bytes()); + payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); + payload[12..16].copy_from_slice(&offset.to_le_bytes()); + } + + let value = u128::from_le_bytes(payload); + unsafe { views.push_unchecked(value) }; + } + let buffers = if uses_buffer { + Arc::from([arr.values().clone()]) + } else { + Arc::from([]) + }; + unsafe { + BinaryViewArray::new_unchecked_unknown_md( + DataType::BinaryView, + views.into(), + buffers, + arr.validity().cloned(), + ) + } +} + +pub fn utf8_to_utf8view(arr: &Utf8Array) -> Utf8ViewArray { + unsafe { binary_to_binview(&arr.to_binary()).to_utf8view_unchecked() } +} diff --git a/src/compute/filter.rs b/src/compute/filter.rs index 7ba260e702f..8e7d562281f 100644 --- a/src/compute/filter.rs +++ b/src/compute/filter.rs @@ -296,6 +296,22 @@ pub fn filter(array: &dyn Array, filter: &BooleanArray) -> Result let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::<$T>(array, filter))) }), + BinaryView => { + let iter = 
SlicesIterator::new(filter.values()); + let mut mutable = growable::GrowableBinaryViewArray::new( + vec![array.as_any().downcast_ref::().unwrap()], + false, + iter.slots(), + ); + unsafe { + iter.for_each(|(start, len)| mutable.extend_unchecked(0, start, len)); + } + Ok(mutable.as_box()) + }, + // Should go via BinaryView + Utf8View => { + unreachable!() + }, _ => { let iter = SlicesIterator::new(filter.values()); let mut mutable = make_growable(&[array], false, iter.slots()); diff --git a/src/io/ipc/write/common.rs b/src/io/ipc/write/common.rs index 8618f007df4..633af4939dc 100644 --- a/src/io/ipc/write/common.rs +++ b/src/io/ipc/write/common.rs @@ -8,9 +8,14 @@ use crate::datatypes::*; use crate::error::{Error, Result}; use crate::io::ipc::endianess::is_native_little_endian; use crate::io::ipc::read::Dictionaries; +<<<<<<< HEAD use super::super::IpcField; use super::{write, write_dictionary}; +======= +use crate::legacy::prelude::LargeListArray; +use crate::match_integer_type; +>>>>>>> 64003155e8 (feat: new implementation for `String/Binary` type. (#13748)) /// Compression codec #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -230,6 +235,34 @@ fn serialize_compression( } } +fn set_variadic_buffer_counts(counts: &mut Vec, array: &dyn Array) { + match array.data_type() { + ArrowDataType::Utf8View => { + let array = array.as_any().downcast_ref::().unwrap(); + counts.push(array.data_buffers().len() as i64); + }, + ArrowDataType::BinaryView => { + let array = array.as_any().downcast_ref::().unwrap(); + counts.push(array.data_buffers().len() as i64); + }, + ArrowDataType::Struct(_) => { + let array = array.as_any().downcast_ref::().unwrap(); + for array in array.values() { + set_variadic_buffer_counts(counts, array.as_ref()) + } + }, + ArrowDataType::LargeList(_) => { + let array = array.as_any().downcast_ref::().unwrap(); + set_variadic_buffer_counts(counts, array.values().as_ref()) + }, + ArrowDataType::FixedSizeList(_, _) => { + let array = array.as_any().downcast_ref::().unwrap(); + set_variadic_buffer_counts(counts, array.values().as_ref()) + }, + _ => (), + } +} + /// Write [`Chunk`] into two sets of bytes, one for the header (ipc::Schema::Message) and the /// other for the batch's data fn chunk_to_bytes_amortized( @@ -245,6 +278,7 @@ fn chunk_to_bytes_amortized( let mut offset = 0; let mut variadic_buffer_counts = vec![]; for array in chunk.arrays() { +<<<<<<< HEAD let dtype = array.data_type(); if dtype.is_view() { match dtype { @@ -259,6 +293,9 @@ fn chunk_to_bytes_amortized( _ => {}, } } +======= + set_variadic_buffer_counts(&mut variadic_buffer_counts, array.as_ref()); +>>>>>>> 64003155e8 (feat: new implementation for `String/Binary` type. (#13748)) write( array.as_ref(), diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index f2864c34179..4c30dc50363 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -7,7 +7,7 @@ use chrono::{ use crate::error::Result; use crate::{ - array::{PrimitiveArray, Utf8Array}, + array::{PrimitiveArray, Utf8ViewArray}, error::Error, offset::Offset, }; @@ -258,7 +258,10 @@ pub fn timestamp_ns_to_datetime_opt(v: i64) -> Option { /// Converts a timestamp in `time_unit` and `timezone` into [`chrono::DateTime`]. 
#[inline] -pub fn timestamp_to_naive_datetime(timestamp: i64, time_unit: TimeUnit) -> chrono::NaiveDateTime { +pub(crate) fn timestamp_to_naive_datetime( + timestamp: i64, + time_unit: TimeUnit, +) -> chrono::NaiveDateTime { match time_unit { TimeUnit::Second => timestamp_s_to_datetime(timestamp), TimeUnit::Millisecond => timestamp_ms_to_datetime(timestamp), @@ -395,8 +398,8 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) -> .ok() } -fn utf8_to_timestamp_ns_impl( - array: &Utf8Array, +fn utf8view_to_timestamp_impl( + array: &Utf8ViewArray, fmt: &str, timezone: String, tz: T, @@ -420,20 +423,24 @@ pub fn parse_offset_tz(timezone: &str) -> Result { #[cfg(feature = "chrono-tz")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))] -fn chrono_tz_utf_to_timestamp_ns( - array: &Utf8Array, +fn chrono_tz_utf_to_timestamp( + array: &Utf8ViewArray, fmt: &str, - timezone: String, + time_zone: String, + time_unit: TimeUnit, ) -> Result> { - let tz = parse_offset_tz(&timezone)?; - Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) + let tz = parse_offset_tz(&time_zone)?; + Ok(utf8view_to_timestamp_impl( + array, fmt, time_zone, tz, + )) } #[cfg(not(feature = "chrono-tz"))] -fn chrono_tz_utf_to_timestamp_ns( - _: &Utf8Array, +fn chrono_tz_utf_to_timestamp( + _: &Utf8ViewArray, _: &str, timezone: String, + time_unit: TimeUnit, ) -> Result> { Err(Error::InvalidArgumentError(format!( "timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)", @@ -448,17 +455,20 @@ fn chrono_tz_utf_to_timestamp_ns( /// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`. /// # Error /// This function errors iff `timezone` is not parsable to an offset. -pub fn utf8_to_timestamp_ns( - array: &Utf8Array, +pub(crate) fn utf8view_to_timestamp( + array: &Utf8ViewArray, fmt: &str, timezone: String, ) -> Result> { let tz = parse_offset(timezone.as_str()); + let time_unit = TimeUnit::Second; if let Ok(tz) = tz { - Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) + Ok(utf8view_to_timestamp_impl( + array, fmt, timezone, tz, + )) } else { - chrono_tz_utf_to_timestamp_ns(array, fmt, timezone) + chrono_tz_utf_to_timestamp(array, fmt, timezone, time_unit) } } @@ -466,8 +476,8 @@ pub fn utf8_to_timestamp_ns( /// [`PrimitiveArray`] with type `Timestamp(Nanosecond, None)`. /// Timezones are ignored. /// Null elements remain null; non-parsable elements are set to null. -pub fn utf8_to_naive_timestamp_ns( - array: &Utf8Array, +pub(crate) fn utf8view_to_naive_timestamp( + array: &Utf8ViewArray, fmt: &str, ) -> PrimitiveArray { let iter = array diff --git a/src/trusted_len.rs b/src/trusted_len.rs index a1c38bd51c7..4bdce32e499 100644 --- a/src/trusted_len.rs +++ b/src/trusted_len.rs @@ -1,4 +1,5 @@ //! Declares [`TrustedLen`]. +use std::iter::Scan; use std::slice::Iter; /// An iterator of known, fixed size. 
@@ -13,8 +14,6 @@ pub unsafe trait TrustedLen: Iterator {} unsafe impl TrustedLen for Iter<'_, T> {} -unsafe impl B> TrustedLen for std::iter::Map {} - unsafe impl<'a, I, T: 'a> TrustedLen for std::iter::Copied where I: TrustedLen, @@ -55,3 +54,69 @@ unsafe impl TrustedLen for std::vec::IntoIter {} unsafe impl TrustedLen for std::iter::Repeat {} unsafe impl A> TrustedLen for std::iter::RepeatWith {} unsafe impl TrustedLen for std::iter::Take {} + +unsafe impl TrustedLen for &mut dyn TrustedLen {} +unsafe impl TrustedLen for Box + '_> {} + +unsafe impl B> TrustedLen for std::iter::Map {} + +unsafe impl TrustedLen for std::iter::Rev {} + +unsafe impl, J> TrustedLen for TrustMyLength {} +unsafe impl TrustedLen for std::ops::Range where std::ops::Range: Iterator {} +unsafe impl TrustedLen for std::ops::RangeInclusive where std::ops::RangeInclusive: Iterator +{} +unsafe impl TrustedLen for std::iter::StepBy {} + +unsafe impl TrustedLen for Scan +where + F: FnMut(&mut St, I::Item) -> Option, + I: TrustedLen + Iterator, +{ +} + +unsafe impl TrustedLen for hashbrown::hash_map::IntoIter {} + +#[derive(Clone)] +pub struct TrustMyLength, J> { + iter: I, + len: usize, +} + +impl TrustMyLength +where + I: Iterator, +{ + #[inline] + pub fn new(iter: I, len: usize) -> Self { + Self { iter, len } + } +} + +impl Iterator for TrustMyLength +where + I: Iterator, +{ + type Item = J; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next() + } + + fn size_hint(&self) -> (usize, Option) { + (self.len, Some(self.len)) + } +} + +impl ExactSizeIterator for TrustMyLength where I: Iterator {} + +impl DoubleEndedIterator for TrustMyLength +where + I: Iterator + DoubleEndedIterator, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.iter.next_back() + } +} diff --git a/tests/it/io/ipc/mod.rs b/tests/it/io/ipc/mod.rs index 7ca9a9bf5cc..3dbdc3ee049 100644 --- a/tests/it/io/ipc/mod.rs +++ b/tests/it/io/ipc/mod.rs @@ -92,7 +92,7 @@ fn write_sliced_utf8() -> Result<()> { #[test] fn write_binview() -> Result<()> { - let array = Utf8ViewArray::from([Some("foo"), Some("bar"), None, Some("hamlet")]).boxed(); + let array = Utf8ViewArray::from_slice([Some("foo"), Some("bar"), None, Some("hamlet")]).boxed(); let schema = prep_schema(array.as_ref()); let columns = Chunk::try_new(vec![array])?; round_trip(columns, schema, None, Some(Compression::ZSTD)) From 6f688e3294c732cebe4f2bfa2436daffcaf73ce2 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 19 Jan 2024 10:28:10 +0100 Subject: [PATCH 08/25] perf: lazy cache binview bytes len (#13830) --- src/array/binview/ffi.rs | 3 ++- src/array/binview/mod.rs | 26 ++++++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs index 71e1e56abf5..5220f0d08f2 100644 --- a/src/array/binview/ffi.rs +++ b/src/array/binview/ffi.rs @@ -1,3 +1,4 @@ +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use polars_error::PolarsResult; @@ -48,7 +49,7 @@ unsafe impl ToFfi for BinaryViewArrayGeneric { buffers: self.buffers.clone(), raw_buffers: self.raw_buffers.clone(), phantom: Default::default(), - total_bytes_len: self.total_bytes_len, + total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)), total_buffer_len: self.total_buffer_len, } } diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 2b54b8b9cb5..0c1235c2e83 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -8,6 +8,7 @@ mod view; use std::any::Any; use std::fmt::Debug; use 
std::marker::PhantomData; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use crate::array::Array; @@ -113,7 +114,7 @@ pub struct BinaryViewArrayGeneric { validity: Option, phantom: PhantomData, /// Total bytes length if we would concatenate them all. - total_bytes_len: usize, + total_bytes_len: AtomicU64, /// Total bytes in the buffer (excluding remaining capacity) total_buffer_len: usize, } @@ -133,7 +134,7 @@ impl Clone for BinaryViewArrayGeneric { raw_buffers: self.raw_buffers.clone(), validity: self.validity.clone(), phantom: Default::default(), - total_bytes_len: self.total_bytes_len, + total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)), total_buffer_len: self.total_buffer_len, } } @@ -148,6 +149,7 @@ fn buffers_into_raw(buffers: &[Buffer]) -> Arc<[(*const T, usize)]> { .map(|buf| (buf.storage_ptr(), buf.len())) .collect() } +const UNKNOWN_LEN: u64 = u64::MAX; impl BinaryViewArrayGeneric { /// # Safety @@ -170,7 +172,7 @@ impl BinaryViewArrayGeneric { raw_buffers, validity, phantom: Default::default(), - total_bytes_len, + total_bytes_len: AtomicU64::new(total_bytes_len as u64), total_buffer_len, } } @@ -327,7 +329,14 @@ impl BinaryViewArrayGeneric { /// Get the total length of bytes that it would take to concatenate all binary/str values in this array. pub fn total_bytes_len(&self) -> usize { - self.total_bytes_len + let total = self.total_bytes_len.load(Ordering::Relaxed); + if total == UNKNOWN_LEN { + let total = self.len_iter().map(|v| v as usize).sum::(); + self.total_bytes_len.store(total as u64, Ordering::Relaxed); + total + } else { + total as usize + } } /// Get the length of bytes that are stored in the variadic buffers. @@ -358,8 +367,9 @@ impl BinaryViewArrayGeneric { if self.total_buffer_len == 0 { return self; } + let total_bytes_len = self.total_bytes_len.load(Ordering::Relaxed) as usize; // Subtract the maximum amount of inlined strings. 
- let min_in_buffer = self.total_bytes_len.saturating_sub(self.len() * 12); + let min_in_buffer = total_bytes_len.saturating_sub(self.len() * 12); let frac = (min_in_buffer as f64) / ((self.total_buffer_len() + 1) as f64); if frac < 0.25 { @@ -391,7 +401,7 @@ impl BinaryViewArray { self.views.clone(), self.buffers.clone(), self.validity.clone(), - self.total_bytes_len, + self.total_bytes_len.load(Ordering::Relaxed) as usize, self.total_buffer_len, ) } @@ -406,7 +416,7 @@ impl Utf8ViewArray { self.views.clone(), self.buffers.clone(), self.validity.clone(), - self.total_bytes_len, + self.total_bytes_len.load(Ordering::Relaxed) as usize, self.total_buffer_len, ) } @@ -451,7 +461,7 @@ impl Array for BinaryViewArrayGeneric { .map(|bitmap| bitmap.sliced_unchecked(offset, length)) .filter(|bitmap| bitmap.unset_bits() > 0); self.views.slice_unchecked(offset, length); - self.total_bytes_len = self.len_iter().map(|v| v as usize).sum::(); + self.total_bytes_len.store(UNKNOWN_LEN, Ordering::Relaxed) } fn with_validity(&self, validity: Option) -> Box { From a597eedc2109836b29e044d708e497f7760d226c Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 19 Jan 2024 14:27:42 +0100 Subject: [PATCH 09/25] feat: fix binview ipc format (#13842) --- src/io/ipc/read/array/binview.rs | 19 ++----- src/io/ipc/read/read_basic.rs | 72 +++++++++++++++++++++++++++ src/io/ipc/write/serialize/binview.rs | 10 ---- 3 files changed, 75 insertions(+), 26 deletions(-) diff --git a/src/io/ipc/read/array/binview.rs b/src/io/ipc/read/array/binview.rs index c9814fc73a8..33d16c12bef 100644 --- a/src/io/ipc/read/array/binview.rs +++ b/src/io/ipc/read/array/binview.rs @@ -51,23 +51,10 @@ pub fn read_binview( || polars_err!(ComputeError: "IPC: unable to fetch the variadic buffers\n\nThe file or stream is corrupted.") )?; - let variadic_buffer_lengths: Buffer = read_buffer( - buffers, - n_variadic, - reader, - block_offset, - is_little_endian, - compression, - scratch, - )?; - - let variadic_buffers = variadic_buffer_lengths - .iter() - .map(|length| { - let length = *length as usize; - read_buffer( + let variadic_buffers = (0..n_variadic) + .map(|_| { + read_bytes( buffers, - length, reader, block_offset, is_little_endian, diff --git a/src/io/ipc/read/read_basic.rs b/src/io/ipc/read/read_basic.rs index 0a93a63a217..baf73b09ef2 100644 --- a/src/io/ipc/read/read_basic.rs +++ b/src/io/ipc/read/read_basic.rs @@ -43,6 +43,23 @@ fn read_swapped( Ok(()) } +fn read_uncompressed_bytes( + reader: &mut R, + buffer_length: usize, + is_little_endian: bool, +) -> PolarsResult> { + if is_native_little_endian() == is_little_endian { + let mut buffer = Vec::with_capacity(buffer_length); + let _ = reader + .take(buffer_length as u64) + .read_to_end(&mut buffer) + .unwrap(); + Ok(buffer) + } else { + unreachable!() + } +} + fn read_uncompressed_buffer( reader: &mut R, buffer_length: usize, @@ -130,6 +147,61 @@ fn read_compressed_buffer( Ok(buffer) } +fn read_compressed_bytes( + reader: &mut R, + buffer_length: usize, + is_little_endian: bool, + compression: Compression, + scratch: &mut Vec, +) -> PolarsResult> { + read_compressed_buffer::( + reader, + buffer_length, + buffer_length, + is_little_endian, + compression, + scratch, + ) +} + +pub fn read_bytes( + buf: &mut VecDeque, + reader: &mut R, + block_offset: u64, + is_little_endian: bool, + compression: Option, + scratch: &mut Vec, +) -> PolarsResult> { + let buf = buf + .pop_front() + .ok_or_else(|| polars_err!(oos = OutOfSpecKind::ExpectedBuffer))?; + + let offset: u64 = buf + .offset() + 
.try_into() + .map_err(|_| polars_err!(oos = OutOfSpecKind::NegativeFooterLength))?; + + let buffer_length: usize = buf + .length() + .try_into() + .map_err(|_| polars_err!(oos = OutOfSpecKind::NegativeFooterLength))?; + + reader.seek(SeekFrom::Start(block_offset + offset))?; + + if let Some(compression) = compression { + Ok(read_compressed_bytes( + reader, + buffer_length, + is_little_endian, + compression, + scratch, + )? + .into()) + } else { + Ok(read_uncompressed_bytes(reader, buffer_length, is_little_endian)?.into()) + } +} + pub fn read_buffer( buf: &mut VecDeque, length: usize, // in slots diff --git a/src/io/ipc/write/serialize/binview.rs b/src/io/ipc/write/serialize/binview.rs index a9ebff6dd17..66afafbd0e6 100644 --- a/src/io/ipc/write/serialize/binview.rs +++ b/src/io/ipc/write/serialize/binview.rs @@ -28,16 +28,6 @@ pub(super) fn write_binview( compression, ); - let vbl = array.variadic_buffer_lengths(); - write_buffer( - &vbl, - buffers, - arrow_data, - offset, - is_little_endian, - compression, - ); - for data in array.data_buffers().as_ref() { write_bytes(data, buffers, arrow_data, offset, compression); } From 568043da60fca9c7c03fbe34e0fbf2a91cceab40 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 19 Jan 2024 18:47:59 +0100 Subject: [PATCH 10/25] perf: apply string view GC more conservatively (#13850) --- src/array/binview/mod.rs | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 0c1235c2e83..434160ac846 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -364,18 +364,28 @@ impl BinaryViewArrayGeneric { } pub fn maybe_gc(self) -> Self { - if self.total_buffer_len == 0 { + const GC_MINIMUM_SAVINGS: usize = 16 * 1024; // At least 16 KiB. + + if self.total_buffer_len <= GC_MINIMUM_SAVINGS { return self; } + + // Subtract the maximum amount of inlined strings to get a lower bound + // on the number of buffer bytes needed (assuming no dedup). let total_bytes_len = self.total_bytes_len.load(Ordering::Relaxed) as usize; - // Subtract the maximum amount of inlined strings. 
- let min_in_buffer = total_bytes_len.saturating_sub(self.len() * 12); - let frac = (min_in_buffer as f64) / ((self.total_buffer_len() + 1) as f64); + let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12); + + let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound; + let cur_mem_usage = self.len() * 16 + self.total_buffer_len(); + let savings_upper_bound = cur_mem_usage.saturating_sub(lower_bound_mem_usage_post_gc); - if frac < 0.25 { - return self.gc(); + if savings_upper_bound >= GC_MINIMUM_SAVINGS + && cur_mem_usage >= 4 * lower_bound_mem_usage_post_gc + { + self.gc() + } else { + self } - self } } From ca6afa886b3944194e794d01c5af93933eb08378 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 20 Jan 2024 10:43:35 +0100 Subject: [PATCH 11/25] feat: implement ffi for `binview` (#13871) --- src/array/binview/ffi.rs | 31 ++++++++++++---- src/ffi/array.rs | 63 +++++++++++++++++++++++++++++++-- src/ffi/schema.rs | 2 ++ src/mmap/array.rs | 76 ++++++++++++++++++++++++++++++++++++++++ src/mmap/mod.rs | 6 ++++ tests/it/ffi/data.rs | 17 +++++++++ 6 files changed, 187 insertions(+), 8 deletions(-) diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs index 5220f0d08f2..d20dd8a53d0 100644 --- a/src/array/binview/ffi.rs +++ b/src/array/binview/ffi.rs @@ -62,18 +62,37 @@ impl FromFfi for BinaryViewArray let validity = unsafe { array.validity() }?; let views = unsafe { array.buffer::(1) }?; - let n = array.n_buffers() - 2; - let mut buffers = Vec::with_capacity(n); + // 2 - validity + views + let n_buffers = array.n_buffers(); + let mut remaining_buffers = n_buffers - 2; + if remaining_buffers <= 1 { + return Ok(Self::new_unchecked_unknown_md( + data_type, + views, + Arc::from([]), + validity, + )); + } + + let n_variadic_buffers = remaining_buffers - 1; + let variadic_buffer_offset = n_buffers - 1; + + let variadic_buffer_sizes = + array.buffer_known_len::(variadic_buffer_offset, n_variadic_buffers)?; + remaining_buffers -= 1; + + let mut variadic_buffers = Vec::with_capacity(remaining_buffers); - for i in 2..n + 2 { - let values = unsafe { array.buffer::(i) }?; - buffers.push(values); + let offset = 2; + for (i, &size) in (offset..remaining_buffers + offset).zip(variadic_buffer_sizes.iter()) { + let values = unsafe { array.buffer_known_len::(i, size as usize) }?; + variadic_buffers.push(values); } Ok(Self::new_unchecked_unknown_md( data_type, views, - Arc::from(buffers), + Arc::from(variadic_buffers), validity, )) } diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 271399ca890..44cab4f2aed 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -90,6 +90,7 @@ struct PrivateData { buffers_ptr: Box<[*const std::os::raw::c_void]>, children_ptr: Box<[*mut ArrowArray]>, dictionary_ptr: Option<*mut ArrowArray>, + variadic_buffer_sizes: Box<[i64]>, } impl ArrowArray { @@ -98,9 +99,36 @@ impl ArrowArray { /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. 
pub(crate) fn new(array: Box) -> Self { - let (offset, buffers, children, dictionary) = + let needs_variadic_buffer_sizes = matches!( + array.data_type(), + DataType::BinaryView | DataType::Utf8View + ); + + let (offset, mut buffers, children, dictionary) = offset_buffers_children_dictionary(array.as_ref()); + let variadic_buffer_sizes = if needs_variadic_buffer_sizes { + #[cfg(feature = "compute_cast")] + { + let arr = crate::compute::cast::cast_unchecked( + array.as_ref(), + &DataType::BinaryView, + ) + .unwrap(); + let arr = arr.as_any().downcast_ref::().unwrap(); + let boxed = arr.variadic_buffer_lengths().into_boxed_slice(); + let ptr = boxed.as_ptr().cast::(); + buffers.push(Some(ptr)); + boxed + } + #[cfg(not(feature = "compute_cast"))] + { + panic!("activate 'compute_cast' feature") + } + } else { + Box::from([]) + }; + let buffers_ptr = buffers .iter() .map(|maybe_buffer| match maybe_buffer { @@ -127,6 +155,7 @@ impl ArrowArray { buffers_ptr, children_ptr, dictionary_ptr, + variadic_buffer_sizes, }); Self { @@ -220,6 +249,21 @@ unsafe fn get_buffer_ptr( Ok(ptr as *mut T) } +unsafe fn create_buffer_known_len( + array: &ArrowArray, + data_type: &DataType, + owner: InternalArrowArray, + len: usize, + index: usize, +) -> Result> { + if len == 0 { + return Ok(Buffer::new()); + } + let ptr: *mut T = get_buffer_ptr(array, data_type, index)?; + let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); + Ok(Buffer::from_bytes(bytes)) +} + /// returns the buffer `i` of `array` interpreted as a [`Buffer`]. /// # Safety /// This function is safe iff: @@ -238,6 +282,7 @@ unsafe fn create_buffer( } let offset = buffer_offset(array, data_type, index); + dbg!(offset, len); let ptr: *mut T = get_buffer_ptr(array, data_type, index)?; // We have to check alignment. @@ -330,7 +375,10 @@ unsafe fn buffer_len(array: &ArrowArray, data_type: &DataType, i: usize) -> Resu | (PhysicalType::Map, 1) => { // the len of the offset buffer (buffer 1) equals length + 1 array.offset as usize + array.length as usize + 1 - } + }, + (PhysicalType::BinaryView, 1) | (PhysicalType::Utf8View, 1) => { + array.offset as usize + array.length as usize + }, (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) let len = buffer_len(array, data_type, 1)?; @@ -454,6 +502,17 @@ pub trait ArrowArrayRef: std::fmt::Debug { create_buffer::(self.array(), self.data_type(), self.owner(), index) } + /// # Safety + /// The caller must guarantee that the buffer `index` corresponds to a buffer. + /// This function assumes that the buffer created from FFI is valid; this is impossible to prove. 
+ unsafe fn buffer_known_len( + &self, + index: usize, + len: usize, + ) -> Result> { + create_buffer_known_len::(self.array(), self.data_type(), self.owner(), len, index) + } + /// # Safety /// This function is safe iff: /// * the buffer at position `index` is valid for the declared length diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index 6b17cb7835c..90bfce09040 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -261,6 +261,8 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> Result { "tDn" => DataType::Duration(TimeUnit::Nanosecond), "tiM" => DataType::Interval(IntervalUnit::YearMonth), "tiD" => DataType::Interval(IntervalUnit::DayTime), + "vu" => DataType::Utf8View, + "vz" => DataType::BinaryView, "+l" => { let child = schema.child(0); DataType::List(Box::new(to_field(child)?)) diff --git a/src/mmap/array.rs b/src/mmap/array.rs index 93a8f653c9a..c8511dec41b 100644 --- a/src/mmap/array.rs +++ b/src/mmap/array.rs @@ -58,6 +58,18 @@ fn get_buffer<'a, T: NativeType>( Ok(values) } +fn get_bytes<'a>( + data: &'a [u8], + block_offset: usize, + buffers: &mut VecDeque, +) -> PolarsResult<&'a [u8]> { + let (offset, length) = get_buffer_bounds(buffers)?; + + // verify that they are in-bounds + data.get(block_offset + offset..block_offset + offset + length) + .ok_or_else(|| polars_err!(ComputeError: "buffer out of bounds")) +} + fn get_validity<'a>( data: &'a [u8], block_offset: usize, @@ -115,6 +127,52 @@ fn mmap_binary>( }) } +fn mmap_binview>( + data: Arc, + node: &Node, + block_offset: usize, + buffers: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, +) -> PolarsResult { + let (num_rows, null_count) = get_num_rows_and_null_count(node)?; + let data_ref = data.as_ref().as_ref(); + + let validity = get_validity(data_ref, block_offset, buffers, null_count)?.map(|x| x.as_ptr()); + + let views = get_buffer::(data_ref, block_offset, buffers, num_rows)?.as_ptr(); + + let n_variadic = variadic_buffer_counts + .pop_front() + .ok_or_else(|| polars_err!(ComputeError: "expected variadic_buffer_count"))?; + + let mut buffer_ptrs = Vec::with_capacity(n_variadic + 2); + buffer_ptrs.push(validity); + buffer_ptrs.push(Some(views)); + + let mut variadic_buffer_sizes = Vec::with_capacity(n_variadic); + for _ in 0..n_variadic { + let variadic_buffer = get_bytes(data_ref, block_offset, buffers)?; + variadic_buffer_sizes.push(variadic_buffer.len()); + buffer_ptrs.push(Some(variadic_buffer.as_ptr())); + } + + // Move variadic buffer sizes in an Arc, so that it stays alive. + let data = Arc::new((data, variadic_buffer_sizes)); + + // NOTE: invariants are not validated + Ok(unsafe { + create_array( + data, + num_rows, + null_count, + buffer_ptrs.into_iter(), + [].into_iter(), + None, + None, + ) + }) +} + fn mmap_fixed_size_binary>( data: Arc, node: &Node, @@ -269,6 +327,7 @@ fn mmap_list>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result { let child = ListArray::::try_get_child(data_type)?.data_type(); @@ -296,6 +355,7 @@ fn mmap_list>( &ipc_field.fields[0], dictionaries, field_nodes, + variadic_buffer_counts, buffers, )?; @@ -322,6 +382,7 @@ fn mmap_fixed_size_list>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result { let child = FixedSizeListArray::try_child_and_size(data_type)? 
@@ -349,6 +410,7 @@ fn mmap_fixed_size_list>( &ipc_field.fields[0], dictionaries, field_nodes, + variadic_buffer_counts, buffers, )?; @@ -374,6 +436,7 @@ fn mmap_struct>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result { let children = StructArray::try_get_fields(data_type)?; @@ -404,6 +467,7 @@ fn mmap_struct>( ipc, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ) }) @@ -467,6 +531,7 @@ fn mmap_dict>( }) } +#[allow(clippy::too_many_arguments)] fn get_array>( data: Arc, block_offset: usize, @@ -474,6 +539,7 @@ fn get_array>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result { use crate::datatypes::PhysicalType::*; @@ -488,6 +554,9 @@ fn get_array>( mmap_primitive::<$T, _>(data, &node, block_offset, buffers) }), Utf8 | Binary => mmap_binary::(data, &node, block_offset, buffers), + Utf8View | BinaryView => { + mmap_binview(data, &node, block_offset, buffers, variadic_buffer_counts) + }, FixedSizeBinary => mmap_fixed_size_binary(data, &node, block_offset, buffers, data_type), LargeBinary | LargeUtf8 => mmap_binary::(data, &node, block_offset, buffers), List => mmap_list::( @@ -498,6 +567,7 @@ fn get_array>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ), LargeList => mmap_list::( @@ -508,6 +578,7 @@ fn get_array>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ), FixedSizeList => mmap_fixed_size_list( @@ -518,6 +589,7 @@ fn get_array>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ), Struct => mmap_struct( @@ -528,6 +600,7 @@ fn get_array>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, ), Dictionary(key_type) => match_integer_type!(key_type, |$T| { @@ -546,6 +619,7 @@ fn get_array>( } } +#[allow(clippy::too_many_arguments)] /// Maps a memory region to an [`Array`]. pub(crate) unsafe fn mmap>( data: Arc, @@ -554,6 +628,7 @@ pub(crate) unsafe fn mmap>( ipc_field: &IpcField, dictionaries: &Dictionaries, field_nodes: &mut VecDeque, + variadic_buffer_counts: &mut VecDeque, buffers: &mut VecDeque, ) -> Result, Error> { let array = get_array( @@ -563,6 +638,7 @@ pub(crate) unsafe fn mmap>( ipc_field, dictionaries, field_nodes, + variadic_buffer_counts, buffers, )?; // The unsafety comes from the fact that `array` is not necessarily valid - diff --git a/src/mmap/mod.rs b/src/mmap/mod.rs index 5d560c93663..d5ec28c7b4b 100644 --- a/src/mmap/mod.rs +++ b/src/mmap/mod.rs @@ -85,6 +85,11 @@ unsafe fn _mmap_record>( dictionaries: &Dictionaries, ) -> Result>, Error> { let (mut buffers, mut field_nodes) = get_buffers_nodes(batch)?; + let mut variadic_buffer_counts = batch + .variadic_buffer_counts() + .map_err(|err| polars_err!(oos = OutOfSpecKind::InvalidFlatbufferRecordBatches(err)))? 
+ .map(|v| v.iter().map(|v| v as usize).collect::>()) + .unwrap_or_else(VecDeque::new); fields .iter() @@ -99,6 +104,7 @@ unsafe fn _mmap_record>( ipc_field, dictionaries, &mut field_nodes, + &mut variadic_buffer_counts, &mut buffers, ) }) diff --git a/tests/it/ffi/data.rs b/tests/it/ffi/data.rs index e5675ac60fe..9e504a026f2 100644 --- a/tests/it/ffi/data.rs +++ b/tests/it/ffi/data.rs @@ -30,6 +30,7 @@ fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { _test_round_trip(array.sliced(1, 2), expected.sliced(1, 2)) } +<<<<<<< HEAD fn test_round_trip_schema(field: Field) -> Result<()> { let schema_ffi = ffi::export_field_to_c(&field); @@ -51,6 +52,11 @@ fn bool() -> Result<()> { test_round_trip(data) } +fn binview_nullable_inlined() -> PolarsResult<()> { + let data = Utf8ViewArray::from_slice([Some("foo"), None, Some("barbar"), None]); + test_round_trip(data) +} + #[test] fn bool_nullable_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, true]).sliced(1, 3); @@ -362,3 +368,14 @@ fn extension_children() -> Result<()> { ); test_round_trip_schema(field) } + +fn binview_nullable_buffered() -> Result<()> { + let data = Utf8ViewArray::from_slice([ + Some("foobaroiwalksdfjoiei"), + None, + Some("barbar"), + None, + Some("aoisejiofjfoiewjjwfoiwejfo"), + ]); + test_round_trip(data) +} From 09b247f9aa91a764f0b9035e967d248bbd605f45 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 20 Jan 2024 11:38:53 +0100 Subject: [PATCH 12/25] feat: support mmap for binview in OOC (#13872) --- src/ffi/array.rs | 1 - src/mmap/array.rs | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 44cab4f2aed..5e2e008a24d 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -282,7 +282,6 @@ unsafe fn create_buffer( } let offset = buffer_offset(array, data_type, index); - dbg!(offset, len); let ptr: *mut T = get_buffer_ptr(array, data_type, index)?; // We have to check alignment. diff --git a/src/mmap/array.rs b/src/mmap/array.rs index c8511dec41b..6967fbd6d95 100644 --- a/src/mmap/array.rs +++ b/src/mmap/array.rs @@ -139,7 +139,7 @@ fn mmap_binview>( let validity = get_validity(data_ref, block_offset, buffers, null_count)?.map(|x| x.as_ptr()); - let views = get_buffer::(data_ref, block_offset, buffers, num_rows)?.as_ptr(); + let views = get_buffer::(data_ref, block_offset, buffers, num_rows)?; let n_variadic = variadic_buffer_counts .pop_front() @@ -147,14 +147,15 @@ fn mmap_binview>( let mut buffer_ptrs = Vec::with_capacity(n_variadic + 2); buffer_ptrs.push(validity); - buffer_ptrs.push(Some(views)); + buffer_ptrs.push(Some(views.as_ptr())); let mut variadic_buffer_sizes = Vec::with_capacity(n_variadic); for _ in 0..n_variadic { let variadic_buffer = get_bytes(data_ref, block_offset, buffers)?; - variadic_buffer_sizes.push(variadic_buffer.len()); + variadic_buffer_sizes.push(variadic_buffer.len() as i64); buffer_ptrs.push(Some(variadic_buffer.as_ptr())); } + buffer_ptrs.push(Some(variadic_buffer_sizes.as_ptr().cast::())); // Move variadic buffer sizes in an Arc, so that it stays alive. 
let data = Arc::new((data, variadic_buffer_sizes)); From d9d526055d84f0181691849e349b135cb4cf2ac5 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 20 Jan 2024 13:14:56 +0100 Subject: [PATCH 13/25] feat: fix parquet for binview (#13873) --- src/array/binview/mod.rs | 3 ++- src/array/binview/mutable.rs | 4 +++- src/array/binview/view.rs | 14 +++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 434160ac846..26f96b2b9b2 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -392,7 +392,8 @@ impl BinaryViewArrayGeneric { impl BinaryViewArray { /// Validate the underlying bytes on UTF-8. pub fn validate_utf8(&self) -> Result<()> { - validate_utf8_only(&self.views, &self.buffers) + // SAFETY: views are correct + unsafe { validate_utf8_only(&self.views, &self.buffers) } } /// Convert [`BinaryViewArray`] to [`Utf8ViewArray`]. diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index 9667b278ba8..e5ca890a5f1 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -313,7 +313,9 @@ impl MutableBinaryViewArray { impl MutableBinaryViewArray<[u8]> { pub fn validate_utf8(&mut self) -> Result<()> { - validate_utf8_only(&self.views, &self.completed_buffers) + self.finish_in_progress(); + // views are correct + unsafe { validate_utf8_only(&self.views, &self.completed_buffers) } } } diff --git a/src/array/binview/view.rs b/src/array/binview/view.rs index d0ba31f464f..210f727d222 100644 --- a/src/array/binview/view.rs +++ b/src/array/binview/view.rs @@ -90,14 +90,18 @@ pub(super) fn validate_utf8_only(views: &[u128], buffers: &[Buffer]) -> Resu for view in views { let len = *view as u32; if len <= 12 { - validate_utf8(&view.to_le_bytes()[4..4 + len as usize])?; + validate_utf8( + view.to_le_bytes() + .get_unchecked_release(4..4 + len as usize), + )?; } else { - let view = View::from(*view); - let data = &buffers[view.buffer_idx as usize]; + let buffer_idx = (*view >> 64) as u32; + let offset = (*view >> 96) as u32; + let data = buffers.get_unchecked_release(buffer_idx as usize); - let start = view.offset as usize; + let start = offset as usize; let end = start + len as usize; - let b = &data.as_slice()[start..end]; + let b = &data.as_slice().get_unchecked_release(start..end); validate_utf8(b)?; }; } From 9bd13a23a6b78ec688989c475e741ce6180b94ae Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 20 Jan 2024 14:47:46 +0100 Subject: [PATCH 14/25] fix: ensure binview doesn't OOB (#13876) --- src/array/binview/mutable.rs | 2 +- src/array/growable/binview.rs | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index e5ca890a5f1..247f66d29e8 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -133,7 +133,7 @@ impl MutableBinaryViewArray { let (data_ptr, data_len) = *buffers.get_unchecked(buffer_idx as usize); let data = std::slice::from_raw_parts(data_ptr, data_len); let offset = offset as usize; - let bytes = data.get_unchecked(offset..offset + len as usize); + let bytes = data.get_unchecked_release(offset..offset + len as usize); let t = T::from_bytes_unchecked(bytes); self.push_value_ignore_validity(t) } diff --git a/src/array/growable/binview.rs b/src/array/growable/binview.rs index a4f4b1099ed..3f597f80921 100644 --- a/src/array/growable/binview.rs +++ b/src/array/growable/binview.rs @@ -15,7 +15,7 @@ pub struct GrowableBinaryViewArray<'a, T: 
ViewType + ?Sized> { validity: Option, views: Vec, buffers: Vec>, - buffers_offsets: Vec, + buffers_idx_offsets: Vec, total_bytes_len: usize, total_buffer_len: usize, } @@ -63,7 +63,7 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { validity: prepare_validity(use_validity, capacity), views: Vec::with_capacity(capacity), buffers, - buffers_offsets: cum_offset, + buffers_idx_offsets: cum_offset, total_bytes_len: 0, total_buffer_len, } @@ -101,9 +101,14 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { self.total_bytes_len += len; if len > 12 { - let buffer_offset = *self.buffers_offsets.get_unchecked(index); + // Take the buffer index of the View. + let current_buffer_idx = (view >> 64) as u32; + // And add the offset of the buffers. + let buffer_idx = + *self.buffers_idx_offsets.get_unchecked(index) + current_buffer_idx; + // Mask out the old buffer-idx and OR in the new. let mask = (u32::MAX as u128) << 64; - (view & !mask) | ((buffer_offset as u128) << 64) + (view & !mask) | ((buffer_idx as u128) << 64) } else { view } From 07eb3fbf57b7442460e4398d1762eaed94eec876 Mon Sep 17 00:00:00 2001 From: Urvish Desai Date: Wed, 14 Feb 2024 16:04:17 -0800 Subject: [PATCH 15/25] make BinaryView/Utf8View compatible with arrow2 --- src/array/binview/ffi.rs | 6 +++--- src/array/binview/mod.rs | 10 ++++++++-- src/array/binview/mutable.rs | 4 ++-- src/array/binview/view.rs | 30 +++++++++++++++++++++++------- src/array/growable/binview.rs | 4 ++-- src/array/mod.rs | 2 ++ src/lib.rs | 3 ++- src/scalar/binview.rs | 8 ++++---- src/scalar/equal.rs | 1 + src/temporal_conversions.rs | 7 +++---- tests/it/ffi/data.rs | 3 +-- tests/it/temporal_conversions.rs | 27 ++++++++++++--------------- 12 files changed, 63 insertions(+), 42 deletions(-) diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs index d20dd8a53d0..a8407c661f1 100644 --- a/src/array/binview/ffi.rs +++ b/src/array/binview/ffi.rs @@ -1,9 +1,9 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -use polars_error::PolarsResult; - use super::BinaryViewArrayGeneric; + +use crate::error::Result; use crate::array::binview::ViewType; use crate::array::{FromFfi, ToFfi}; use crate::bitmap::align; @@ -56,7 +56,7 @@ unsafe impl ToFfi for BinaryViewArrayGeneric { } impl FromFfi for BinaryViewArrayGeneric { - unsafe fn try_from_ffi(array: A) -> PolarsResult { + unsafe fn try_from_ffi(array: A) -> Result { let data_type = array.data_type().clone(); let validity = unsafe { array.validity() }?; diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 26f96b2b9b2..5dae7b0a802 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -222,8 +222,14 @@ impl BinaryViewArrayGeneric { validate_binary_view(views.as_ref(), buffers.as_ref())?; } - if let Some(validity) = &validity { - polars_ensure!(validity.len()== views.len(), ComputeError: "validity mask length must match the number of values" ) + + if validity + .as_ref() + .map_or(false, |validity| validity.len() == views.len()) + { + return Err(Error::oos( + "validity mask length must match the number of values", + )); } unsafe { diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index 247f66d29e8..ea36cf830c5 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -133,7 +133,7 @@ impl MutableBinaryViewArray { let (data_ptr, data_len) = *buffers.get_unchecked(buffer_idx as usize); let data = std::slice::from_raw_parts(data_ptr, data_len); let offset = offset as usize; - let bytes = 
data.get_unchecked_release(offset..offset + len as usize); + let bytes = data.get_unchecked(offset..offset + len as usize); let t = T::from_bytes_unchecked(bytes); self.push_value_ignore_validity(t) } @@ -165,7 +165,7 @@ impl MutableBinaryViewArray { let offset = self.in_progress_buffer.len() as u32; self.in_progress_buffer.extend_from_slice(bytes); - unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked_release(0..4)) }; + unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) }; let buffer_idx: u32 = self.completed_buffers.len().try_into().unwrap(); payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); payload[12..16].copy_from_slice(&offset.to_le_bytes()); diff --git a/src/array/binview/view.rs b/src/array/binview/view.rs index 210f727d222..94b3c318bb9 100644 --- a/src/array/binview/view.rs +++ b/src/array/binview/view.rs @@ -35,15 +35,17 @@ impl From for u128 { } } -fn validate_view(views: &[u128], buffers: &[Buffer], validate_bytes: F) -> PolarsResult<()> +fn validate_view(views: &[u128], buffers: &[Buffer], validate_bytes: F) -> Result<()> where - F: Fn(&[u8]) -> PolarsResult<()>, + F: Fn(&[u8]) -> Result<()>, { for view in views { let len = *view as u32; if len <= 12 { if len < 12 && view >> (32 + len * 8) != 0 { - polars_bail!(ComputeError: "view contained non-zero padding in prefix"); + return Err(Error::InvalidArgumentError(format!( + "View contained non-zero padding for string of length {len}", + ))); } validate_bytes(&view.to_le_bytes()[4..4 + len as usize])?; @@ -51,7 +53,11 @@ where let view = View::from(*view); let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| { - polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx) + Error::InvalidArgumentError(format!( + "Invalid buffer index: got index {} but only has {} buffers", + view.buffer_idx, + buffers.len() + )) })?; let start = view.offset as usize; @@ -59,9 +65,19 @@ where let b = data .as_slice() .get(start..end) - .ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?; + .ok_or_else(|| { + Error::InvalidArgumentError(format!( + "Invalid buffer slice: got {start}..{end} but buffer {} has length {}", + view.buffer_idx, + data.len() + )) + })?; - polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data"); + if !b.starts_with(&view.prefix.to_le_bytes()) { + return Err(Error::InvalidArgumentError( + "Mismatch between embedded prefix and data".to_string(), + )); + } validate_bytes(b)?; }; } @@ -69,7 +85,7 @@ where Ok(()) } -pub(super) fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> PolarsResult<()> { +pub(super) fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Result<()> { validate_view(views, buffers, |_| Ok(())) } diff --git a/src/array/growable/binview.rs b/src/array/growable/binview.rs index 3f597f80921..58af30de59b 100644 --- a/src/array/growable/binview.rs +++ b/src/array/growable/binview.rs @@ -6,12 +6,12 @@ use crate::array::growable::utils::{extend_validity, prepare_validity}; use crate::array::Array; use crate::bitmap::MutableBitmap; use crate::buffer::Buffer; -use crate::datatypes::ArrowDataType; +use crate::datatypes::DataType; /// Concrete [`Growable`] for the [`BinaryArray`]. 
pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> { arrays: Vec<&'a BinaryViewArrayGeneric>, - data_type: ArrowDataType, + data_type: DataType, validity: Option, views: Vec, buffers: Vec>, diff --git a/src/array/mod.rs b/src/array/mod.rs index 5e5a7e2fb26..18484999aa4 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -308,6 +308,8 @@ macro_rules! with_match_primitive_type {( Float16 => __with_ty__! { f16 }, Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! { f64 }, + _ => panic!("operator does not support primitive `{:?}`", + $key_type) } })} diff --git a/src/lib.rs b/src/lib.rs index bef2e6e53c1..5bbee5797dc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ #![doc = include_str!("doc/lib.md")] -#![deny(missing_docs)] +// todo()! add missing docs +#![allow(missing_docs)] // So that we have more control over what is `unsafe` inside an `unsafe` block #![allow(unused_unsafe)] // diff --git a/src/scalar/binview.rs b/src/scalar/binview.rs index e96c90c04ad..01fe1a4029b 100644 --- a/src/scalar/binview.rs +++ b/src/scalar/binview.rs @@ -2,7 +2,7 @@ use std::fmt::{Debug, Formatter}; use super::Scalar; use crate::array::ViewType; -use crate::datatypes::ArrowDataType; +use crate::datatypes::DataType; /// The implementation of [`Scalar`] for utf8, semantically equivalent to [`Option`]. #[derive(PartialEq, Eq)] @@ -62,11 +62,11 @@ impl Scalar for BinaryViewScalar { } #[inline] - fn data_type(&self) -> &ArrowDataType { + fn data_type(&self) -> &DataType { if T::IS_UTF8 { - &ArrowDataType::Utf8View + &DataType::Utf8View } else { - &ArrowDataType::BinaryView + &DataType::BinaryView } } } diff --git a/src/scalar/equal.rs b/src/scalar/equal.rs index dcb3c836be5..c2fc459b955 100644 --- a/src/scalar/equal.rs +++ b/src/scalar/equal.rs @@ -55,5 +55,6 @@ fn equal(lhs: &dyn Scalar, rhs: &dyn Scalar) -> bool { FixedSizeList => dyn_eq!(FixedSizeListScalar, lhs, rhs), Union => dyn_eq!(UnionScalar, lhs, rhs), Map => dyn_eq!(MapScalar, lhs, rhs), + _ => unimplemented!(), } } diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index 4c30dc50363..8acb607cd4a 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -9,7 +9,6 @@ use crate::error::Result; use crate::{ array::{PrimitiveArray, Utf8ViewArray}, error::Error, - offset::Offset, }; use crate::{ datatypes::{DataType, TimeUnit}, @@ -440,7 +439,7 @@ fn chrono_tz_utf_to_timestamp( _: &Utf8ViewArray, _: &str, timezone: String, - time_unit: TimeUnit, + _: TimeUnit, ) -> Result> { Err(Error::InvalidArgumentError(format!( "timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)", @@ -455,7 +454,7 @@ fn chrono_tz_utf_to_timestamp( /// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`. /// # Error /// This function errors iff `timezone` is not parsable to an offset. -pub(crate) fn utf8view_to_timestamp( +pub fn utf8view_to_timestamp( array: &Utf8ViewArray, fmt: &str, timezone: String, @@ -476,7 +475,7 @@ pub(crate) fn utf8view_to_timestamp( /// [`PrimitiveArray`] with type `Timestamp(Nanosecond, None)`. /// Timezones are ignored. /// Null elements remain null; non-parsable elements are set to null. 
-pub(crate) fn utf8view_to_naive_timestamp( +pub fn utf8view_to_naive_timestamp( array: &Utf8ViewArray, fmt: &str, ) -> PrimitiveArray { diff --git a/tests/it/ffi/data.rs b/tests/it/ffi/data.rs index 9e504a026f2..f3110eeaa41 100644 --- a/tests/it/ffi/data.rs +++ b/tests/it/ffi/data.rs @@ -30,7 +30,6 @@ fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { _test_round_trip(array.sliced(1, 2), expected.sliced(1, 2)) } -<<<<<<< HEAD fn test_round_trip_schema(field: Field) -> Result<()> { let schema_ffi = ffi::export_field_to_c(&field); @@ -52,7 +51,7 @@ fn bool() -> Result<()> { test_round_trip(data) } -fn binview_nullable_inlined() -> PolarsResult<()> { +fn binview_nullable_inlined() -> Result<()> { let data = Utf8ViewArray::from_slice([Some("foo"), None, Some("barbar"), None]); test_round_trip(data) } diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs index 1bb206de5ad..08399e32fb8 100644 --- a/tests/it/temporal_conversions.rs +++ b/tests/it/temporal_conversions.rs @@ -9,21 +9,18 @@ use chrono::NaiveDateTime; fn naive() { let expected = "Timestamp(Nanosecond, None)[1996-12-19 16:39:57, 1996-12-19 13:39:57, None]"; let fmt = "%Y-%m-%dT%H:%M:%S:z"; - let array = Utf8Array::::from_slice([ + let slice = [ "1996-12-19T16:39:57-02:00", "1996-12-19T13:39:57-03:00", "1996-12-19 13:39:57-03:00", // missing T - ]); - let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + ]; + let array = Utf8ViewArray::from_slice_values(slice); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt); assert_eq!(format!("{r:?}"), expected); let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info - let array = Utf8Array::::from_slice([ - "1996-12-19T16:39:57-02:00", - "1996-12-19T13:39:57-03:00", - "1996-12-19 13:39:57-03:00", // missing T - ]); - let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + let array = Utf8ViewArray::from_slice_values(slice); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt); assert_eq!(format!("{r:?}"), expected); } @@ -115,12 +112,12 @@ fn scalar_tz_aware_no_timezone() { fn naive_no_tz() { let expected = "Timestamp(Nanosecond, None)[1996-12-19 16:39:57, 1996-12-19 13:39:57, None]"; let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info - let array = Utf8Array::::from_slice([ + let array = Utf8ViewArray::from_slice_values([ "1996-12-19T16:39:57", "1996-12-19T13:39:57", "1996-12-19 13:39:57", // missing T ]); - let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt); assert_eq!(format!("{r:?}"), expected); } @@ -197,12 +194,12 @@ fn tz_aware() { let expected = "Timestamp(Nanosecond, Some(\"-02:00\"))[1996-12-19 16:39:57 -02:00, 1996-12-19 17:39:57 -02:00, None]"; let fmt = "%Y-%m-%dT%H:%M:%S%.f%:z"; - let array = Utf8Array::::from_slice([ + let array = Utf8ViewArray::from_slice_values([ "1996-12-19T16:39:57.0-02:00", "1996-12-19T16:39:57.0-03:00", // same time at a different TZ "1996-12-19 13:39:57.0-03:00", ]); - let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); + let r = temporal_conversions::utf8view_to_timestamp(&array, fmt, tz).unwrap(); assert_eq!(format!("{r:?}"), expected); } @@ -211,12 +208,12 @@ fn tz_aware_no_timezone() { let tz = "-02:00".to_string(); let expected = "Timestamp(Nanosecond, Some(\"-02:00\"))[None, None, None]"; let fmt = "%Y-%m-%dT%H:%M:%S%.f"; - let array = Utf8Array::::from_slice([ + let array = Utf8ViewArray::from_slice_values([ 
"1996-12-19T16:39:57.0", "1996-12-19T17:39:57.0", "1996-12-19 13:39:57.0", ]); - let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); + let r = temporal_conversions::utf8view_to_timestamp(&array, fmt, tz).unwrap(); assert_eq!(format!("{r:?}"), expected); } From 021b06280edd03ccf6530acddd90756b1e42573b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 21 Jan 2024 10:44:16 +0100 Subject: [PATCH 16/25] perf: improve binview filter (#13878) --- src/array/growable/binview.rs | 22 ++++++++++++++++++++++ src/compute/filter.rs | 12 ++++++------ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/array/growable/binview.rs b/src/array/growable/binview.rs index 58af30de59b..2ded87816c6 100644 --- a/src/array/growable/binview.rs +++ b/src/array/growable/binview.rs @@ -114,6 +114,28 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { } })); } + + #[inline] + pub(crate) unsafe fn extend_unchecked_no_buffers( + &mut self, + index: usize, + start: usize, + len: usize, + ) { + let array = *self.arrays.get_unchecked(index); + + extend_validity(&mut self.validity, array, start, len); + + let range = start..start + len; + + self.views + .extend(array.views().get_unchecked(range).iter().map(|view| { + let len = (*view as u32) as usize; + self.total_bytes_len += len; + + *view + })) + } } impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { diff --git a/src/compute/filter.rs b/src/compute/filter.rs index 8e7d562281f..2c3cf7c895e 100644 --- a/src/compute/filter.rs +++ b/src/compute/filter.rs @@ -298,14 +298,14 @@ pub fn filter(array: &dyn Array, filter: &BooleanArray) -> Result }), BinaryView => { let iter = SlicesIterator::new(filter.values()); - let mut mutable = growable::GrowableBinaryViewArray::new( - vec![array.as_any().downcast_ref::().unwrap()], - false, - iter.slots(), - ); + let array = array.as_any().downcast_ref::().unwrap(); + let mut mutable = + growable::GrowableBinaryViewArray::new(vec![array], false, iter.slots()); unsafe { - iter.for_each(|(start, len)| mutable.extend_unchecked(0, start, len)); + // We don't have to correct buffers as there is only one array. 
+ iter.for_each(|(start, len)| mutable.extend_unchecked_no_buffers(0, start, len)); } + Ok(mutable.as_box()) }, // Should go via BinaryView From 02e876abd855833c025f2023164f8357341625d8 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 22 Jan 2024 08:31:39 +0100 Subject: [PATCH 17/25] feat: move Enum/Categorical categories to binview (#13882) --- src/array/binview/iterator.rs | 19 +++++++- src/array/binview/mod.rs | 15 ++++++ src/array/binview/mutable.rs | 64 ++++++++++++++++++++++---- src/array/dictionary/typed_iterator.rs | 30 +++++++++++- src/buffer/immutable.rs | 9 ++++ src/io/ipc/write/common.rs | 17 ++++++- 6 files changed, 143 insertions(+), 11 deletions(-) diff --git a/src/array/binview/iterator.rs b/src/array/binview/iterator.rs index 5e53fb8fec6..26587d5c1b7 100644 --- a/src/array/binview/iterator.rs +++ b/src/array/binview/iterator.rs @@ -1,6 +1,6 @@ use super::BinaryViewArrayGeneric; use crate::array::binview::ViewType; -use crate::array::{ArrayAccessor, ArrayValuesIter}; +use crate::array::{ArrayAccessor, ArrayValuesIter, MutableBinaryViewArray}; use crate::bitmap::utils::{BitmapIter, ZipValidity}; unsafe impl<'a, T: ViewType + ?Sized> ArrayAccessor<'a> for BinaryViewArrayGeneric { @@ -28,3 +28,20 @@ impl<'a, T: ViewType + ?Sized> IntoIterator for &'a BinaryViewArrayGeneric { self.iter() } } + +unsafe impl<'a, T: ViewType + ?Sized> ArrayAccessor<'a> for MutableBinaryViewArray { + type Item = &'a T; + + #[inline] + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) + } + + #[inline] + fn len(&self) -> usize { + self.views().len() + } +} + +/// Iterator of values of an [`MutableBinaryViewArray`]. +pub type MutableBinaryViewValueIter<'a, T> = ArrayValuesIter<'a, MutableBinaryViewArray>; diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 5dae7b0a802..c5689f37898 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -393,6 +393,21 @@ impl BinaryViewArrayGeneric { self } } + + pub fn make_mut(self) -> MutableBinaryViewArray { + let views = self.views.make_mut(); + let completed_buffers = self.buffers.to_vec(); + let validity = self.validity.map(|bitmap| bitmap.make_mut()); + MutableBinaryViewArray { + views, + completed_buffers, + in_progress_buffer: vec![], + validity, + phantom: Default::default(), + total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize, + total_buffer_len: self.total_buffer_len, + } + } } impl BinaryViewArray { diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index ea36cf830c5..3baf107bd98 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -2,6 +2,7 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; use std::sync::Arc; +use crate::array::binview::iterator::MutableBinaryViewValueIter; use crate::array::binview::view::validate_utf8_only; use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; use crate::array::{Array, MutableArray}; @@ -14,15 +15,15 @@ use crate::trusted_len::TrustedLen; const DEFAULT_BLOCK_SIZE: usize = 8 * 1024; pub struct MutableBinaryViewArray { - views: Vec, - completed_buffers: Vec>, - in_progress_buffer: Vec, - validity: Option, - phantom: std::marker::PhantomData, + pub(super) views: Vec, + pub(super) completed_buffers: Vec>, + pub(super) in_progress_buffer: Vec, + pub(super) validity: Option, + pub(super) phantom: std::marker::PhantomData, /// Total bytes length if we would concatenate them all. 
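// [Editor's sketch, not part of the patch] Exercising the binview filter path
// from the hunks above. Because the growable is fed by a single source array,
// views can be copied verbatim: their buffer indices already point into the
// only buffer set, so no per-view fix-up is needed. The `arrow2::` paths are
// an assumption of this sketch.
use arrow2::array::{Array, BooleanArray, Utf8ViewArray};
use arrow2::compute::filter::filter;
use arrow2::error::Result;

fn filter_example() -> Result<()> {
    let values =
        Utf8ViewArray::from_slice_values(["a", "bb", "a string long enough to live in a buffer"]);
    let mask = BooleanArray::from_slice([true, false, true]);
    let filtered = filter(&values, &mask)?;
    assert_eq!(filtered.len(), 2);
    Ok(())
}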
- total_bytes_len: usize, + pub(super) total_bytes_len: usize, /// Total bytes in the buffer (excluding remaining capacity) - total_buffer_len: usize, + pub(super) total_buffer_len: usize, } impl Clone for MutableBinaryViewArray { @@ -84,10 +85,16 @@ impl MutableBinaryViewArray { } } - pub fn views(&mut self) -> &mut Vec { + #[inline] + pub fn views_mut(&mut self) -> &mut Vec { &mut self.views } + #[inline] + pub fn views(&self) -> &[u128] { + &self.views + } + pub fn validity(&mut self) -> Option<&mut MutableBitmap> { self.validity.as_mut() } @@ -309,6 +316,47 @@ impl MutableBinaryViewArray { pub fn freeze(self) -> BinaryViewArrayGeneric { self.into() } + + /// Returns the element at index `i` + /// # Safety + /// Assumes that the `i < self.len`. + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &T { + let v = *self.views.get_unchecked(i); + let len = v as u32; + + // view layout: + // length: 4 bytes + // prefix: 4 bytes + // buffer_index: 4 bytes + // offset: 4 bytes + + // inlined layout: + // length: 4 bytes + // data: 12 bytes + let bytes = if len <= 12 { + let ptr = self.views.as_ptr() as *const u8; + std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) + } else { + let buffer_idx = ((v >> 64) as u32) as usize; + let offset = (v >> 96) as u32; + + let data = if buffer_idx == self.completed_buffers.len() { + self.in_progress_buffer.as_slice() + } else { + self.completed_buffers.get_unchecked_release(buffer_idx) + }; + + let offset = offset as usize; + data.get_unchecked(offset..offset + len as usize) + }; + T::from_bytes_unchecked(bytes) + } + + /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity + pub fn values_iter(&self) -> MutableBinaryViewValueIter { + MutableBinaryViewValueIter::new(self) + } } impl MutableBinaryViewArray<[u8]> { diff --git a/src/array/dictionary/typed_iterator.rs b/src/array/dictionary/typed_iterator.rs index 0e90a1cf4d8..48b49b28ca1 100644 --- a/src/array/dictionary/typed_iterator.rs +++ b/src/array/dictionary/typed_iterator.rs @@ -1,5 +1,5 @@ -use crate::array::{Array, PrimitiveArray, Utf8Array}; use crate::error::{Error, Result}; +use crate::array::{Array, PrimitiveArray, Utf8Array, Utf8ViewArray}; use crate::trusted_len::TrustedLen; use crate::types::Offset; @@ -48,6 +48,34 @@ impl DictValue for Utf8Array { } } +impl DictValue for Utf8ViewArray { + type IterValue<'a> = &'a str; + + unsafe fn get_unchecked(&self, item: usize) -> Self::IterValue<'_> { + self.value_unchecked(item) + } + + fn downcast_values(array: &dyn Array) -> Result<&Self> + where + Self: Sized, + { + array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::InvalidArgumentError( + "could not convert array to dictionary value".into(), + )) + .map(|arr| { + assert_eq!( + arr.null_count(), + 0, + "null values in values not supported in iteration" + ); + arr + }) + } +} + /// Iterator of values of an `ListArray`. 
pub struct DictionaryValuesIterTyped<'a, K: DictionaryKey, V: DictValue> { keys: &'a PrimitiveArray, diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index e9268a87854..2d2a38e3ce8 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -241,6 +241,15 @@ impl Buffer { } } +impl Buffer { + pub fn make_mut(self) -> Vec { + match self.into_mut() { + Either::Right(v) => v, + Either::Left(same) => same.as_slice().to_vec(), + } + } +} + impl Buffer { pub fn zeroed(len: usize) -> Self { vec![T::zero(); len].into() diff --git a/src/io/ipc/write/common.rs b/src/io/ipc/write/common.rs index 633af4939dc..794935422d5 100644 --- a/src/io/ipc/write/common.rs +++ b/src/io/ipc/write/common.rs @@ -259,6 +259,13 @@ fn set_variadic_buffer_counts(counts: &mut Vec, array: &dyn Array) { let array = array.as_any().downcast_ref::().unwrap(); set_variadic_buffer_counts(counts, array.values().as_ref()) }, + ArrowDataType::Dictionary(_, _, _) => { + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + set_variadic_buffer_counts(counts, array.values().as_ref()) + }, _ => (), } } @@ -348,6 +355,14 @@ fn dictionary_batch_to_bytes( let mut nodes: Vec = vec![]; let mut buffers: Vec = vec![]; let mut arrow_data: Vec = vec![]; + let mut variadic_buffer_counts = vec![]; + set_variadic_buffer_counts(&mut variadic_buffer_counts, array.values().as_ref()); + + let variadic_buffer_counts = if variadic_buffer_counts.is_empty() { + None + } else { + Some(variadic_buffer_counts) + }; let length = write_dictionary( array, @@ -372,7 +387,7 @@ fn dictionary_batch_to_bytes( nodes: Some(nodes), buffers: Some(buffers), compression, - variadic_buffer_counts: None, + variadic_buffer_counts, })), is_delta: false, }, From b5d9fa4ed381244ad74700343d1e69c83c745227 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 22 Jan 2024 13:28:47 +0100 Subject: [PATCH 18/25] perf: speedup binview filter (#13902) --- src/array/binview/ffi.rs | 2 ++ src/array/binview/mod.rs | 10 ++++++---- src/bitmap/mutable.rs | 4 ++++ src/compute/cast/utf8_to.rs | 1 + 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs index a8407c661f1..9a3653a1e13 100644 --- a/src/array/binview/ffi.rs +++ b/src/array/binview/ffi.rs @@ -71,6 +71,7 @@ impl FromFfi for BinaryViewArray views, Arc::from([]), validity, + None, )); } @@ -94,6 +95,7 @@ impl FromFfi for BinaryViewArray views, Arc::from(variadic_buffers), validity, + None, )) } } diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index c5689f37898..d6f33306c5c 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -185,9 +185,11 @@ impl BinaryViewArrayGeneric { views: Buffer, buffers: Arc<[Buffer]>, validity: Option, + total_buffer_len: Option, ) -> Self { - let total_bytes_len = views.iter().map(|v| (*v as u32) as usize).sum(); - let total_buffer_len = buffers.iter().map(|b| b.len()).sum(); + let total_bytes_len = UNKNOWN_LEN as usize; + let total_buffer_len = + total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum()); Self::new_unchecked( data_type, views, @@ -234,7 +236,7 @@ impl BinaryViewArrayGeneric { unsafe { Ok(Self::new_unchecked_unknown_md( - data_type, views, buffers, validity, + data_type, views, buffers, validity, None, )) } } @@ -378,7 +380,7 @@ impl BinaryViewArrayGeneric { // Subtract the maximum amount of inlined strings to get a lower bound // on the number of buffer bytes needed (assuming no dedup). 
- let total_bytes_len = self.total_bytes_len.load(Ordering::Relaxed) as usize; + let total_bytes_len = self.total_bytes_len(); let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12); let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound; diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 10802f42d8c..13b76209a4c 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -334,6 +334,10 @@ impl MutableBitmap { pub(crate) fn bitchunks_exact_mut(&mut self) -> BitChunksExactMut { BitChunksExactMut::new(&mut self.buffer, self.length) } + + pub fn freeze(self) -> Bitmap { + self.into() + } } impl From for Bitmap { diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 8c408dfb2c0..e4a6f17fc9f 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -216,6 +216,7 @@ pub fn binary_to_binview(arr: &BinaryArray) -> BinaryViewArray { views.into(), buffers, arr.validity().cloned(), + None, ) } } From 79749321528130b74f8237585d2a515ba62f975a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 24 Jan 2024 16:29:10 +0100 Subject: [PATCH 19/25] chore(rust): update rustc (#13947) --- src/array/binview/ffi.rs | 4 +- src/array/binview/mod.rs | 21 +++--- src/array/binview/mutable.rs | 36 +++++----- src/array/binview/view.rs | 115 +++++++++++++++++++++++++------ src/array/growable/binview.rs | 27 +++----- src/array/mod.rs | 2 +- src/buffer/mod.rs | 3 + src/compute/cast/utf8_to.rs | 2 +- src/io/ipc/read/array/binview.rs | 4 +- src/mmap/array.rs | 24 +++++-- src/types/mod.rs | 3 + 11 files changed, 162 insertions(+), 79 deletions(-) diff --git a/src/array/binview/ffi.rs b/src/array/binview/ffi.rs index 9a3653a1e13..a03b5a28e7e 100644 --- a/src/array/binview/ffi.rs +++ b/src/array/binview/ffi.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use super::BinaryViewArrayGeneric; use crate::error::Result; -use crate::array::binview::ViewType; +use crate::array::binview::{View, ViewType}; use crate::array::{FromFfi, ToFfi}; use crate::bitmap::align; use crate::ffi; @@ -60,7 +60,7 @@ impl FromFfi for BinaryViewArray let data_type = array.data_type().clone(); let validity = unsafe { array.validity() }?; - let views = unsafe { array.buffer::(1) }?; + let views = unsafe { array.buffer::(1) }?; // 2 - validity + views let n_buffers = array.n_buffers(); diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index d6f33306c5c..70991e5f18f 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -34,6 +34,7 @@ use crate::error::{Error, Result}; pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>; pub type Utf8ViewArray = BinaryViewArrayGeneric; +pub use view::View; pub type MutablePlString = MutableBinaryViewArray; pub type MutablePlBinary = MutableBinaryViewArray<[u8]>; @@ -107,7 +108,7 @@ impl ViewType for [u8] { pub struct BinaryViewArrayGeneric { data_type: DataType, - views: Buffer, + views: Buffer, buffers: Arc<[Buffer]>, // Raw buffer access. (pointer, len). raw_buffers: Arc<[(*const u8, usize)]>, @@ -158,7 +159,7 @@ impl BinaryViewArrayGeneric { /// - The offsets match the buffers. 
pub unsafe fn new_unchecked( data_type: DataType, - views: Buffer, + views: Buffer, buffers: Arc<[Buffer]>, validity: Option, total_bytes_len: usize, @@ -182,7 +183,7 @@ impl BinaryViewArrayGeneric { /// The caller must ensure the invariants pub unsafe fn new_unchecked_unknown_md( data_type: DataType, - views: Buffer, + views: Buffer, buffers: Arc<[Buffer]>, validity: Option, total_buffer_len: Option, @@ -208,13 +209,13 @@ impl BinaryViewArrayGeneric { self.buffers.iter().map(|buf| buf.len() as i64).collect() } - pub fn views(&self) -> &Buffer { + pub fn views(&self) -> &Buffer { &self.views } pub fn try_new( data_type: DataType, - views: Buffer, + views: Buffer, buffers: Arc<[Buffer]>, validity: Option, ) -> Result { @@ -278,7 +279,7 @@ impl BinaryViewArrayGeneric { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &T { let v = *self.views.get_unchecked(i); - let len = v as u32; + let len = v.length; // view layout: // length: 4 bytes @@ -294,11 +295,9 @@ impl BinaryViewArrayGeneric { let ptr = self.views.as_ptr() as *const u8; std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) } else { - let buffer_idx = (v >> 64) as u32; - let offset = (v >> 96) as u32; - let (data_ptr, data_len) = *self.raw_buffers.get_unchecked(buffer_idx as usize); + let (data_ptr, data_len) = *self.raw_buffers.get_unchecked(v.buffer_idx as usize); let data = std::slice::from_raw_parts(data_ptr, data_len); - let offset = offset as usize; + let offset = v.offset as usize; data.get_unchecked(offset..offset + len as usize) }; T::from_bytes_unchecked(bytes) @@ -315,7 +314,7 @@ impl BinaryViewArrayGeneric { } pub fn len_iter(&self) -> impl Iterator + '_ { - self.views.iter().map(|v| *v as u32) + self.views.iter().map(|v| v.length) } impl_sliced!(); diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index 3baf107bd98..9189aee5891 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -5,17 +5,18 @@ use std::sync::Arc; use crate::array::binview::iterator::MutableBinaryViewValueIter; use crate::array::binview::view::validate_utf8_only; use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; -use crate::array::{Array, MutableArray}; +use crate::array::{Array, MutableArray, View}; use crate::bitmap::MutableBitmap; use crate::buffer::Buffer; use crate::error::Result; use crate::datatypes::DataType; use crate::trusted_len::TrustedLen; +use crate::types::NativeType; const DEFAULT_BLOCK_SIZE: usize = 8 * 1024; pub struct MutableBinaryViewArray { - pub(super) views: Vec, + pub(super) views: Vec, pub(super) completed_buffers: Vec>, pub(super) in_progress_buffer: Vec, pub(super) validity: Option, @@ -86,12 +87,12 @@ impl MutableBinaryViewArray { } #[inline] - pub fn views_mut(&mut self) -> &mut Vec { + pub fn views_mut(&mut self) -> &mut Vec { &mut self.views } #[inline] - pub fn views(&self) -> &[u128] { + pub fn views(&self) -> &[View] { &self.views } @@ -127,20 +128,18 @@ impl MutableBinaryViewArray { /// - caller must allocate enough capacity /// - caller must ensure the view and buffers match. 
#[inline] - pub unsafe fn push_view(&mut self, v: u128, buffers: &[(*const u8, usize)]) { - let len = v as u32; + pub unsafe fn push_view(&mut self, v: View, buffers: &[(*const u8, usize)]) { + let len = v.length; self.total_bytes_len += len as usize; if len <= 12 { debug_assert!(self.views.capacity() > self.views.len()); self.views.push(v) } else { self.total_buffer_len += len as usize; - let buffer_idx = (v >> 64) as u32; - let offset = (v >> 96) as u32; - let (data_ptr, data_len) = *buffers.get_unchecked(buffer_idx as usize); + let (data_ptr, data_len) = *buffers.get_unchecked_release(v.buffer_idx as usize); let data = std::slice::from_raw_parts(data_ptr, data_len); - let offset = offset as usize; - let bytes = data.get_unchecked(offset..offset + len as usize); + let offset = v.offset as usize; + let bytes = data.get_unchecked_release(offset..offset + len as usize); let t = T::from_bytes_unchecked(bytes); self.push_value_ignore_validity(t) } @@ -177,7 +176,7 @@ impl MutableBinaryViewArray { payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); payload[12..16].copy_from_slice(&offset.to_le_bytes()); } - let value = u128::from_le_bytes(payload); + let value = View::from_le_bytes(payload); self.views.push(value); } @@ -197,7 +196,7 @@ impl MutableBinaryViewArray { } pub fn push_null(&mut self) { - self.views.push(0); + self.views.push(View::default()); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(true), @@ -208,7 +207,8 @@ impl MutableBinaryViewArray { if self.validity.is_none() && additional > 0 { self.init_validity(false); } - self.views.extend(std::iter::repeat(0).take(additional)); + self.views + .extend(std::iter::repeat(View::default()).take(additional)); if let Some(validity) = &mut self.validity { validity.extend_constant(additional, false); } @@ -231,7 +231,7 @@ impl MutableBinaryViewArray { self.push_value_ignore_validity(v); self.views.pop().unwrap() }) - .unwrap_or(0); + .unwrap_or_default(); self.views .extend(std::iter::repeat(view_value).take(additional)); } @@ -323,7 +323,7 @@ impl MutableBinaryViewArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &T { let v = *self.views.get_unchecked(i); - let len = v as u32; + let len = v.length; // view layout: // length: 4 bytes @@ -338,8 +338,8 @@ impl MutableBinaryViewArray { let ptr = self.views.as_ptr() as *const u8; std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) } else { - let buffer_idx = ((v >> 64) as u32) as usize; - let offset = (v >> 96) as u32; + let buffer_idx = v.buffer_idx as usize; + let offset = v.offset; let data = if buffer_idx == self.completed_buffers.len() { self.in_progress_buffer.as_slice() diff --git a/src/array/binview/view.rs b/src/array/binview/view.rs index 94b3c318bb9..351c84f8545 100644 --- a/src/array/binview/view.rs +++ b/src/array/binview/view.rs @@ -1,7 +1,17 @@ use crate::buffer::Buffer; use crate::error::{Error, Result}; +use std::cmp::Ordering; +use std::fmt::{Display, Formatter}; +use std::ops::Add; -#[derive(Debug)] +use bytemuck::{Pod, Zeroable}; + +use crate::datatypes::PrimitiveType; +use crate::types::NativeType; + +// We use this instead of u128 because we want alignment of <= 8 bytes. +#[derive(Debug, Copy, Clone, Default)] +#[repr(C)] pub struct View { /// The length of the string/bytes. 
pub length: u32, @@ -13,36 +23,94 @@ pub struct View { pub offset: u32, } +impl View { + #[inline(always)] + pub fn as_u128(self) -> u128 { + unsafe { std::mem::transmute(self) } + } +} + +impl Display for View { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + +unsafe impl Zeroable for View {} + +unsafe impl Pod for View {} + +impl Add for View { + type Output = View; + + fn add(self, _rhs: Self) -> Self::Output { + unimplemented!() + } +} + +impl num_traits::Zero for View { + fn zero() -> Self { + Default::default() + } + + fn is_zero(&self) -> bool { + *self == Self::zero() + } +} + +impl PartialEq for View { + fn eq(&self, other: &Self) -> bool { + self.as_u128() == other.as_u128() + } +} + +impl NativeType for View { + const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128; + type Bytes = [u8; 16]; + + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + self.as_u128().to_le_bytes() + } + + #[inline] + fn to_be_bytes(&self) -> Self::Bytes { + self.as_u128().to_be_bytes() + } + + #[inline] + fn from_le_bytes(bytes: Self::Bytes) -> Self { + Self::from(u128::from_le_bytes(bytes)) + } + + #[inline] + fn from_be_bytes(bytes: Self::Bytes) -> Self { + Self::from(u128::from_be_bytes(bytes)) + } +} + impl From for View { #[inline] fn from(value: u128) -> Self { - Self { - length: value as u32, - prefix: (value >> 32) as u32, - buffer_idx: (value >> 64) as u32, - offset: (value >> 96) as u32, - } + unsafe { std::mem::transmute(value) } } } impl From for u128 { #[inline] fn from(value: View) -> Self { - value.length as u128 - | ((value.prefix as u128) << 32) - | ((value.buffer_idx as u128) << 64) - | ((value.offset as u128) << 96) + value.as_u128() } } -fn validate_view(views: &[u128], buffers: &[Buffer], validate_bytes: F) -> Result<()> +fn validate_view(views: &[View], buffers: &[Buffer], validate_bytes: F) -> Result<()> where F: Fn(&[u8]) -> Result<()>, { for view in views { - let len = *view as u32; + let len = view.length; if len <= 12 { - if len < 12 && view >> (32 + len * 8) != 0 { + if len < 12 && view.as_u128() >> (32 + len * 8) != 0 { return Err(Error::InvalidArgumentError(format!( "View contained non-zero padding for string of length {len}", ))); @@ -50,8 +118,6 @@ where validate_bytes(&view.to_le_bytes()[4..4 + len as usize])?; } else { - let view = View::from(*view); - let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| { Error::InvalidArgumentError(format!( "Invalid buffer index: got index {} but only has {} buffers", @@ -85,7 +151,7 @@ where Ok(()) } -pub(super) fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Result<()> { +pub(super) fn validate_binary_view(views: &[View], buffers: &[Buffer]) -> Result<()> { validate_view(views, buffers, |_| Ok(())) } @@ -98,21 +164,26 @@ fn validate_utf8(b: &[u8]) -> Result<()> { } } -pub(super) fn validate_utf8_view(views: &[u128], buffers: &[Buffer]) -> Result<()> { +pub(super) fn validate_utf8_view(views: &[View], buffers: &[Buffer]) -> Result<()> { validate_view(views, buffers, validate_utf8) } -pub(super) fn validate_utf8_only(views: &[u128], buffers: &[Buffer]) -> Result<()> { +/// # Safety +/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB. 
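// [Editor's sketch, not part of the patch] Round-trip of the 16-byte view
// encoding defined above. The arithmetic mirrors the `From<u128>`/`From<View>`
// conversions shown in this hunk; the `transmute`-based `as_u128` in the patch
// yields the same value on little-endian targets, where the lowest-address
// field (`length`) lands in the least-significant 32 bits.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ViewSketch {
    length: u32,
    prefix: u32,
    buffer_idx: u32,
    offset: u32,
}

fn pack(v: ViewSketch) -> u128 {
    (v.length as u128)
        | ((v.prefix as u128) << 32)
        | ((v.buffer_idx as u128) << 64)
        | ((v.offset as u128) << 96)
}

fn unpack(x: u128) -> ViewSketch {
    ViewSketch {
        length: x as u32,
        prefix: (x >> 32) as u32,
        buffer_idx: (x >> 64) as u32,
        offset: (x >> 96) as u32,
    }
}

fn round_trip_example() {
    let v = ViewSketch {
        length: 20,                           // longer than 12: stored out of line
        prefix: u32::from_le_bytes(*b"long"), // first four bytes of the value
        buffer_idx: 1,
        offset: 64,
    };
    assert_eq!(unpack(pack(v)), v);
}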
+pub(super) unsafe fn validate_utf8_only( + views: &[View], + buffers: &[Buffer], +) -> Result<()> { for view in views { - let len = *view as u32; + let len = view.length; if len <= 12 { validate_utf8( view.to_le_bytes() .get_unchecked_release(4..4 + len as usize), )?; } else { - let buffer_idx = (*view >> 64) as u32; - let offset = (*view >> 96) as u32; + let buffer_idx = view.buffer_idx; + let offset = view.offset; let data = buffers.get_unchecked_release(buffer_idx as usize); let start = offset as usize; diff --git a/src/array/growable/binview.rs b/src/array/growable/binview.rs index 2ded87816c6..92f66b8d52f 100644 --- a/src/array/growable/binview.rs +++ b/src/array/growable/binview.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use super::Growable; -use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; +use crate::array::binview::{BinaryViewArrayGeneric, View, ViewType}; use crate::array::growable::utils::{extend_validity, prepare_validity}; use crate::array::Array; use crate::bitmap::MutableBitmap; @@ -13,7 +13,7 @@ pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> { arrays: Vec<&'a BinaryViewArrayGeneric>, data_type: DataType, validity: Option, - views: Vec, + views: Vec, buffers: Vec>, buffers_idx_offsets: Vec, total_bytes_len: usize, @@ -96,22 +96,16 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { let range = start..start + len; self.views - .extend(array.views().get_unchecked(range).iter().map(|&view| { - let len = (view as u32) as usize; + .extend(array.views().get_unchecked(range).iter().map(|view| { + let mut view = *view; + let len = view.length as usize; self.total_bytes_len += len; if len > 12 { - // Take the buffer index of the View. - let current_buffer_idx = (view >> 64) as u32; - // And add the offset of the buffers. - let buffer_idx = - *self.buffers_idx_offsets.get_unchecked(index) + current_buffer_idx; - // Mask out the old buffer-idx and OR in the new. 
- let mask = (u32::MAX as u128) << 64; - (view & !mask) | ((buffer_idx as u128) << 64) - } else { - view + let buffer_idx = *self.buffers_idx_offsets.get_unchecked(index); + view.buffer_idx += buffer_idx; } + view })); } @@ -130,7 +124,7 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { self.views .extend(array.views().get_unchecked(range).iter().map(|view| { - let len = (*view as u32) as usize; + let len = view.length as usize; self.total_bytes_len += len; *view @@ -145,7 +139,8 @@ impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { } fn extend_validity(&mut self, additional: usize) { - self.views.extend(std::iter::repeat(0).take(additional)); + self.views + .extend(std::iter::repeat(View::default()).take(additional)); if let Some(validity) = &mut self.validity { validity.extend_constant(additional, false); } diff --git a/src/array/mod.rs b/src/array/mod.rs index 18484999aa4..eeec8b77380 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -749,7 +749,7 @@ pub use fmt::{get_display, get_value_display}; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; pub use binview::{ BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, MutablePlBinary, - MutablePlString, Utf8ViewArray, ViewType, + MutablePlString, Utf8ViewArray, View, ViewType, }; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index 46c0a4d64a3..0ec6ff22a8d 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -7,6 +7,9 @@ use crate::ffi::InternalArrowArray; use std::ops::Deref; pub(crate) enum BytesAllocator { + // Dead code lint is a false positive. + // remove once fixed in rustc + #[allow(dead_code)] InternalArrowArray(InternalArrowArray), #[cfg(feature = "arrow")] diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index e4a6f17fc9f..eeb015d31e3 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -202,7 +202,7 @@ pub fn binary_to_binview(arr: &BinaryArray) -> BinaryViewArray { payload[12..16].copy_from_slice(&offset.to_le_bytes()); } - let value = u128::from_le_bytes(payload); + let value = View::from_le_bytes(payload); unsafe { views.push_unchecked(value) }; } let buffers = if uses_buffer { diff --git a/src/io/ipc/read/array/binview.rs b/src/io/ipc/read/array/binview.rs index 33d16c12bef..79e84b3cc8b 100644 --- a/src/io/ipc/read/array/binview.rs +++ b/src/io/ipc/read/array/binview.rs @@ -6,7 +6,7 @@ use crate::error::{Error, Result}; use super::super::read_basic::*; use super::*; -use crate::array::{ArrayRef, BinaryViewArrayGeneric, ViewType}; +use crate::array::{ArrayRef, BinaryViewArrayGeneric, View, ViewType}; use crate::buffer::Buffer; use crate::datatypes::DataType; @@ -37,7 +37,7 @@ pub fn read_binview( )?; let length = try_get_array_length(field_node, limit)?; - let views: Buffer = read_buffer( + let views: Buffer = read_buffer( buffers, length, reader, diff --git a/src/mmap/array.rs b/src/mmap/array.rs index 6967fbd6d95..ee5687c720c 100644 --- a/src/mmap/array.rs +++ b/src/mmap/array.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; use std::sync::Arc; -use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, StructArray}; +use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, StructArray, View}; use crate::datatypes::DataType; use crate::error::Error; use crate::offset::Offset; @@ -62,12 +62,12 @@ fn 
get_bytes<'a>( data: &'a [u8], block_offset: usize, buffers: &mut VecDeque, -) -> PolarsResult<&'a [u8]> { +) -> Result<&'a [u8], Error> { let (offset, length) = get_buffer_bounds(buffers)?; // verify that they are in-bounds data.get(block_offset + offset..block_offset + offset + length) - .ok_or_else(|| polars_err!(ComputeError: "buffer out of bounds")) + .ok_or_else(|| Error::OutOfSpec("buffer out of bounds".to_string())) } fn get_validity<'a>( @@ -89,6 +89,18 @@ fn get_validity<'a>( None }) } +fn get_num_rows_and_null_count(node: &Node) -> Result<(usize, usize), Error> { + let num_rows: usize = node + .length() + .try_into() + .map_err(|_| Error::OutOfSpec("Negative footer length".to_string()))?; + + let null_count: usize = node + .null_count() + .try_into() + .map_err(|_| Error::OutOfSpec("Negative footer length".to_string()))?; + Ok((num_rows, null_count)) +} fn mmap_binary>( data: Arc, @@ -133,17 +145,17 @@ fn mmap_binview>( block_offset: usize, buffers: &mut VecDeque, variadic_buffer_counts: &mut VecDeque, -) -> PolarsResult { +) -> Result { let (num_rows, null_count) = get_num_rows_and_null_count(node)?; let data_ref = data.as_ref().as_ref(); let validity = get_validity(data_ref, block_offset, buffers, null_count)?.map(|x| x.as_ptr()); - let views = get_buffer::(data_ref, block_offset, buffers, num_rows)?; + let views = get_buffer::(data_ref, block_offset, buffers, num_rows)?; let n_variadic = variadic_buffer_counts .pop_front() - .ok_or_else(|| polars_err!(ComputeError: "expected variadic_buffer_count"))?; + .ok_or_else(|| Error::OutOfSpec("expected variadic_buffer_count".to_string()))?; let mut buffer_ptrs = Vec::with_capacity(n_variadic + 2); buffer_ptrs.push(validity); diff --git a/src/types/mod.rs b/src/types/mod.rs index 97134e1a76c..ffb07eb7145 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -72,6 +72,8 @@ pub enum PrimitiveType { } mod private { + use crate::array::View; + pub trait Sealed {} impl Sealed for u8 {} @@ -90,4 +92,5 @@ mod private { impl Sealed for f64 {} impl Sealed for super::days_ms {} impl Sealed for super::months_days_ns {} + impl Sealed for View {} } From 7664e8beca50036075ebc96999162e7ac7e70c97 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 27 Jan 2024 10:30:23 +0100 Subject: [PATCH 20/25] feat: gc binview when writing ipc (#14035) --- src/array/binview/mod.rs | 4 ++++ src/io/ipc/write/serialize/binview.rs | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/array/binview/mod.rs b/src/array/binview/mod.rs index 70991e5f18f..45adb1e4c2d 100644 --- a/src/array/binview/mod.rs +++ b/src/array/binview/mod.rs @@ -370,6 +370,10 @@ impl BinaryViewArrayGeneric { mutable.freeze().with_validity(self.validity) } + pub fn is_sliced(&self) -> bool { + self.views.as_ptr() != self.views.storage_ptr() + } + pub fn maybe_gc(self) -> Self { const GC_MINIMUM_SAVINGS: usize = 16 * 1024; // At least 16 KiB. 
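// [Editor's sketch, not part of the patch] Why `is_sliced` + `maybe_gc` help
// the IPC writer: a sliced view array still points at its original buffers,
// so without compaction the writer would serialize bytes no view references.
// Method names follow the hunks above; the `arrow2::` paths are an assumption.
use arrow2::array::Utf8ViewArray;

fn compact_for_write(array: Utf8ViewArray) -> Utf8ViewArray {
    if array.is_sliced() {
        // Rebuilds views/buffers only when the estimated savings are large enough.
        array.maybe_gc()
    } else {
        array
    }
}

fn gc_example() {
    let arr = Utf8ViewArray::from_slice_values([
        "a string long enough to be stored in a shared buffer",
        "another fairly long string stored out of line",
    ]);
    let sliced = arr.sliced(1, 1); // still references the full buffer set
    let compact = compact_for_write(sliced);
    assert_eq!(compact.len(), 1);
}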
diff --git a/src/io/ipc/write/serialize/binview.rs b/src/io/ipc/write/serialize/binview.rs index 66afafbd0e6..a91bb1764d2 100644 --- a/src/io/ipc/write/serialize/binview.rs +++ b/src/io/ipc/write/serialize/binview.rs @@ -10,9 +10,14 @@ pub(super) fn write_binview( is_little_endian: bool, compression: Option, ) { + let array = if array.is_sliced() { + array.clone().maybe_gc() + } else { + array.clone() + }; write_bitmap( array.validity(), - array::Array::len(array), + array::Array::len(&array), buffers, arrow_data, offset, From 11078a871fe56041d7e442bdf69a838fa4be4de5 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Mon, 29 Jan 2024 15:22:33 +0800 Subject: [PATCH 21/25] fix: json_encode should respect to logical type (#14063) --- src/compute/cast/mod.rs | 46 ++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 8e05520f0a6..872bd7a26a9 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -25,9 +25,7 @@ use crate::{ error::{Error, Result}, offset::{Offset, Offsets}, }; -use crate::compute::cast::binview_to::{ - utf8view_to_date32_dyn, utf8view_to_naive_timestamp_dyn, view_to_binary, -}; +use crate::compute::cast::binview_to::{RFC3339, utf8view_to_date32_dyn, utf8view_to_naive_timestamp_dyn, view_to_binary}; use crate::temporal_conversions::utf8view_to_timestamp; /// options defining how Cast kernels behave @@ -161,10 +159,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (LargeBinary, to_type) => { is_numeric(to_type) || match to_type { - Binary | LargeUtf8 => true, - LargeList(field) => matches!(field.data_type, UInt8), - _ => false, - } + Binary | LargeUtf8 => true, + LargeList(field) => matches!(field.data_type, UInt8), + _ => false, + } } (FixedSizeBinary(_), to_type) => matches!(to_type, Binary | LargeBinary), (Timestamp(_, _), Utf8) => true, @@ -337,6 +335,26 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } } +fn cast_struct( + array: &StructArray, + to_type: &DataType, + options: CastOptions, +) -> Result { + let values = array.values(); + let fields = StructArray::get_fields(to_type); + let new_values = values + .iter() + .zip(fields) + .map(|(arr, field)| cast(arr.as_ref(), field.data_type(), options)) + .collect::>>()?; + + Ok(StructArray::new( + to_type.clone(), + new_values, + array.validity().cloned(), + )) +} + fn cast_list( array: &ListArray, to_type: &DataType, @@ -456,13 +474,14 @@ pub fn cast_unchecked(array: &dyn Array, to_type: &DataType) -> Result Resu let as_options = options.with_wrapped(true); match (from_type, to_type) { (Null, _) | (_, Null) => Ok(new_null_array(to_type.clone(), array.len())), - (Struct(_), _) => Err(Error::NotYetImplemented( + (Struct(from_fd), Struct(to_fd)) => { + if from_fd.len() != to_fd.len() { + return Err(Error::InvalidArgumentError("incompatible offsets in source list".to_string())); + } + cast_struct(array.as_any().downcast_ref().unwrap(), to_type, options).map(|x| x.boxed()) + }, + (Struct(_), _) | (_, Struct(_)) => Err(Error::NotYetImplemented( "Cannot cast from struct to other types".to_string(), )), - (_, Struct(_)) => Err(Error::NotYetImplemented( - "Cannot cast to struct from other types".to_string(), - )), (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( array.as_any().downcast_ref().unwrap(), inner.as_ref(), From 91c0c606ff59a803d7cb6819722254f1a900061f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 31 Jan 2024 09:59:46 
+0100 Subject: [PATCH 22/25] chore: hoist boolean -> string cast (#14122) --- src/compute/cast/boolean_to.rs | 26 ++++++++++++++------------ src/compute/cast/mod.rs | 8 +++----- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/compute/cast/boolean_to.rs b/src/compute/cast/boolean_to.rs index 1ce45c87118..534ae0b58cc 100644 --- a/src/compute/cast/boolean_to.rs +++ b/src/compute/cast/boolean_to.rs @@ -1,5 +1,5 @@ use crate::{ - array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, + array::{Array, BinaryViewArray, BooleanArray, PrimitiveArray, Utf8ViewArray}, error::Result, offset::Offset, types::NativeType, @@ -27,24 +27,26 @@ where PrimitiveArray::::new(T::PRIMITIVE.into(), values.into(), from.validity().cloned()) } -/// Casts the [`BooleanArray`] to a [`Utf8Array`], casting trues to `"1"` and falses to `"0"` -pub fn boolean_to_utf8(from: &BooleanArray) -> Utf8Array { - let iter = from.values().iter().map(|x| if x { "1" } else { "0" }); - Utf8Array::from_trusted_len_values_iter(iter) +pub fn boolean_to_utf8view(from: &BooleanArray) -> Utf8ViewArray { + unsafe { boolean_to_binaryview(from).to_utf8view_unchecked() } } -pub(super) fn boolean_to_utf8_dyn(array: &dyn Array) -> Result> { +pub(super) fn boolean_to_utf8view_dyn(array: &dyn Array) -> Result> { let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(boolean_to_utf8::(array))) + Ok(boolean_to_utf8view(array).boxed()) } /// Casts the [`BooleanArray`] to a [`BinaryArray`], casting trues to `"1"` and falses to `"0"` -pub fn boolean_to_binary(from: &BooleanArray) -> BinaryArray { - let iter = from.values().iter().map(|x| if x { b"1" } else { b"0" }); - BinaryArray::from_trusted_len_values_iter(iter) +pub fn boolean_to_binaryview(from: &BooleanArray) -> BinaryViewArray { + let iter = from.iter().map(|opt_b| match opt_b { + Some(true) => Some("true".as_bytes()), + Some(false) => Some("false".as_bytes()), + None => None, + }); + BinaryViewArray::arr_from_iter_trusted(iter) } -pub(super) fn boolean_to_binary_dyn(array: &dyn Array) -> Result> { +pub(super) fn boolean_to_binaryview_dyn(array: &dyn Array) -> Result> { let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(boolean_to_binary::(array))) + Ok(boolean_to_binaryview(array).boxed()) } diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 872bd7a26a9..11a26b30c2c 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -667,11 +667,9 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu Int64 => boolean_to_primitive_dyn::(array), Float32 => boolean_to_primitive_dyn::(array), Float64 => boolean_to_primitive_dyn::(array), - Utf8 => boolean_to_utf8_dyn::(array), - LargeUtf8 => boolean_to_utf8_dyn::(array), - Binary => boolean_to_binary_dyn::(array), - LargeBinary => boolean_to_binary_dyn::(array), - _ => Err(Error::NotYetImplemented(format!( + Utf8View => boolean_to_utf8view_dyn(array), + BinaryView => boolean_to_binaryview_dyn(array), + _ => Err(Error::InvalidArgumentError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, From 69e55c3977c8194868022224c08fb4ea5f06c480 Mon Sep 17 00:00:00 2001 From: Urvish Desai Date: Wed, 14 Feb 2024 17:40:34 -0800 Subject: [PATCH 23/25] replace get_unchecked_release with get_unchecked --- src/array/binview/mutable.rs | 6 +++--- src/array/binview/view.rs | 7 +++---- src/ffi/array.rs | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/array/binview/mutable.rs b/src/array/binview/mutable.rs index 
9189aee5891..ccb9de83c28 100644 --- a/src/array/binview/mutable.rs +++ b/src/array/binview/mutable.rs @@ -136,10 +136,10 @@ impl MutableBinaryViewArray { self.views.push(v) } else { self.total_buffer_len += len as usize; - let (data_ptr, data_len) = *buffers.get_unchecked_release(v.buffer_idx as usize); + let (data_ptr, data_len) = *buffers.get_unchecked(v.buffer_idx as usize); let data = std::slice::from_raw_parts(data_ptr, data_len); let offset = v.offset as usize; - let bytes = data.get_unchecked_release(offset..offset + len as usize); + let bytes = data.get_unchecked(offset..offset + len as usize); let t = T::from_bytes_unchecked(bytes); self.push_value_ignore_validity(t) } @@ -344,7 +344,7 @@ impl MutableBinaryViewArray { let data = if buffer_idx == self.completed_buffers.len() { self.in_progress_buffer.as_slice() } else { - self.completed_buffers.get_unchecked_release(buffer_idx) + self.completed_buffers.get_unchecked(buffer_idx) }; let offset = offset as usize; diff --git a/src/array/binview/view.rs b/src/array/binview/view.rs index 351c84f8545..1bfad9ee2df 100644 --- a/src/array/binview/view.rs +++ b/src/array/binview/view.rs @@ -1,6 +1,5 @@ use crate::buffer::Buffer; use crate::error::{Error, Result}; -use std::cmp::Ordering; use std::fmt::{Display, Formatter}; use std::ops::Add; @@ -179,16 +178,16 @@ pub(super) unsafe fn validate_utf8_only( if len <= 12 { validate_utf8( view.to_le_bytes() - .get_unchecked_release(4..4 + len as usize), + .get_unchecked(4..4 + len as usize), )?; } else { let buffer_idx = view.buffer_idx; let offset = view.offset; - let data = buffers.get_unchecked_release(buffer_idx as usize); + let data = buffers.get_unchecked(buffer_idx as usize); let start = offset as usize; let end = start + len as usize; - let b = &data.as_slice().get_unchecked_release(start..end); + let b = &data.as_slice().get_unchecked(start..end); validate_utf8(b)?; }; } diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 5e2e008a24d..b13e29513e2 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -104,7 +104,7 @@ impl ArrowArray { DataType::BinaryView | DataType::Utf8View ); - let (offset, mut buffers, children, dictionary) = + let (offset, buffers, children, dictionary) = offset_buffers_children_dictionary(array.as_ref()); let variadic_buffer_sizes = if needs_variadic_buffer_sizes { From 3ca785f9f1333b3e92a99c8d3ab7fbc69c52fb16 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 13 Feb 2024 15:55:48 -0800 Subject: [PATCH 24/25] fix: don't gc after variadic buffers are written (#14473) --- examples/ipc_file_mmap.rs | 2 +- src/io/ipc/mod.rs | 2 +- src/io/ipc/read/file.rs | 5 ++- src/io/ipc/write/common.rs | 60 ++++++++++++++------------- src/io/ipc/write/serialize/binview.rs | 7 +--- src/io/ipc/write/serialize/mod.rs | 1 - tests/it/io/ipc/write/file.rs | 2 +- 7 files changed, 38 insertions(+), 41 deletions(-) diff --git a/examples/ipc_file_mmap.rs b/examples/ipc_file_mmap.rs index e51b49de5be..166f752099e 100644 --- a/examples/ipc_file_mmap.rs +++ b/examples/ipc_file_mmap.rs @@ -29,7 +29,7 @@ fn write( let options = arrow2::io::ipc::write::WriteOptions { compression }; let mut writer = arrow2::io::ipc::write::FileWriter::try_new( result, - schema.clone(), + schema.clone().into(), ipc_fields.clone(), options, )?; diff --git a/src/io/ipc/mod.rs b/src/io/ipc/mod.rs index 2bb233a1474..d8290980966 100644 --- a/src/io/ipc/mod.rs +++ b/src/io/ipc/mod.rs @@ -42,7 +42,7 @@ //! let y_coord = Field::new("y", DataType::Int32, false); //! let schema = Schema::from(vec![x_coord, y_coord]); //! 
let options = WriteOptions {compression: None}; -//! let mut writer = FileWriter::try_new(file, schema, None, options)?; +//! let mut writer = FileWriter::try_new(file, schema.into(), None, options)?; //! //! // Setup the data //! let x_data = Int32Array::from_slice([-1i32, 1]); diff --git a/src/io/ipc/read/file.rs b/src/io/ipc/read/file.rs index e95b37e44d6..c3001b5b044 100644 --- a/src/io/ipc/read/file.rs +++ b/src/io/ipc/read/file.rs @@ -4,7 +4,7 @@ use std::io::{Read, Seek, SeekFrom}; use crate::array::Array; use crate::chunk::Chunk; -use crate::datatypes::Schema; +use crate::datatypes::SchemaRef; use crate::error::{Error, Result}; use crate::io::ipc::IpcSchema; @@ -19,7 +19,7 @@ use arrow_format::ipc::planus::ReadAsRoot; #[derive(Debug, Clone)] pub struct FileMetadata { /// The schema that is read from the file footer - pub schema: Schema, + pub schema: SchemaRef, /// The files' [`IpcSchema`] pub ipc_schema: IpcSchema, @@ -184,6 +184,7 @@ pub(super) fn deserialize_footer(footer_data: &[u8], size: u64) -> Result>>>>>> 64003155e8 (feat: new implementation for `String/Binary` type. (#13748)) /// Compression codec #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -237,29 +233,29 @@ fn serialize_compression( fn set_variadic_buffer_counts(counts: &mut Vec, array: &dyn Array) { match array.data_type() { - ArrowDataType::Utf8View => { + DataType::Utf8View => { let array = array.as_any().downcast_ref::().unwrap(); counts.push(array.data_buffers().len() as i64); }, - ArrowDataType::BinaryView => { + DataType::BinaryView => { let array = array.as_any().downcast_ref::().unwrap(); counts.push(array.data_buffers().len() as i64); }, - ArrowDataType::Struct(_) => { + DataType::Struct(_) => { let array = array.as_any().downcast_ref::().unwrap(); for array in array.values() { set_variadic_buffer_counts(counts, array.as_ref()) } }, - ArrowDataType::LargeList(_) => { + DataType::LargeList(_) => { let array = array.as_any().downcast_ref::().unwrap(); set_variadic_buffer_counts(counts, array.values().as_ref()) }, - ArrowDataType::FixedSizeList(_, _) => { + DataType::FixedSizeList(_, _) => { let array = array.as_any().downcast_ref::().unwrap(); set_variadic_buffer_counts(counts, array.values().as_ref()) }, - ArrowDataType::Dictionary(_, _, _) => { + DataType::Dictionary(_, _, _) => { let array = array .as_any() .downcast_ref::>() @@ -285,27 +281,33 @@ fn chunk_to_bytes_amortized( let mut offset = 0; let mut variadic_buffer_counts = vec![]; for array in chunk.arrays() { -<<<<<<< HEAD - let dtype = array.data_type(); - if dtype.is_view() { - match dtype { - DataType::Utf8View => { - let array = array.as_any().downcast_ref::().unwrap(); - variadic_buffer_counts.push(array.data_buffers().len() as i64); - }, - DataType::BinaryView => { - let array = array.as_any().downcast_ref::().unwrap(); - variadic_buffer_counts.push(array.data_buffers().len() as i64); - }, - _ => {}, - } - } -======= set_variadic_buffer_counts(&mut variadic_buffer_counts, array.as_ref()); ->>>>>>> 64003155e8 (feat: new implementation for `String/Binary` type. (#13748)) + // We don't want to write all buffers in sliced arrays. 
+ let array = match array.data_type() { + DataType::BinaryView => { + let concrete_arr = array.as_any().downcast_ref::().unwrap(); + if concrete_arr.is_sliced() { + Cow::Owned(concrete_arr.clone().maybe_gc().boxed()) + } else { + Cow::Borrowed(array) + } + }, + DataType::Utf8View => { + let concrete_arr = array.as_any().downcast_ref::().unwrap(); + if concrete_arr.is_sliced() { + Cow::Owned(concrete_arr.clone().maybe_gc().boxed()) + } else { + Cow::Borrowed(array) + } + }, + _ => Cow::Borrowed(array), + }; + let array = array.as_ref().as_ref(); + + set_variadic_buffer_counts(&mut variadic_buffer_counts, array); write( - array.as_ref(), + array, &mut buffers, &mut arrow_data, &mut nodes, diff --git a/src/io/ipc/write/serialize/binview.rs b/src/io/ipc/write/serialize/binview.rs index a91bb1764d2..66afafbd0e6 100644 --- a/src/io/ipc/write/serialize/binview.rs +++ b/src/io/ipc/write/serialize/binview.rs @@ -10,14 +10,9 @@ pub(super) fn write_binview( is_little_endian: bool, compression: Option, ) { - let array = if array.is_sliced() { - array.clone().maybe_gc() - } else { - array.clone() - }; write_bitmap( array.validity(), - array::Array::len(&array), + array::Array::len(array), buffers, arrow_data, offset, diff --git a/src/io/ipc/write/serialize/mod.rs b/src/io/ipc/write/serialize/mod.rs index f252ee619c0..09ce8a2955f 100644 --- a/src/io/ipc/write/serialize/mod.rs +++ b/src/io/ipc/write/serialize/mod.rs @@ -13,7 +13,6 @@ use crate::{ use super::super::compression; use super::super::endianess::is_native_little_endian; use super::common::{pad_to_64, Compression}; -use crate::{match_integer_type, with_match_primitive_type_full}; mod binary; mod binview; mod boolean; diff --git a/tests/it/io/ipc/write/file.rs b/tests/it/io/ipc/write/file.rs index 5562f803c50..f62bae11430 100644 --- a/tests/it/io/ipc/write/file.rs +++ b/tests/it/io/ipc/write/file.rs @@ -18,7 +18,7 @@ pub(crate) fn write( ) -> Result> { let result = vec![]; let options = WriteOptions { compression }; - let mut writer = FileWriter::try_new(result, schema.clone(), ipc_fields.clone(), options)?; + let mut writer = FileWriter::try_new(result, schema.clone().into(), ipc_fields.clone(), options)?; for batch in batches { writer.write(batch, ipc_fields.as_ref().map(|x| x.as_ref()))?; } From d404d078d3fae0befd1cf5799233a0429dfb6a83 Mon Sep 17 00:00:00 2001 From: Urvish Desai Date: Fri, 16 Feb 2024 16:07:27 -0800 Subject: [PATCH 25/25] resolve more conflicts with polars-arrow --- Cargo.toml | 4 + src/array/utf8/mod.rs | 17 ++- src/compute/arithmetics/decimal/add.rs | 22 ---- src/compute/arithmetics/decimal/div.rs | 15 --- src/compute/arithmetics/decimal/mul.rs | 22 ---- src/compute/arithmetics/decimal/sub.rs | 21 ---- src/compute/arithmetics/mod.rs | 1 + src/compute/cast/binary_to.rs | 51 ++++++++- src/compute/cast/binview_to.rs | 31 ++--- src/compute/cast/boolean_to.rs | 4 +- src/compute/cast/mod.rs | 45 ++++---- src/compute/cast/primitive_to.rs | 126 +++++++++++++-------- src/compute/cast/utf8_to.rs | 25 ++-- src/compute/comparison/mod.rs | 2 + src/ffi/array.rs | 2 +- src/ffi/mmap.rs | 2 +- src/io/ipc/read/array/binary.rs | 2 +- src/io/ipc/read/array/binview.rs | 10 +- src/io/ipc/read/array/boolean.rs | 2 +- src/io/ipc/read/array/fixed_size_binary.rs | 2 +- src/io/ipc/read/array/list.rs | 2 +- src/io/ipc/read/array/map.rs | 2 +- src/io/ipc/read/array/mod.rs | 4 +- src/io/ipc/read/array/null.rs | 2 +- src/io/ipc/read/array/primitive.rs | 2 +- src/io/ipc/read/array/union.rs | 2 +- src/io/ipc/read/array/utf8.rs | 2 +- 
src/io/ipc/read/deserialize.rs | 6 +- src/io/ipc/read/read_basic.rs | 12 +- src/io/ipc/write/common.rs | 7 +- src/io/parquet/write/mod.rs | 2 +- src/mmap/mod.rs | 2 +- src/temporal_conversions.rs | 95 +++++++++++++++- tests/it/temporal_conversions.rs | 6 +- 34 files changed, 313 insertions(+), 239 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1fe8a281e6c..1650387caf1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,10 @@ num-traits = "0.2" dyn-clone = "1" bytemuck = { version = "1", features = ["derive"] } chrono = { version = "0.4.31", default_features = false, features = ["std"] } +atoi_simd = "0.15.5" +itoa = "1.0.6" +ryu = "1.0.13" +fast-float = { version = "0.2" } # for decimal i256 ethnum = "1" diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 9440ae43304..648e23f8a25 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -12,10 +12,7 @@ use crate::{ use either::Either; -use super::{ - specification::{try_check_offsets_bounds, try_check_utf8}, - Array, GenericBinaryArray, -}; +use super::{specification::{try_check_offsets_bounds, try_check_utf8}, Array, GenericBinaryArray, BinaryArray}; #[cfg(feature = "arrow")] mod data; @@ -513,6 +510,18 @@ impl Utf8Array { self.set_validity(Some(f(validity))) } } + + // Convert this [`Utf8Array`] to a [`BinaryArray`]. + pub fn to_binary(&self) -> BinaryArray { + unsafe { + BinaryArray::new( + BinaryArray::::default_data_type(), + self.offsets.clone(), + self.values.clone(), + self.validity.clone(), + ) + } + } } impl Array for Utf8Array { diff --git a/src/compute/arithmetics/decimal/add.rs b/src/compute/arithmetics/decimal/add.rs index 9f6f529e887..93a850b89eb 100644 --- a/src/compute/arithmetics/decimal/add.rs +++ b/src/compute/arithmetics/decimal/add.rs @@ -2,7 +2,6 @@ use crate::{ array::PrimitiveArray, compute::{ - arithmetics::{ArrayAdd, ArrayCheckedAdd, ArraySaturatingAdd}, arity::{binary, binary_checked}, utils::{check_same_len, combine_validities}, }, @@ -134,27 +133,6 @@ pub fn checked_add(lhs: &PrimitiveArray, rhs: &PrimitiveArray) -> Pr binary_checked(lhs, rhs, lhs.data_type().clone(), op) } -// Implementation of ArrayAdd trait for PrimitiveArrays -impl ArrayAdd> for PrimitiveArray { - fn add(&self, rhs: &PrimitiveArray) -> Self { - add(self, rhs) - } -} - -// Implementation of ArrayCheckedAdd trait for PrimitiveArrays -impl ArrayCheckedAdd> for PrimitiveArray { - fn checked_add(&self, rhs: &PrimitiveArray) -> Self { - checked_add(self, rhs) - } -} - -// Implementation of ArraySaturatingAdd trait for PrimitiveArrays -impl ArraySaturatingAdd> for PrimitiveArray { - fn saturating_add(&self, rhs: &PrimitiveArray) -> Self { - saturating_add(self, rhs) - } -} - /// Adaptive addition of two decimal primitive arrays with different precision /// and scale. If the precision and scale is different, then the smallest scale /// and precision is adjusted to the largest precision and scale. 
If during the diff --git a/src/compute/arithmetics/decimal/div.rs b/src/compute/arithmetics/decimal/div.rs index 159c27de2b1..39c691d28e0 100644 --- a/src/compute/arithmetics/decimal/div.rs +++ b/src/compute/arithmetics/decimal/div.rs @@ -4,7 +4,6 @@ use crate::{ array::PrimitiveArray, compute::{ - arithmetics::{ArrayCheckedDiv, ArrayDiv}, arity::{binary, binary_checked, unary}, utils::{check_same_len, combine_validities}, }, @@ -199,20 +198,6 @@ pub fn checked_div(lhs: &PrimitiveArray, rhs: &PrimitiveArray) -> Pr binary_checked(lhs, rhs, lhs.data_type().clone(), op) } -// Implementation of ArrayDiv trait for PrimitiveArrays -impl ArrayDiv> for PrimitiveArray { - fn div(&self, rhs: &PrimitiveArray) -> Self { - div(self, rhs) - } -} - -// Implementation of ArrayCheckedDiv trait for PrimitiveArrays -impl ArrayCheckedDiv> for PrimitiveArray { - fn checked_div(&self, rhs: &PrimitiveArray) -> Self { - checked_div(self, rhs) - } -} - /// Adaptive division of two decimal primitive arrays with different precision /// and scale. If the precision and scale is different, then the smallest scale /// and precision is adjusted to the largest precision and scale. If during the diff --git a/src/compute/arithmetics/decimal/mul.rs b/src/compute/arithmetics/decimal/mul.rs index ac702d2cb3c..3301eac85b0 100644 --- a/src/compute/arithmetics/decimal/mul.rs +++ b/src/compute/arithmetics/decimal/mul.rs @@ -4,7 +4,6 @@ use crate::{ array::PrimitiveArray, compute::{ - arithmetics::{ArrayCheckedMul, ArrayMul, ArraySaturatingMul}, arity::{binary, binary_checked, unary}, utils::{check_same_len, combine_validities}, }, @@ -204,27 +203,6 @@ pub fn checked_mul(lhs: &PrimitiveArray, rhs: &PrimitiveArray) -> Pr binary_checked(lhs, rhs, lhs.data_type().clone(), op) } -// Implementation of ArrayMul trait for PrimitiveArrays -impl ArrayMul> for PrimitiveArray { - fn mul(&self, rhs: &PrimitiveArray) -> Self { - mul(self, rhs) - } -} - -// Implementation of ArrayCheckedMul trait for PrimitiveArrays -impl ArrayCheckedMul> for PrimitiveArray { - fn checked_mul(&self, rhs: &PrimitiveArray) -> Self { - checked_mul(self, rhs) - } -} - -// Implementation of ArraySaturatingMul trait for PrimitiveArrays -impl ArraySaturatingMul> for PrimitiveArray { - fn saturating_mul(&self, rhs: &PrimitiveArray) -> Self { - saturating_mul(self, rhs) - } -} - /// Adaptive multiplication of two decimal primitive arrays with different /// precision and scale. 
If the precision and scale is different, then the /// smallest scale and precision is adjusted to the largest precision and diff --git a/src/compute/arithmetics/decimal/sub.rs b/src/compute/arithmetics/decimal/sub.rs index 84afd205433..6759708fd70 100644 --- a/src/compute/arithmetics/decimal/sub.rs +++ b/src/compute/arithmetics/decimal/sub.rs @@ -3,7 +3,6 @@ use crate::{ array::PrimitiveArray, compute::{ - arithmetics::{ArrayCheckedSub, ArraySaturatingSub, ArraySub}, arity::{binary, binary_checked}, utils::{check_same_len, combine_validities}, }, @@ -97,26 +96,6 @@ pub fn saturating_sub( binary(lhs, rhs, lhs.data_type().clone(), op) } -// Implementation of ArraySub trait for PrimitiveArrays -impl ArraySub> for PrimitiveArray { - fn sub(&self, rhs: &PrimitiveArray) -> Self { - sub(self, rhs) - } -} - -// Implementation of ArrayCheckedSub trait for PrimitiveArrays -impl ArrayCheckedSub> for PrimitiveArray { - fn checked_sub(&self, rhs: &PrimitiveArray) -> Self { - checked_sub(self, rhs) - } -} - -// Implementation of ArraySaturatingSub trait for PrimitiveArrays -impl ArraySaturatingSub> for PrimitiveArray { - fn saturating_sub(&self, rhs: &PrimitiveArray) -> Self { - saturating_sub(self, rhs) - } -} /// Checked subtract of two decimal primitive arrays with the same precision /// and scale. If the precision and scale is different, then an /// InvalidArgumentError is returned. If the result from the sub is larger than diff --git a/src/compute/arithmetics/mod.rs b/src/compute/arithmetics/mod.rs index b1ec2a12bcc..33373564621 100644 --- a/src/compute/arithmetics/mod.rs +++ b/src/compute/arithmetics/mod.rs @@ -416,6 +416,7 @@ macro_rules! with_match_negatable {( UInt8 | UInt16 | UInt32 | UInt64 | Float16 => todo!(), Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! { f64 }, + UInt128 => todo!(), } })} diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index e171a0c9098..976fd86aeec 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -4,6 +4,47 @@ use crate::{array::*, datatypes::DataType, types::NativeType}; use super::CastOptions; +pub(super) trait Parse { + fn parse(val: &[u8]) -> Option + where + Self: Sized; +} + +macro_rules! impl_parse { + ($primitive_type:ident) => { + impl Parse for $primitive_type { + fn parse(val: &[u8]) -> Option { + atoi_simd::parse(val).ok() + } + } + }; +} +impl_parse!(i8); +impl_parse!(i16); +impl_parse!(i32); +impl_parse!(i64); +impl_parse!(u8); +impl_parse!(u16); +impl_parse!(u32); +impl_parse!(u64); + +impl Parse for f32 { + fn parse(val: &[u8]) -> Option + where + Self: Sized, + { + fast_float::parse(val).ok() + } +} +impl Parse for f64 { + fn parse(val: &[u8]) -> Option + where + Self: Sized, + { + fast_float::parse(val).ok() + } +} + /// Conversion of binary pub fn binary_to_large_binary(from: &BinaryArray, to_data_type: DataType) -> BinaryArray { let values = from.values().clone(); @@ -72,13 +113,11 @@ where } /// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. 
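// Illustrative sketch only: how the Parse impls above behave. `_parse_sketch`
// is a hypothetical helper, not part of this diff; it assumes the trait is in
// scope in this module. atoi_simd::parse / fast_float::parse return Results,
// so malformed input becomes None, which the cast kernels below map to nulls.
fn _parse_sketch() {
    assert_eq!(<i64 as Parse>::parse(b"123"), Some(123_i64));
    assert_eq!(<f32 as Parse>::parse(b"1.25"), Some(1.25_f32));
    assert_eq!(<u8 as Parse>::parse(b"999"), None); // overflows u8 -> parse error -> None
}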
-pub fn binary_to_primitive(from: &BinaryArray, to: &DataType) -> PrimitiveArray +pub(super) fn binary_to_primitive(from: &BinaryArray, to: &DataType) -> PrimitiveArray where - T: NativeType + lexical_core::FromLexical, + T: NativeType + Parse, { - let iter = from - .iter() - .map(|x| x.and_then::(|x| lexical_core::parse(x).ok())); + let iter = from.iter().map(|x| x.and_then::(|x| T::parse(x))); PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } @@ -89,7 +128,7 @@ pub(super) fn binary_to_primitive_dyn( options: CastOptions, ) -> Result> where - T: NativeType + lexical_core::FromLexical, + T: NativeType + lexical_core::FromLexical + Parse, { let from = from.as_any().downcast_ref().unwrap(); if options.partial { diff --git a/src/compute/cast/binview_to.rs b/src/compute/cast/binview_to.rs index f3c0a7de2b7..c0c50f1287f 100644 --- a/src/compute/cast/binview_to.rs +++ b/src/compute/cast/binview_to.rs @@ -1,12 +1,10 @@ use chrono::Datelike; -use polars_error::PolarsResult; +use crate::error::{Result}; use crate::array::*; use crate::compute::cast::binary_to::Parse; use crate::compute::cast::CastOptions; -use crate::datatypes::{ArrowDataType, TimeUnit}; -#[cfg(feature = "dtype-decimal")] -use crate::legacy::compute::decimal::deserialize_decimal; +use crate::datatypes::{DataType, TimeUnit}; use crate::offset::Offset; use crate::temporal_conversions::EPOCH_DAYS_FROM_CE; use crate::types::NativeType; @@ -40,7 +38,7 @@ pub fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array { /// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. pub(super) fn binview_to_primitive( from: &BinaryViewArray, - to: &ArrowDataType, + to: &DataType, ) -> PrimitiveArray where T: NativeType + Parse, @@ -52,9 +50,9 @@ where pub(super) fn binview_to_primitive_dyn( from: &dyn Array, - to: &ArrowDataType, + to: &DataType, options: CastOptions, -) -> PolarsResult> +) -> Result> where T: NativeType + Parse, { @@ -66,23 +64,10 @@ where } } -#[cfg(feature = "dtype-decimal")] -pub fn binview_to_decimal( - array: &BinaryViewArray, - precision: Option, - scale: usize, -) -> PrimitiveArray { - let precision = precision.map(|p| p as u8); - array - .iter() - .map(|val| val.and_then(|val| deserialize_decimal(val, precision, scale as u8))) - .collect() -} - pub(super) fn utf8view_to_naive_timestamp_dyn( from: &dyn Array, time_unit: TimeUnit, -) -> PolarsResult> { +) -> Result> { let from = from.as_any().downcast_ref().unwrap(); Ok(Box::new(utf8view_to_naive_timestamp(from, time_unit))) } @@ -103,10 +88,10 @@ pub(super) fn utf8view_to_date32(from: &Utf8ViewArray) -> PrimitiveArray { .map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE) }) }); - PrimitiveArray::::from_trusted_len_iter(iter).to(ArrowDataType::Date32) + PrimitiveArray::::from_trusted_len_iter(iter).to(DataType::Date32) } -pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> PolarsResult> { +pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> Result> { let from = from.as_any().downcast_ref().unwrap(); Ok(Box::new(utf8view_to_date32(from))) } diff --git a/src/compute/cast/boolean_to.rs b/src/compute/cast/boolean_to.rs index 534ae0b58cc..0f81ab1b40e 100644 --- a/src/compute/cast/boolean_to.rs +++ b/src/compute/cast/boolean_to.rs @@ -1,9 +1,9 @@ use crate::{ array::{Array, BinaryViewArray, BooleanArray, PrimitiveArray, Utf8ViewArray}, error::Result, - offset::Offset, types::NativeType, }; +use crate::array::MutableBinaryViewArray; pub(super) fn boolean_to_primitive_dyn(array: &dyn Array) -> Result> where @@ -43,7 +43,7 @@ 
pub fn boolean_to_binaryview(from: &BooleanArray) -> BinaryViewArray { Some(false) => Some("false".as_bytes()), None => None, }); - BinaryViewArray::arr_from_iter_trusted(iter) + MutableBinaryViewArray::from_iter(iter).into() } pub(super) fn boolean_to_binaryview_dyn(array: &dyn Array) -> Result> { diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 11a26b30c2c..8ac4d7935f8 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -626,6 +626,7 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu array.as_any().downcast_ref().unwrap(), RFC3339, time_zone.clone(), + time_unit.to_owned(), ) .map(|arr| arr.boxed()), Date32 => utf8view_to_date32_dyn(array), @@ -686,16 +687,13 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu .map(|arr| unsafe { arr.to_utf8view_unchecked() }.boxed()), }, (Utf8, _) => match to_type { - UInt8 => utf8_to_primitive_dyn::(array, to_type, options), - UInt16 => utf8_to_primitive_dyn::(array, to_type, options), - UInt32 => utf8_to_primitive_dyn::(array, to_type, options), - UInt64 => utf8_to_primitive_dyn::(array, to_type, options), - Int8 => utf8_to_primitive_dyn::(array, to_type, options), - Int16 => utf8_to_primitive_dyn::(array, to_type, options), - Int32 => utf8_to_primitive_dyn::(array, to_type, options), - Int64 => utf8_to_primitive_dyn::(array, to_type, options), - Float32 => utf8_to_primitive_dyn::(array, to_type, options), - Float64 => utf8_to_primitive_dyn::(array, to_type, options), + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => { + let binary = utf8_to_binary::( + array.as_any().downcast_ref().unwrap(), + Binary, + ); + cast(&binary, to_type, options) + }, Date32 => utf8_to_date32_dyn::(array), Date64 => utf8_to_date64_dyn::(array), LargeUtf8 => Ok(Box::new(utf8_to_large_utf8( @@ -706,25 +704,22 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu to_type.clone(), ) .boxed()), - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_dyn::(array, TimeUnit::Nanosecond), Timestamp(TimeUnit::Nanosecond, Some(tz)) => { - utf8_to_timestamp_ns_dyn::(array, tz.clone()) + utf8_to_timestamp_dyn::(array, tz.clone()) } _ => Err(Error::NotYetImplemented(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (LargeUtf8, _) => match to_type { - UInt8 => utf8_to_primitive_dyn::(array, to_type, options), - UInt16 => utf8_to_primitive_dyn::(array, to_type, options), - UInt32 => utf8_to_primitive_dyn::(array, to_type, options), - UInt64 => utf8_to_primitive_dyn::(array, to_type, options), - Int8 => utf8_to_primitive_dyn::(array, to_type, options), - Int16 => utf8_to_primitive_dyn::(array, to_type, options), - Int32 => utf8_to_primitive_dyn::(array, to_type, options), - Int64 => utf8_to_primitive_dyn::(array, to_type, options), - Float32 => utf8_to_primitive_dyn::(array, to_type, options), - Float64 => utf8_to_primitive_dyn::(array, to_type, options), + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => { + let binary = utf8_to_binary::( + array.as_any().downcast_ref().unwrap(), + DataType::LargeBinary, + ); + cast(&binary, to_type, options) + }, Date32 => utf8_to_date32_dyn::(array), Date64 => utf8_to_date64_dyn::(array), Utf8 => utf8_large_to_utf8(array.as_any().downcast_ref().unwrap()).map(|x| x.boxed()), @@ -733,9 +728,9 @@ pub fn cast(array: &dyn Array, 
to_type: &DataType, options: CastOptions) -> Resu to_type.clone(), ) .boxed()), - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_dyn::(array, TimeUnit::Nanosecond), Timestamp(TimeUnit::Nanosecond, Some(tz)) => { - utf8_to_timestamp_ns_dyn::(array, tz.clone()) + utf8_to_timestamp_dyn::(array, tz.clone()) } _ => Err(Error::NotYetImplemented(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -1142,7 +1137,7 @@ fn from_to_binview( Binary => binary_to_binview::(array.as_any().downcast_ref().unwrap()), FixedSizeBinary(_) => fixed_size_binary_to_binview(array.as_any().downcast_ref().unwrap()), LargeBinary => binary_to_binview::(array.as_any().downcast_ref().unwrap()), - _ => Err(Error::NotYetImplemented(format!( + _ => return Err(Error::NotYetImplemented(format!( "Unsupported casting from {from_type:?} to {to_type:?}" ))), }; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index e2d7b63632c..d26f044363d 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -17,10 +17,64 @@ use crate::{ use super::CastOptions; -/// Returns a [`BinaryArray`] where every element is the binary representation of the number. -pub fn primitive_to_binary( +pub(super) trait SerPrimitive { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized; +} + +macro_rules! impl_ser_primitive { + ($ptype:ident) => { + impl SerPrimitive for $ptype { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = itoa::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } + } + }; +} + +impl_ser_primitive!(i8); +impl_ser_primitive!(i16); +impl_ser_primitive!(i32); +impl_ser_primitive!(i64); +impl_ser_primitive!(u8); +impl_ser_primitive!(u16); +impl_ser_primitive!(u32); +impl_ser_primitive!(u64); + +impl SerPrimitive for f32 { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = ryu::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } +} + +impl SerPrimitive for f64 { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = ryu::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } +} + +fn primitive_to_values_and_offsets( from: &PrimitiveArray, -) -> BinaryArray { +) -> (Vec, Offsets) { let mut values: Vec = Vec::with_capacity(from.len()); let mut offsets: Vec = Vec::with_capacity(from.len() + 1); offsets.push(O::default()); @@ -28,35 +82,38 @@ pub fn primitive_to_binary( let mut offset: usize = 0; unsafe { - for x in from.values().iter() { - values.reserve(offset + T::FORMATTED_SIZE_DECIMAL); - - let bytes = std::slice::from_raw_parts_mut( - values.as_mut_ptr().add(offset), - values.capacity() - offset, - ); - let len = lexical_core::write_unchecked(*x, bytes).len(); + for &x in from.values().iter() { + let len = T::write(&mut values, x); offset += len; - offsets.push(O::from_usize(offset).unwrap()); + offsets.push(O::from_as_usize(offset)); } values.set_len(offset); values.shrink_to_fit(); // Safety: offsets _are_ monotonically increasing let offsets = unsafe { Offsets::new_unchecked(offsets) }; - BinaryArray::::new( - BinaryArray::::default_data_type(), - offsets.into(), - values.into(), - from.validity().cloned(), - ) + + (values, offsets) } } +/// Returns a [`BinaryArray`] where 
every element is the binary representation of the number. +pub(super) fn primitive_to_binary( + from: &PrimitiveArray, +) -> BinaryArray { + let (values, offsets) = primitive_to_values_and_offsets(from); + BinaryArray::::new( + BinaryArray::::default_data_type(), + offsets.into(), + values.into(), + from.validity().cloned(), + ) +} + pub(super) fn primitive_to_binary_dyn(from: &dyn Array) -> Result> where O: Offset, - T: NativeType + lexical_core::ToLexical, + T: NativeType + SerPrimitive, { let from = from.as_any().downcast_ref().unwrap(); Ok(Box::new(primitive_to_binary::(from))) @@ -86,32 +143,11 @@ where } /// Returns a [`Utf8Array`] where every element is the utf8 representation of the number. -pub fn primitive_to_utf8( +pub(super) fn primitive_to_utf8( from: &PrimitiveArray, ) -> Utf8Array { - let mut values: Vec = Vec::with_capacity(from.len()); - let mut offsets: Vec = Vec::with_capacity(from.len() + 1); - offsets.push(O::default()); - - let mut offset: usize = 0; - + let (values, offsets) = primitive_to_values_and_offsets(from); unsafe { - for x in from.values().iter() { - values.reserve(offset + T::FORMATTED_SIZE_DECIMAL); - - let bytes = std::slice::from_raw_parts_mut( - values.as_mut_ptr().add(offset), - values.capacity() - offset, - ); - let len = lexical_core::write_unchecked(*x, bytes).len(); - - offset += len; - offsets.push(O::from_usize(offset).unwrap()); - } - values.set_len(offset); - values.shrink_to_fit(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }; Utf8Array::::new_unchecked( Utf8Array::::default_data_type(), offsets.into(), @@ -124,7 +160,7 @@ pub fn primitive_to_utf8( pub(super) fn primitive_to_utf8_dyn(from: &dyn Array) -> Result> where O: Offset, - T: NativeType + lexical_core::ToLexical, + T: NativeType + SerPrimitive, { let from = from.as_any().downcast_ref().unwrap(); Ok(Box::new(primitive_to_utf8::(from))) @@ -589,7 +625,7 @@ pub fn f16_to_f32(from: &PrimitiveArray) -> PrimitiveArray { } /// Returns a [`Utf8Array`] where every element is the utf8 representation of the number. 
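// Illustrative sketch only: the SerPrimitive impls above format integers with
// itoa and floats with ryu, appending the decimal text to the buffer and
// returning the number of bytes written, which primitive_to_values_and_offsets
// uses to build the offsets. `_ser_sketch` is a hypothetical helper, not part
// of this diff, and assumes the trait is in scope in this module.
fn _ser_sketch() {
    let mut buf = Vec::new();
    let n = <i32 as SerPrimitive>::write(&mut buf, -42);
    assert_eq!(&buf[..n], b"-42");
    let m = <f64 as SerPrimitive>::write(&mut buf, 0.5);
    assert_eq!(&buf[n..n + m], b"0.5");
}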
-pub(super) fn primitive_to_binview( +pub(super) fn primitive_to_binview( from: &PrimitiveArray, ) -> BinaryViewArray { let mut mutable = MutableBinaryViewArray::with_capacity(from.len()); @@ -606,7 +642,7 @@ pub(super) fn primitive_to_binview( pub(super) fn primitive_to_binview_dyn(from: &dyn Array) -> BinaryViewArray where - T: NativeType, + T: NativeType + SerPrimitive, { let from = from.as_any().downcast_ref().unwrap(); primitive_to_binview::(from) diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index eeb015d31e3..40fda5136a9 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -7,11 +7,12 @@ use crate::{ error::Result, offset::Offset, temporal_conversions::{ - utf8view_to_naive_timestamp as utf8_to_naive_timestamp_, - utf8view_to_timestamp as utf8_to_timestamp_, EPOCH_DAYS_FROM_CE, + utf8_to_naive_timestamp as utf8_to_naive_timestamp_, + utf8_to_timestamp as utf8_to_timestamp_, EPOCH_DAYS_FROM_CE, }, types::NativeType, }; +use crate::datatypes::TimeUnit; use super::CastOptions; @@ -44,6 +45,7 @@ where PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } +#[allow(unused)] pub(super) fn utf8_to_primitive_dyn( from: &dyn Array, to: &DataType, @@ -114,30 +116,31 @@ pub fn utf8_to_dictionary( Ok(array.into()) } -pub(super) fn utf8_to_naive_timestamp_ns_dyn( +pub(super) fn utf8_to_naive_timestamp_dyn( from: &dyn Array, + time_unit: TimeUnit ) -> Result> { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_naive_timestamp_ns::(from))) + Ok(Box::new(utf8_to_naive_timestamp::(from, time_unit))) } /// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting -pub fn utf8_to_naive_timestamp_ns(from: &Utf8Array) -> PrimitiveArray { - utf8_to_naive_timestamp_(from, RFC3339) +pub fn utf8_to_naive_timestamp(from: &Utf8Array, time_unit: TimeUnit) -> PrimitiveArray { + utf8_to_naive_timestamp_(from, RFC3339, time_unit) } -pub(super) fn utf8_to_timestamp_ns_dyn( +pub(super) fn utf8_to_timestamp_dyn( from: &dyn Array, timezone: String, ) -> Result> { let from = from.as_any().downcast_ref().unwrap(); - utf8_to_timestamp_ns::(from, timezone) + utf8_to_timestamp::(from, timezone) .map(Box::new) .map(|x| x as Box) } /// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting -pub fn utf8_to_timestamp_ns( +pub fn utf8_to_timestamp( from: &Utf8Array, timezone: String, ) -> Result> { @@ -195,7 +198,7 @@ pub fn binary_to_binview(arr: &BinaryArray) -> BinaryViewArray { payload[4..4 + bytes.len()].copy_from_slice(bytes); } else { uses_buffer = true; - unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked_release(0..4)) }; + unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) }; let offset = (bytes.as_ptr() as usize - base_ptr) as u32; payload[0..4].copy_from_slice(&len.to_le_bytes()); payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); @@ -203,7 +206,7 @@ pub fn binary_to_binview(arr: &BinaryArray) -> BinaryViewArray { } let value = View::from_le_bytes(payload); - unsafe { views.push_unchecked(value) }; + unsafe { views.push(value) }; } let buffers = if uses_buffer { Arc::from([arr.values().clone()]) diff --git a/src/compute/comparison/mod.rs b/src/compute/comparison/mod.rs index b364ed88222..4031b770fd2 100644 --- a/src/compute/comparison/mod.rs +++ b/src/compute/comparison/mod.rs @@ -86,6 +86,7 @@ macro_rules! match_eq_ord {( Float16 => todo!(), Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! 
{ f64 }, + UInt128 => todo!(), } })} @@ -111,6 +112,7 @@ macro_rules! match_eq {( Float16 => __with_ty__! { f16 }, Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! { f64 }, + UInt128 => todo!(), } })} diff --git a/src/ffi/array.rs b/src/ffi/array.rs index b13e29513e2..5e2e008a24d 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -104,7 +104,7 @@ impl ArrowArray { DataType::BinaryView | DataType::Utf8View ); - let (offset, buffers, children, dictionary) = + let (offset, mut buffers, children, dictionary) = offset_buffers_children_dictionary(array.as_ref()); let variadic_buffer_sizes = if needs_variadic_buffer_sizes { diff --git a/src/ffi/mmap.rs b/src/ffi/mmap.rs index 0f879d4fdca..3806bffe024 100644 --- a/src/ffi/mmap.rs +++ b/src/ffi/mmap.rs @@ -21,7 +21,7 @@ struct PrivateData { } pub(crate) unsafe fn create_array< - T: AsRef<[u8]>, + T, I: Iterator>, II: Iterator, >( diff --git a/src/io/ipc/read/array/binary.rs b/src/io/ipc/read/array/binary.rs index 3cfe4a29057..7e9df71a41d 100644 --- a/src/io/ipc/read/array/binary.rs +++ b/src/io/ipc/read/array/binary.rs @@ -9,7 +9,7 @@ use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use crate::offset::Offset; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read_binary( diff --git a/src/io/ipc/read/array/binview.rs b/src/io/ipc/read/array/binview.rs index 79e84b3cc8b..2ee4390fee7 100644 --- a/src/io/ipc/read/array/binview.rs +++ b/src/io/ipc/read/array/binview.rs @@ -6,7 +6,7 @@ use crate::error::{Error, Result}; use super::super::read_basic::*; use super::*; -use crate::array::{ArrayRef, BinaryViewArrayGeneric, View, ViewType}; +use crate::array::{BinaryViewArrayGeneric, View, ViewType}; use crate::buffer::Buffer; use crate::datatypes::DataType; @@ -22,7 +22,7 @@ pub fn read_binview( compression: Option, limit: Option, scratch: &mut Vec, -) -> Result { +) -> Result> { let field_node = try_get_field_node(field_nodes, &data_type)?; let validity = read_validity( @@ -48,8 +48,9 @@ pub fn read_binview( )?; let n_variadic = variadic_buffer_counts.pop_front().ok_or_else( - || polars_err!(ComputeError: "IPC: unable to fetch the variadic buffers\n\nThe file or stream is corrupted.") - )?; + || { + Error::oos("IPC: unable to fetch the variadic buffers\n\nThe file or stream is corrupted.") + })?; let variadic_buffers = (0..n_variadic) .map(|_| { @@ -65,5 +66,4 @@ pub fn read_binview( .collect::>>>()?; BinaryViewArrayGeneric::::try_new(data_type, views, Arc::from(variadic_buffers), validity) - .map(|arr| arr.boxed()) } diff --git a/src/io/ipc/read/array/boolean.rs b/src/io/ipc/read/array/boolean.rs index 00f82ab5780..d13e6f17c4f 100644 --- a/src/io/ipc/read/array/boolean.rs +++ b/src/io/ipc/read/array/boolean.rs @@ -7,7 +7,7 @@ use crate::error::{Error, Result}; use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read_boolean( diff --git a/src/io/ipc/read/array/fixed_size_binary.rs b/src/io/ipc/read/array/fixed_size_binary.rs index 4aba9b82fb1..fe627ac81c1 100644 --- a/src/io/ipc/read/array/fixed_size_binary.rs +++ b/src/io/ipc/read/array/fixed_size_binary.rs @@ -7,7 +7,7 @@ use crate::error::{Error, Result}; use 
crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read_fixed_size_binary( diff --git a/src/io/ipc/read/array/list.rs b/src/io/ipc/read/array/list.rs index 2cf71cbb34d..1d741b4ac64 100644 --- a/src/io/ipc/read/array/list.rs +++ b/src/io/ipc/read/array/list.rs @@ -11,7 +11,7 @@ use crate::offset::Offset; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; -use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] diff --git a/src/io/ipc/read/array/map.rs b/src/io/ipc/read/array/map.rs index 5787e02596b..4e7261aee84 100644 --- a/src/io/ipc/read/array/map.rs +++ b/src/io/ipc/read/array/map.rs @@ -9,7 +9,7 @@ use crate::error::{Error, Result}; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; -use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] diff --git a/src/io/ipc/read/array/mod.rs b/src/io/ipc/read/array/mod.rs index c4e7b237f60..b3cde245a9d 100644 --- a/src/io/ipc/read/array/mod.rs +++ b/src/io/ipc/read/array/mod.rs @@ -37,7 +37,7 @@ fn try_get_field_node<'a>( data_type: &DataType, ) -> Result> { field_nodes.pop_front().ok_or_else(|| { - polars_err!(ComputeError: "IPC: unable to fetch the field for {:?}\n\nThe file or stream is corrupted.", data_type) + Error::oos(format!("IPC: unable to fetch the field for {:?}\n\nThe file or stream is corrupted.", data_type)) }) } @@ -45,6 +45,6 @@ fn try_get_array_length(field_node: Node, limit: Option) -> Result let length: usize = field_node .length() .try_into() - .map_err(|_| polars_err!(oos = OutOfSpecKind::NegativeFooterLength))?; + .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; Ok(limit.map(|limit| limit.min(length)).unwrap_or(length)) } diff --git a/src/io/ipc/read/array/null.rs b/src/io/ipc/read/array/null.rs index c25cc1b1c6d..c623249a0bc 100644 --- a/src/io/ipc/read/array/null.rs +++ b/src/io/ipc/read/array/null.rs @@ -6,7 +6,7 @@ use crate::{ error::{Error, Result}, }; -use super::super::{Node, OutOfSpecKind}; +use super::super::Node; use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; pub fn read_null( diff --git a/src/io/ipc/read/array/primitive.rs b/src/io/ipc/read/array/primitive.rs index 057e6298f19..99916069b60 100644 --- a/src/io/ipc/read/array/primitive.rs +++ b/src/io/ipc/read/array/primitive.rs @@ -6,7 +6,7 @@ use crate::error::{Error, Result}; use crate::{array::PrimitiveArray, types::NativeType}; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] diff --git a/src/io/ipc/read/array/union.rs b/src/io/ipc/read/array/union.rs index edb22b1c908..755d767b505 100644 --- a/src/io/ipc/read/array/union.rs +++ 
b/src/io/ipc/read/array/union.rs @@ -9,7 +9,7 @@ use crate::error::{Error, Result}; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; -use super::super::{Compression, Dictionaries, IpcBuffer, Node, OutOfSpecKind, Version}; +use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; #[allow(clippy::too_many_arguments)] diff --git a/src/io/ipc/read/array/utf8.rs b/src/io/ipc/read/array/utf8.rs index 819181e5df1..4b0c6cb3372 100644 --- a/src/io/ipc/read/array/utf8.rs +++ b/src/io/ipc/read/array/utf8.rs @@ -9,7 +9,7 @@ use crate::io::ipc::read::array::{try_get_array_length, try_get_field_node}; use crate::offset::Offset; use super::super::read_basic::*; -use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; +use super::super::{Compression, IpcBuffer, Node}; #[allow(clippy::too_many_arguments)] pub fn read_utf8( diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index af0c6126a9d..5d3c209f07d 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -244,7 +244,8 @@ pub fn read( compression, limit, scratch, - ), + ) + .map(|x| x.boxed()), BinaryView => read_binview::<[u8], _>( field_nodes, variadic_buffer_counts, @@ -256,7 +257,8 @@ pub fn read( compression, limit, scratch, - ), + ) + .map(|x| x.boxed()), } } diff --git a/src/io/ipc/read/read_basic.rs b/src/io/ipc/read/read_basic.rs index baf73b09ef2..9dd980b2aee 100644 --- a/src/io/ipc/read/read_basic.rs +++ b/src/io/ipc/read/read_basic.rs @@ -47,7 +47,7 @@ fn read_uncompressed_bytes( reader: &mut R, buffer_length: usize, is_little_endian: bool, -) -> PolarsResult> { +) -> Result> { if is_native_little_endian() == is_little_endian { let mut buffer = Vec::with_capacity(buffer_length); let _ = reader @@ -153,7 +153,7 @@ fn read_compressed_bytes( is_little_endian: bool, compression: Compression, scratch: &mut Vec, -) -> PolarsResult> { +) -> Result> { read_compressed_buffer::( reader, buffer_length, @@ -171,20 +171,20 @@ pub fn read_bytes( is_little_endian: bool, compression: Option, scratch: &mut Vec, -) -> PolarsResult> { +) -> Result> { let buf = buf .pop_front() - .ok_or_else(|| polars_err!(oos = OutOfSpecKind::ExpectedBuffer))?; + .ok_or_else(|| Error::from(OutOfSpecKind::ExpectedBuffer))?; let offset: u64 = buf .offset() .try_into() - .map_err(|_| polars_err!(oos = OutOfSpecKind::NegativeFooterLength))?; + .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; let buffer_length: usize = buf .length() .try_into() - .map_err(|_| polars_err!(oos = OutOfSpecKind::NegativeFooterLength))?; + .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; reader.seek(SeekFrom::Start(block_offset + offset))?; diff --git a/src/io/ipc/write/common.rs b/src/io/ipc/write/common.rs index 7d3a3831ed8..5cf490c1904 100644 --- a/src/io/ipc/write/common.rs +++ b/src/io/ipc/write/common.rs @@ -1,5 +1,4 @@ use std::borrow::{Borrow, Cow}; -use arrow_array::LargeListArray; use arrow_format::ipc::planus::Builder; @@ -116,7 +115,7 @@ fn encode_dictionary( dictionary_tracker, encoded_dictionaries, ) - } + }, FixedSizeList => { let values = array .as_any() @@ -247,10 +246,6 @@ fn set_variadic_buffer_counts(counts: &mut Vec, array: &dyn Array) { set_variadic_buffer_counts(counts, array.as_ref()) } }, - DataType::LargeList(_) => { - let array = array.as_any().downcast_ref::().unwrap(); - set_variadic_buffer_counts(counts, 
array.values().as_ref()) - }, DataType::FixedSizeList(_, _) => { let array = array.as_any().downcast_ref::().unwrap(); set_variadic_buffer_counts(counts, array.values().as_ref()) diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 6ef1864c6f3..16eae9c3423 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -828,7 +828,7 @@ fn transverse_recursive T + Clone>( use crate::datatypes::PhysicalType::*; match data_type.to_physical_type() { Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8 - | Dictionary(_) | LargeUtf8 => encodings.push(map(data_type)), + | Dictionary(_) | LargeUtf8 | BinaryView | Utf8View => encodings.push(map(data_type)), List | FixedSizeList | LargeList => { let a = data_type.to_logical_type(); if let DataType::List(inner) = a { diff --git a/src/mmap/mod.rs b/src/mmap/mod.rs index d5ec28c7b4b..74a6afc43e3 100644 --- a/src/mmap/mod.rs +++ b/src/mmap/mod.rs @@ -87,7 +87,7 @@ unsafe fn _mmap_record>( let (mut buffers, mut field_nodes) = get_buffers_nodes(batch)?; let mut variadic_buffer_counts = batch .variadic_buffer_counts() - .map_err(|err| polars_err!(oos = OutOfSpecKind::InvalidFlatbufferRecordBatches(err)))? + .map_err(|err| Error::from(OutOfSpecKind::InvalidFlatbufferRecordBatches(err)))? .map(|v| v.iter().map(|v| v as usize).collect::>()) .unwrap_or_else(VecDeque::new); diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index 8acb607cd4a..ac2e53485a1 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -7,8 +7,9 @@ use chrono::{ use crate::error::Result; use crate::{ - array::{PrimitiveArray, Utf8ViewArray}, + array::{PrimitiveArray, Utf8ViewArray, Utf8Array}, error::Error, + offset::Offset, }; use crate::{ datatypes::{DataType, TimeUnit}, @@ -397,6 +398,61 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) -> .ok() } +fn utf8_to_timestamp_impl( + array: &Utf8Array, + fmt: &str, + timezone: String, + tz: T, +) -> PrimitiveArray { + let iter = array + .iter() + .map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x, fmt, &tz))); + + PrimitiveArray::from_trusted_len_iter(iter) + .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(timezone))) +} + +/// Parses a [`Utf8Array`] to a timeozone-aware timestamp, i.e. [`PrimitiveArray`] with type `Timestamp(Nanosecond, Some(timezone))`. +/// # Implementation +/// * parsed values with timezone other than `timezone` are converted to `timezone`. +/// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp_ns`] to parse naive timezones. +/// * Null elements remain null; non-parsable elements are null. +/// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`. +/// # Error +/// This function errors iff `timezone` is not parsable to an offset. +pub fn utf8_to_timestamp( + array: &Utf8Array, + fmt: &str, + timezone: String, +) -> Result> { + let tz = parse_offset(timezone.as_str()); + let time_unit = TimeUnit::Second; + + if let Ok(tz) = tz { + Ok(crate::temporal_conversions::utf8_to_timestamp_impl( + array, fmt, timezone, tz, + )) + } else { + crate::temporal_conversions::chrono_tz_utf_to_timestamp(array, fmt, timezone, time_unit) + } +} + +/// Parses a [`Utf8Array`] to naive timestamp, i.e. +/// [`PrimitiveArray`] with type `Timestamp(Nanosecond, None)`. +/// Timezones are ignored. +/// Null elements remain null; non-parsable elements are set to null. 
+pub fn utf8_to_naive_timestamp( + array: &Utf8Array, + fmt: &str, + time_unit: TimeUnit, +) -> PrimitiveArray { + let iter = array + .iter() + .map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt))); + + PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(time_unit, None)) +} + fn utf8view_to_timestamp_impl( array: &Utf8ViewArray, fmt: &str, @@ -422,7 +478,33 @@ pub fn parse_offset_tz(timezone: &str) -> Result { #[cfg(feature = "chrono-tz")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))] -fn chrono_tz_utf_to_timestamp( +fn chrono_tz_utf_to_timestamp( + array: &Utf8Array, + fmt: &str, + time_zone: String, + time_unit: TimeUnit, +) -> Result> { + let tz = parse_offset_tz(&time_zone)?; + Ok(utf8view_to_timestamp_impl( + array, fmt, time_zone, tz, + )) +} + +#[cfg(not(feature = "chrono-tz"))] +fn chrono_tz_utf_to_timestamp( + _: &Utf8Array, + _: &str, + timezone: String, + _: TimeUnit, +) -> Result> { + Err(Error::InvalidArgumentError(format!( + "timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)", + ))) +} + +#[cfg(feature = "chrono-tz")] +#[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))] +fn chrono_tz_utfview_to_timestamp( array: &Utf8ViewArray, fmt: &str, time_zone: String, @@ -435,7 +517,7 @@ fn chrono_tz_utf_to_timestamp( } #[cfg(not(feature = "chrono-tz"))] -fn chrono_tz_utf_to_timestamp( +fn chrono_tz_utfview_to_timestamp( _: &Utf8ViewArray, _: &str, timezone: String, @@ -458,16 +540,16 @@ pub fn utf8view_to_timestamp( array: &Utf8ViewArray, fmt: &str, timezone: String, + time_unit: TimeUnit ) -> Result> { let tz = parse_offset(timezone.as_str()); - let time_unit = TimeUnit::Second; if let Ok(tz) = tz { Ok(utf8view_to_timestamp_impl( array, fmt, timezone, tz, )) } else { - chrono_tz_utf_to_timestamp(array, fmt, timezone, time_unit) + chrono_tz_utfview_to_timestamp(array, fmt, timezone, time_unit) } } @@ -478,12 +560,13 @@ pub fn utf8view_to_timestamp( pub fn utf8view_to_naive_timestamp( array: &Utf8ViewArray, fmt: &str, + time_unit: TimeUnit, ) -> PrimitiveArray { let iter = array .iter() .map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt))); - PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(TimeUnit::Nanosecond, None)) + PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(time_unit, None)) } fn add_month(year: i32, month: u32, months: i32) -> chrono::NaiveDate { diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs index 08399e32fb8..ecabb1e4fb0 100644 --- a/tests/it/temporal_conversions.rs +++ b/tests/it/temporal_conversions.rs @@ -15,12 +15,12 @@ fn naive() { "1996-12-19 13:39:57-03:00", // missing T ]; let array = Utf8ViewArray::from_slice_values(slice); - let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt, TimeUnit::Nanosecond); assert_eq!(format!("{r:?}"), expected); let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info let array = Utf8ViewArray::from_slice_values(slice); - let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt, TimeUnit::Nanosecond); assert_eq!(format!("{r:?}"), expected); } @@ -117,7 +117,7 @@ fn naive_no_tz() { "1996-12-19T13:39:57", "1996-12-19 13:39:57", // missing T ]); - let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt); + let r = temporal_conversions::utf8view_to_naive_timestamp(&array, fmt, TimeUnit::Nanosecond); 
assert_eq!(format!("{r:?}"), expected); }
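The temporal_conversions changes above thread an explicit TimeUnit through the UTF-8 timestamp parsers (utf8_to_naive_timestamp, utf8view_to_naive_timestamp, utf8view_to_timestamp) instead of hardcoding nanoseconds, and the returned array carries that unit in its data type. A minimal usage sketch in the style of the test above; imports are assumed to match the surrounding test module, and this is illustrative only, not verified against the final merged crate:

    let fmt = "%Y-%m-%dT%H:%M:%S"; // no timezone in the format
    let array = Utf8ViewArray::from_slice_values(["1996-12-19T16:39:57"]);
    let parsed =
        temporal_conversions::utf8view_to_naive_timestamp(&array, fmt, TimeUnit::Millisecond);
    // The requested unit is reflected in the resulting data type.
    assert_eq!(
        parsed.data_type(),
        &DataType::Timestamp(TimeUnit::Millisecond, None)
    );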