From 271b46d221d0f916ba9b8d63d33fb7e2d2e6d006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Thu, 22 Sep 2022 17:45:34 +0300 Subject: [PATCH 1/5] impl DoubleEndedIterator for BinaryValueIter --- src/array/binary/iterator.rs | 45 ++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/src/array/binary/iterator.rs b/src/array/binary/iterator.rs index 10f0e8a43de..4c80a780423 100644 --- a/src/array/binary/iterator.rs +++ b/src/array/binary/iterator.rs @@ -7,12 +7,17 @@ use super::BinaryArray; pub struct BinaryValueIter<'a, O: Offset> { array: &'a BinaryArray, index: usize, + end: usize, } impl<'a, O: Offset> BinaryValueIter<'a, O> { /// Creates a new [`BinaryValueIter`] pub fn new(array: &'a BinaryArray) -> Self { - Self { array, index: 0 } + Self { + array, + index: 0, + end: array.len(), + } } } @@ -21,19 +26,41 @@ impl<'a, O: Offset> Iterator for BinaryValueIter<'a, O> { #[inline] fn next(&mut self) -> Option { - if self.index >= self.array.len() { + if self.index == self.end { return None; - } else { - self.index += 1; } - Some(unsafe { self.array.value_unchecked(self.index - 1) }) + let old = self.index; + self.index += 1; + Some(unsafe { self.array.value_unchecked(old) }) } + #[inline] fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.index, - Some(self.array.len() - self.index), - ) + (self.end - self.index, Some(self.end - self.index)) + } + + #[inline] + fn nth(&mut self, n: usize) -> Option { + let new_index = self.index + n; + if new_index > self.end { + self.index = self.end; + None + } else { + self.index = new_index; + self.next() + } + } +} + +impl<'a, O: Offset> DoubleEndedIterator for BinaryValueIter<'a, O> { + #[inline] + fn next_back(&mut self) -> Option { + if self.index == self.end { + None + } else { + self.end -= 1; + Some(unsafe { self.array.value_unchecked(self.end) }) + } } } From 33d7e0c970bce79078541ebe1e466c20e09d23d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Thu, 22 Sep 2022 23:27:23 +0300 Subject: [PATCH 2/5] impl MutableBinaryArray::extend_values --- src/array/binary/mutable.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 9990a542c2d..c1ee4ba8a6e 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -345,6 +345,21 @@ impl MutableBinaryArray { unsafe { self.extend_trusted_len_values_unchecked(iterator) } } + /// Extends the [`MutableBinaryArray`] from an iterator of values. + /// This differs from `extended_trusted_len` which accepts iterator of optional values. + #[inline] + pub fn extend_values(&mut self, iterator: I) + where + P: AsRef<[u8]>, + I: Iterator, + { + let additional = extend_from_values_iter(&mut self.offsets, &mut self.values, iterator); + + if let Some(validity) = self.validity.as_mut() { + validity.extend_constant(additional, true); + } + } + /// Extends the [`MutableBinaryArray`] from an `iterator` of values of trusted length. /// This differs from `extend_trusted_len_unchecked` which accepts iterator of optional /// values. From 1934da694a928f475357f7767079e0be2cd356c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 23 Sep 2022 09:25:38 +0300 Subject: [PATCH 3/5] impl BinaryArray::into_mut --- src/array/binary/mod.rs | 84 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index b1efa2f6f21..39c2a4fcfe9 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -9,6 +9,8 @@ use crate::{ trusted_len::TrustedLen, }; +use either::Either; + use super::{ specification::{try_check_offsets, try_check_offsets_bounds}, Array, GenericBinaryArray, Offset, @@ -238,6 +240,88 @@ impl BinaryArray { self.validity = validity; } + /// Try to convert this `BinaryArray` to a `MutableBinaryArray` + pub fn into_mut(mut self) -> Either> { + use Either::*; + if let Some(bitmap) = self.validity { + match bitmap.into_mut() { + // Safety: invariants are preserved + Left(bitmap) => Left(unsafe { + BinaryArray::new_unchecked( + self.data_type, + self.offsets, + self.values, + Some(bitmap), + ) + }), + Right(mutable_bitmap) => match ( + self.values.get_mut().map(std::mem::take), + self.offsets.get_mut().map(std::mem::take), + ) { + (None, None) => { + // Safety: invariants are preserved + Left(unsafe { + BinaryArray::new_unchecked( + self.data_type, + self.offsets, + self.values, + Some(mutable_bitmap.into()), + ) + }) + } + (None, Some(offsets)) => { + // Safety: invariants are preserved + Left(unsafe { + BinaryArray::new_unchecked( + self.data_type, + offsets.into(), + self.values, + Some(mutable_bitmap.into()), + ) + }) + } + (Some(mutable_values), None) => { + // Safety: invariants are preserved + Left(unsafe { + BinaryArray::new_unchecked( + self.data_type, + self.offsets, + mutable_values.into(), + Some(mutable_bitmap.into()), + ) + }) + } + (Some(values), Some(offsets)) => Right(unsafe { + MutableBinaryArray::from_data( + self.data_type, + offsets, + values, + Some(mutable_bitmap), + ) + }), + }, + } + } else { + match ( + self.values.get_mut().map(std::mem::take), + self.offsets.get_mut().map(std::mem::take), + ) { + (None, None) => Left(unsafe { + BinaryArray::new_unchecked(self.data_type, self.offsets, self.values, None) + }), + (None, Some(offsets)) => Left(unsafe { + BinaryArray::new_unchecked(self.data_type, offsets.into(), self.values, None) + }), + (Some(values), None) => Left(unsafe { + BinaryArray::new_unchecked(self.data_type, self.offsets, values.into(), None) + }), + (Some(values), Some(offsets)) => Right(unsafe { + MutableBinaryArray::from_data(self.data_type, offsets, values, None) + }), + } + } + } + /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero. pub fn new_empty(data_type: DataType) -> Self { Self::new( From 2ca230e3971ab3d6f030f409fbc6b789993f80d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Mon, 26 Sep 2022 09:35:04 +0300 Subject: [PATCH 4/5] add tests --- tests/it/array/binary/mod.rs | 59 +++++++++++++++++++++++++ tests/it/array/binary/to_mutable.rs | 67 +++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 tests/it/array/binary/to_mutable.rs diff --git a/tests/it/array/binary/mod.rs b/tests/it/array/binary/mod.rs index 3e93a35c568..c36b7121414 100644 --- a/tests/it/array/binary/mod.rs +++ b/tests/it/array/binary/mod.rs @@ -6,6 +6,7 @@ use arrow2::{ }; mod mutable; +mod to_mutable; #[test] fn basics() { @@ -152,3 +153,61 @@ fn debug() { assert_eq!(format!("{:?}", array), "BinaryArray[[1, 2], [], None]"); } + +#[test] +fn into_mut_1() { + let offsets = Buffer::from(vec![0, 1]); + let values = Buffer::from(b"a".to_vec()); + let a = values.clone(); // cloned values + assert_eq!(a, values); + let array = BinaryArray::::from_data(DataType::Binary, offsets, values, None); + assert!(array.into_mut().is_left()); +} + +#[test] +fn into_mut_2() { + let offsets = Buffer::from(vec![0, 1]); + let values = Buffer::from(b"a".to_vec()); + let a = offsets.clone(); // cloned offsets + assert_eq!(a, offsets); + let array = BinaryArray::::from_data(DataType::Binary, offsets, values, None); + assert!(array.into_mut().is_left()); +} + +#[test] +fn into_mut_3() { + let offsets = Buffer::from(vec![0, 1]); + let values = Buffer::from(b"a".to_vec()); + let validity = Some([true].into()); + let a = validity.clone(); // cloned validity + assert_eq!(a, validity); + let array = BinaryArray::::new(DataType::Binary, offsets, values, validity); + assert!(array.into_mut().is_left()); +} + +#[test] +fn into_mut_4() { + let offsets = Buffer::from(vec![0, 1]); + let values = Buffer::from(b"a".to_vec()); + let validity = Some([true].into()); + let array = BinaryArray::::new(DataType::Binary, offsets, values, validity); + assert!(array.into_mut().is_right()); +} + +#[test] +fn rev_iter() { + let array = BinaryArray::::from(&[Some("hello".as_bytes()), Some(" ".as_bytes()), None]); + + assert_eq!( + array.into_iter().rev().collect::>(), + vec![None, Some(" ".as_bytes()), Some("hello".as_bytes())] + ); +} + +#[test] +fn iter_nth() { + let array = BinaryArray::::from(&[Some("hello"), Some(" "), None]); + + assert_eq!(array.iter().nth(1), Some(Some(" ".as_bytes()))); + assert_eq!(array.iter().nth(10), None); +} diff --git a/tests/it/array/binary/to_mutable.rs b/tests/it/array/binary/to_mutable.rs new file mode 100644 index 00000000000..e828244abb5 --- /dev/null +++ b/tests/it/array/binary/to_mutable.rs @@ -0,0 +1,67 @@ +use arrow2::{array::BinaryArray, bitmap::Bitmap, buffer::Buffer, datatypes::DataType}; + +#[test] +fn not_shared() { + let array = BinaryArray::::from(&[Some("hello"), Some(" "), None]); + assert!(array.into_mut().is_right()); +} + +#[test] +#[allow(clippy::redundant_clone)] +fn shared_validity() { + let validity = Bitmap::from([true]); + let array = BinaryArray::::new( + DataType::Binary, + vec![0, 1].into(), + b"a".to_vec().into(), + Some(validity.clone()), + ); + assert!(array.into_mut().is_left()) +} + +#[test] +#[allow(clippy::redundant_clone)] +fn shared_values() { + let values: Buffer = b"a".to_vec().into(); + let array = BinaryArray::::new( + DataType::Binary, + vec![0, 1].into(), + values.clone(), + Some(Bitmap::from([true])), + ); + assert!(array.into_mut().is_left()) +} + +#[test] +#[allow(clippy::redundant_clone)] +fn shared_offsets_values() { + let offsets: Buffer = vec![0, 1].into(); + let values: Buffer = b"a".to_vec().into(); + let array = BinaryArray::::new( + DataType::Binary, + offsets.clone(), + values.clone(), + Some(Bitmap::from([true])), + ); + assert!(array.into_mut().is_left()) +} + +#[test] +#[allow(clippy::redundant_clone)] +fn shared_offsets() { + let offsets: Buffer = vec![0, 1].into(); + let array = BinaryArray::::new( + DataType::Binary, + offsets.clone(), + b"a".to_vec().into(), + Some(Bitmap::from([true])), + ); + assert!(array.into_mut().is_left()) +} + +#[test] +#[allow(clippy::redundant_clone)] +fn shared_all() { + let array = BinaryArray::::from(&[Some("hello"), Some(" "), None]); + assert!(array.clone().into_mut().is_left()) +} From bc759a1ddd70880b60cff60548e1763ce989c923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Mon, 26 Sep 2022 20:59:30 +0300 Subject: [PATCH 5/5] remove unnecessary println in library code --- src/io/parquet/write/binary/basic.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/io/parquet/write/binary/basic.rs b/src/io/parquet/write/binary/basic.rs index f8a8cdc47ae..58156c901e6 100644 --- a/src/io/parquet/write/binary/basic.rs +++ b/src/io/parquet/write/binary/basic.rs @@ -138,13 +138,6 @@ pub(crate) fn encode_delta( delta_bitpacked::encode(lengths, buffer); } else { - println!( - "{:?}", - offsets - .windows(2) - .map(|w| (w[1] - w[0]).to_usize() as i64) - .collect::>() - ); let lengths = offsets.windows(2).map(|w| (w[1] - w[0]).to_usize() as i64); delta_bitpacked::encode(lengths, buffer); }