From 267cac048f1e150953a7a92ce5f0f039dca786e1 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 19 May 2022 08:41:42 +0200 Subject: [PATCH] delay null_counts (#994) --- src/array/primitive/mutable.rs | 38 +++++++++++------------------ src/array/utf8/mutable.rs | 16 ++++++++---- src/bitmap/mutable.rs | 1 + tests/it/array/primitive/mutable.rs | 15 +++++++++--- 4 files changed, 38 insertions(+), 32 deletions(-) diff --git a/src/array/primitive/mutable.rs b/src/array/primitive/mutable.rs index 43f734f74bf..24768f83616 100644 --- a/src/array/primitive/mutable.rs +++ b/src/array/primitive/mutable.rs @@ -1,5 +1,6 @@ use std::{iter::FromIterator, sync::Arc}; +use crate::bitmap::Bitmap; use crate::{ array::{Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, @@ -13,7 +14,7 @@ use super::PrimitiveArray; /// The Arrow's equivalent to `Vec>` where `T` is byte-size (e.g. `i32`). /// Converting a [`MutablePrimitiveArray`] into a [`PrimitiveArray`] is `O(1)`. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct MutablePrimitiveArray { data_type: DataType, values: Vec, @@ -22,11 +23,14 @@ pub struct MutablePrimitiveArray { impl From> for PrimitiveArray { fn from(other: MutablePrimitiveArray) -> Self { - let validity = if other.validity.as_ref().map(|x| x.null_count()).unwrap_or(0) > 0 { - other.validity.map(|x| x.into()) - } else { - None - }; + let validity = other.validity.and_then(|x| { + let bitmap: Bitmap = x.into(); + if bitmap.null_count() == 0 { + None + } else { + Some(bitmap) + } + }); PrimitiveArray::::new(other.data_type, other.values.into(), validity) } @@ -194,9 +198,7 @@ impl MutablePrimitiveArray { let mut validity = MutableBitmap::new(); validity.extend_constant(self.len(), true); extend_trusted_len_unzip(iterator, &mut validity, &mut self.values); - if validity.null_count() > 0 { - self.validity = Some(validity); - } + self.validity = Some(validity); } } /// Extends the [`MutablePrimitiveArray`] from an iterator of values of trusted len. @@ -526,11 +528,7 @@ impl>> FromIterator }) .collect(); - let validity = if validity.null_count() > 0 { - Some(validity) - } else { - None - }; + let validity = Some(validity); Self { data_type: T::PRIMITIVE.into(), @@ -588,11 +586,7 @@ where extend_trusted_len_unzip(iterator, &mut validity, &mut buffer); - let validity = if validity.null_count() > 0 { - Some(validity) - } else { - None - }; + let validity = Some(validity); (validity, buffer) } @@ -634,11 +628,7 @@ where buffer.set_len(len); null.set_len(len); - let validity = if null.null_count() > 0 { - Some(null) - } else { - None - }; + let validity = Some(null); Ok((validity, buffer)) } diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index f9f53a68462..f19e331cfac 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -13,6 +13,7 @@ use crate::{ use super::Utf8Array; use crate::array::physical_binary::*; +use crate::bitmap::Bitmap; struct StrAsBytes

(P); impl> AsRef<[u8]> for StrAsBytes { @@ -36,12 +37,21 @@ impl From> for Utf8Array { // Safety: // `MutableUtf8Array` has the same invariants as `Utf8Array` and thus // `Utf8Array` can be safely created from `MutableUtf8Array` without checks. + let validity = other.validity.and_then(|x| { + let bitmap: Bitmap = x.into(); + if bitmap.null_count() == 0 { + None + } else { + Some(bitmap) + } + }); + unsafe { Utf8Array::::from_data_unchecked( other.data_type, other.offsets.into(), other.values.into(), - other.validity.map(|x| x.into()), + validity, ) } } @@ -368,10 +378,6 @@ impl MutableUtf8Array { self.validity.as_mut().unwrap(), iterator, ); - - if self.validity.as_mut().unwrap().null_count() == 0 { - self.validity = None; - } } /// Creates a [`MutableUtf8Array`] from an iterator of trusted length. diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 94e0b00d7c7..c8e5e046491 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -14,6 +14,7 @@ use super::Bitmap; /// The main difference against [`Vec`] is that a bitmap cannot be represented as `&[bool]`. /// # Implementation /// This container is backed by [`Vec`]. +#[derive(Clone)] pub struct MutableBitmap { buffer: Vec, // invariant: length.saturating_add(7) / 8 == buffer.len(); diff --git a/tests/it/array/primitive/mutable.rs b/tests/it/array/primitive/mutable.rs index 6ff96a9e865..99f897076dd 100644 --- a/tests/it/array/primitive/mutable.rs +++ b/tests/it/array/primitive/mutable.rs @@ -126,7 +126,8 @@ fn set() { fn from_iter() { let a = MutablePrimitiveArray::::from_iter((0..2).map(Some)); assert_eq!(a.len(), 2); - assert_eq!(a.validity(), None); + let validity = a.validity().unwrap(); + assert_eq!(validity.null_count(), 0); } #[test] @@ -176,7 +177,8 @@ fn from_trusted_len() { fn extend_trusted_len() { let mut a = MutablePrimitiveArray::::new(); a.extend_trusted_len(vec![Some(1), Some(2)].into_iter()); - assert_eq!(a.validity(), None); + let validity = a.validity().unwrap(); + assert_eq!(validity.null_count(), 0); a.extend_trusted_len(vec![None, Some(4)].into_iter()); assert_eq!( a.validity(), @@ -266,7 +268,14 @@ fn extend_from_slice() { fn set_validity() { let mut a = MutablePrimitiveArray::::new(); a.extend_trusted_len(vec![Some(1), Some(2)].into_iter()); - assert_eq!(a.validity(), None); + let validity = a.validity().unwrap(); + assert_eq!(validity.null_count(), 0); + + // test that upon conversion to array the bitmap is set to None + let arr: PrimitiveArray<_> = a.clone().into(); + assert_eq!(arr.validity(), None); + + // test set_validity a.set_validity(Some(MutableBitmap::from([false, true]))); assert_eq!(a.validity(), Some(&MutableBitmap::from([false, true]))); }