From eac1e5e024406011fd148df7d0032056f9e0bee7 Mon Sep 17 00:00:00 2001 From: melkor Date: Sat, 30 Jul 2022 19:30:03 -0700 Subject: [PATCH 1/8] Added `MutableStructArray` --- src/array/mod.rs | 2 +- src/array/struct_/mod.rs | 2 + src/array/struct_/mutable.rs | 207 ++++++++++++++++++++++++++++++ tests/it/array/struct_/mod.rs | 1 + tests/it/array/struct_/mutable.rs | 33 +++++ 5 files changed, 244 insertions(+), 1 deletion(-) create mode 100644 src/array/struct_/mutable.rs create mode 100644 tests/it/array/struct_/mutable.rs diff --git a/src/array/mod.rs b/src/array/mod.rs index 2a2f4d0e9b8..951e14d71a1 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -391,7 +391,7 @@ pub use list::{ListArray, ListValuesIter, MutableListArray}; pub use map::MapArray; pub use null::NullArray; pub use primitive::*; -pub use struct_::StructArray; +pub use struct_::{MutableStructArray, StructArray}; pub use union::UnionArray; pub use utf8::{MutableUtf8Array, Utf8Array, Utf8ValuesIter}; diff --git a/src/array/struct_/mod.rs b/src/array/struct_/mod.rs index a7c88b6fb19..3d25d8c1882 100644 --- a/src/array/struct_/mod.rs +++ b/src/array/struct_/mod.rs @@ -9,6 +9,8 @@ use super::{new_empty_array, new_null_array, Array}; mod ffi; pub(super) mod fmt; mod iterator; +mod mutable; +pub use mutable::*; /// A [`StructArray`] is a nested [`Array`] with an optional validity representing /// multiple [`Array`] with the same number of rows. diff --git a/src/array/struct_/mutable.rs b/src/array/struct_/mutable.rs new file mode 100644 index 00000000000..e5e060dd95d --- /dev/null +++ b/src/array/struct_/mutable.rs @@ -0,0 +1,207 @@ +use std::sync::Arc; + +use crate::{ + array::{Array, MutableArray}, + bitmap::MutableBitmap, + datatypes::DataType, +}; + +use super::StructArray; + +/// Converting a [`MutableStructArray`] into a [`StructArray`] is `O(1)`. +#[derive(Debug)] +pub struct MutableStructArray { + data_type: DataType, + values: Vec>, + validity: Option, +} + +impl From for StructArray { + fn from(other: MutableStructArray) -> Self { + let validity = if other.validity.as_ref().map(|x| x.unset_bits()).unwrap_or(0) > 0 { + other.validity.map(|x| x.into()) + } else { + None + }; + + StructArray::from_data( + other.data_type, + other.values.into_iter().map(|mut v| v.as_box()).collect(), + validity, + ) + } +} + +impl MutableStructArray { + /// Creates a new [`MutableStructArray`]. + pub fn new(data_type: DataType, values: Vec>) -> Self { + Self::from_data(data_type, values, None) + } + + /// Create a [`MutableStructArray`] out of low-end APIs. + /// # Panics + /// This function panics iff: + /// * `data_type` is not [`DataType::Struct`] + /// * The inner types of `data_type` are not equal to those of `values` + /// * `validity` is not `None` and its length is different from the `values`'s length + pub fn from_data( + data_type: DataType, + values: Vec>, + validity: Option, + ) -> Self { + match data_type { + DataType::Struct(ref fields) => { + assert_eq!( + fields.iter().map(|f| f.data_type()).collect::>(), + values.iter().map(|v| v.data_type()).collect::>() + ); + } + _ => panic!("StructArray must be initialized with DataType::Struct"), + }; + let self_ = Self { + data_type, + values, + validity, + }; + self_.assert_lengths(); + self_ + } + + fn assert_lengths(&self) { + let lengths: Vec<_> = self.values.iter().map(|v| v.len()).collect(); + if let Some(first_len) = lengths.first() { + if !lengths.iter().all(|l| l == first_len) { + panic!("StructArray child lengths differ: {:?}", lengths); + } + } + if let Some(ref validity) = self.validity { + assert_eq!(*lengths.first().unwrap_or(&0), validity.len()); + } + } + + /// Extract the low-end APIs from the [`MutableStructArray`]. + pub fn into_data(self) -> (DataType, Vec>, Option) { + (self.data_type, self.values, self.validity) + } + + /// The mutable values + pub fn mut_values(&mut self) -> &mut Vec> { + &mut self.values + } + + /// The values + pub fn values(&self) -> &Vec> { + &self.values + } + + /// Return the `i`th child array. + pub fn value(&mut self, i: usize) -> Option<&mut A> { + self.values[i].as_mut_any().downcast_mut::() + } +} + +impl MutableStructArray { + /// Reserves `additional` entries. + pub fn reserve(&mut self, additional: usize) { + self.values.reserve(additional); + if let Some(x) = self.validity.as_mut() { + x.reserve(additional) + } + } + + /// Call this after pushing into each child array. + pub fn push(&mut self, valid: bool) { + match &mut self.validity { + Some(validity) => validity.push(valid), + None => match valid { + true => (), + false => self.init_validity(), + }, + }; + // TODO: investigate performance implications of this assertion + self.assert_lengths(); + } + + fn push_null(&mut self) { + for v in &mut self.values { + v.push_null(); + } + self.push(false); + } + + fn init_validity(&mut self) { + let mut validity = MutableBitmap::with_capacity(self.values.capacity()); + let len = self.len(); + if len > 0 { + validity.extend_constant(len, true); + validity.set(len - 1, false); + } + self.validity = Some(validity) + } + + /// Converts itself into an [`Array`]. + pub fn into_arc(self) -> Arc { + let a: StructArray = self.into(); + Arc::new(a) + } + + /// Shrinks the capacity of the [`MutableStructArray`] to fit its current length. + pub fn shrink_to_fit(&mut self) { + self.values.shrink_to_fit(); + if let Some(validity) = &mut self.validity { + validity.shrink_to_fit() + } + } +} + +impl MutableArray for MutableStructArray { + fn len(&self) -> usize { + self.values.first().map(|v| v.len()).unwrap_or(0) + } + + fn validity(&self) -> Option<&MutableBitmap> { + self.validity.as_ref() + } + + fn as_box(&mut self) -> Box { + Box::new(StructArray::from_data( + self.data_type.clone(), + std::mem::take(&mut self.values) + .into_iter() + .map(|mut v| v.as_box()) + .collect(), + std::mem::take(&mut self.validity).map(|x| x.into()), + )) + } + + fn as_arc(&mut self) -> Arc { + Arc::new(StructArray::from_data( + self.data_type.clone(), + std::mem::take(&mut self.values) + .into_iter() + .map(|mut v| v.as_box()) + .collect(), + std::mem::take(&mut self.validity).map(|x| x.into()), + )) + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn std::any::Any { + self + } + + fn push_null(&mut self) { + self.push_null() + } + + fn shrink_to_fit(&mut self) { + self.shrink_to_fit() + } +} diff --git a/tests/it/array/struct_/mod.rs b/tests/it/array/struct_/mod.rs index 9ec34eca0ef..8869c215e5f 100644 --- a/tests/it/array/struct_/mod.rs +++ b/tests/it/array/struct_/mod.rs @@ -1,4 +1,5 @@ mod iterator; +mod mutable; use arrow2::array::*; use arrow2::bitmap::Bitmap; diff --git a/tests/it/array/struct_/mutable.rs b/tests/it/array/struct_/mutable.rs new file mode 100644 index 00000000000..19f2f12f15e --- /dev/null +++ b/tests/it/array/struct_/mutable.rs @@ -0,0 +1,33 @@ +use arrow2::{ + array::*, + datatypes::{DataType, Field}, +}; + +#[test] +fn push() { + let c1 = Box::new(MutablePrimitiveArray::::new()) as Box; + let values = vec![c1]; + let data_type = DataType::Struct(vec![Field::new("f1", DataType::Int32, true)]); + let mut a = MutableStructArray::new(data_type, values); + + a.value::>(0) + .unwrap() + .push(Some(1)); + a.push(true); + a.value::>(0).unwrap().push(None); + a.push(false); + a.value::>(0) + .unwrap() + .push(Some(2)); + a.push(true); + + assert_eq!(a.len(), 3); + assert!(a.is_valid(0)); + assert!(!a.is_valid(1)); + assert!(a.is_valid(2)); + + assert_eq!( + a.value::>(0).unwrap().values(), + &Vec::from([1, 0, 2]) + ); +} From 7030a0463269c6fd52986d1ca63a60546103f8ac Mon Sep 17 00:00:00 2001 From: melkor Date: Sun, 31 Jul 2022 12:13:26 -0700 Subject: [PATCH 2/8] Avoid allocations --- src/array/struct_/mutable.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/array/struct_/mutable.rs b/src/array/struct_/mutable.rs index e5e060dd95d..ce60c7d23fe 100644 --- a/src/array/struct_/mutable.rs +++ b/src/array/struct_/mutable.rs @@ -50,12 +50,8 @@ impl MutableStructArray { validity: Option, ) -> Self { match data_type { - DataType::Struct(ref fields) => { - assert_eq!( - fields.iter().map(|f| f.data_type()).collect::>(), - values.iter().map(|v| v.data_type()).collect::>() - ); - } + DataType::Struct(ref fields) => + assert!(fields.iter().map(|f| f.data_type()).eq(values.iter().map(|f| f.data_type()))), _ => panic!("StructArray must be initialized with DataType::Struct"), }; let self_ = Self { @@ -68,14 +64,15 @@ impl MutableStructArray { } fn assert_lengths(&self) { - let lengths: Vec<_> = self.values.iter().map(|v| v.len()).collect(); - if let Some(first_len) = lengths.first() { - if !lengths.iter().all(|l| l == first_len) { + let first_len = self.values.first().map(|v| v.len()); + if let Some(len) = first_len { + if !self.values.iter().all(|x| x.len() == len) { + let lengths: Vec<_> = self.values.iter().map(|v| v.len()).collect(); panic!("StructArray child lengths differ: {:?}", lengths); } } - if let Some(ref validity) = self.validity { - assert_eq!(*lengths.first().unwrap_or(&0), validity.len()); + if let Some(validity) = &self.validity { + assert_eq!(first_len.unwrap_or(0), validity.len()); } } From 91ffe4a845cc4223958b92393f29972445ef68b7 Mon Sep 17 00:00:00 2001 From: melkor Date: Sun, 31 Jul 2022 12:15:53 -0700 Subject: [PATCH 3/8] Make data type check more inclusive --- src/array/struct_/mutable.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/struct_/mutable.rs b/src/array/struct_/mutable.rs index ce60c7d23fe..4ce1120c6f4 100644 --- a/src/array/struct_/mutable.rs +++ b/src/array/struct_/mutable.rs @@ -49,7 +49,7 @@ impl MutableStructArray { values: Vec>, validity: Option, ) -> Self { - match data_type { + match data_type.to_logical_type() { DataType::Struct(ref fields) => assert!(fields.iter().map(|f| f.data_type()).eq(values.iter().map(|f| f.data_type()))), _ => panic!("StructArray must be initialized with DataType::Struct"), From 9d720e44bb9387c00df9f756b310298f1950b601 Mon Sep 17 00:00:00 2001 From: melkor Date: Sun, 31 Jul 2022 19:38:21 -0700 Subject: [PATCH 4/8] Shrink child arrays, not their container --- src/array/struct_/mutable.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/array/struct_/mutable.rs b/src/array/struct_/mutable.rs index 4ce1120c6f4..e762b536c47 100644 --- a/src/array/struct_/mutable.rs +++ b/src/array/struct_/mutable.rs @@ -144,8 +144,10 @@ impl MutableStructArray { /// Shrinks the capacity of the [`MutableStructArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { - self.values.shrink_to_fit(); - if let Some(validity) = &mut self.validity { + for v in &mut self.values { + v.shrink_to_fit(); + } + if let Some(validity) = self.validity.as_mut() { validity.shrink_to_fit() } } From f7af9037f32ff6b27c1386937eadb8fe51127620 Mon Sep 17 00:00:00 2001 From: melkor Date: Sun, 31 Jul 2022 19:40:53 -0700 Subject: [PATCH 5/8] Mention panics in docstring --- src/array/struct_/mutable.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/array/struct_/mutable.rs b/src/array/struct_/mutable.rs index e762b536c47..a5c93d37d81 100644 --- a/src/array/struct_/mutable.rs +++ b/src/array/struct_/mutable.rs @@ -107,6 +107,9 @@ impl MutableStructArray { } /// Call this after pushing into each child array. + /// # Panics + /// This function panics if any of the children does not have exactly one more + /// element than before the last call of `push`. pub fn push(&mut self, valid: bool) { match &mut self.validity { Some(validity) => validity.push(valid), @@ -115,7 +118,6 @@ impl MutableStructArray { false => self.init_validity(), }, }; - // TODO: investigate performance implications of this assertion self.assert_lengths(); } From c27ce7e4d765c6769a89ac85c425c65bb9a8f393 Mon Sep 17 00:00:00 2001 From: melkor Date: Thu, 4 Aug 2022 23:49:39 -0700 Subject: [PATCH 6/8] Reserve child arrays, not their container --- src/array/struct_/mutable.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/array/struct_/mutable.rs b/src/array/struct_/mutable.rs index a5c93d37d81..0cff7735418 100644 --- a/src/array/struct_/mutable.rs +++ b/src/array/struct_/mutable.rs @@ -100,7 +100,9 @@ impl MutableStructArray { impl MutableStructArray { /// Reserves `additional` entries. pub fn reserve(&mut self, additional: usize) { - self.values.reserve(additional); + for v in &mut self.values { + v.reserve(additional); + } if let Some(x) = self.validity.as_mut() { x.reserve(additional) } From b6b7540a8fb10e2ed8a46191c2ecd230e8fbe403 Mon Sep 17 00:00:00 2001 From: melkor Date: Thu, 4 Aug 2022 23:53:21 -0700 Subject: [PATCH 7/8] Implement `MutableArray::reserve` --- src/array/struct_/mutable.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/array/struct_/mutable.rs b/src/array/struct_/mutable.rs index 0cff7735418..b4d8ded4ce6 100644 --- a/src/array/struct_/mutable.rs +++ b/src/array/struct_/mutable.rs @@ -207,4 +207,8 @@ impl MutableArray for MutableStructArray { fn shrink_to_fit(&mut self) { self.shrink_to_fit() } + + fn reserve(&mut self, additional: usize) { + self.reserve(additional) + } } From aa9af03952330f414bd71e1fedfcdb8d97baa4c5 Mon Sep 17 00:00:00 2001 From: melkor Date: Fri, 5 Aug 2022 00:44:50 -0700 Subject: [PATCH 8/8] Fix formatting --- src/array/struct_/mutable.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/array/struct_/mutable.rs b/src/array/struct_/mutable.rs index b4d8ded4ce6..cc02146a77d 100644 --- a/src/array/struct_/mutable.rs +++ b/src/array/struct_/mutable.rs @@ -50,8 +50,10 @@ impl MutableStructArray { validity: Option, ) -> Self { match data_type.to_logical_type() { - DataType::Struct(ref fields) => - assert!(fields.iter().map(|f| f.data_type()).eq(values.iter().map(|f| f.data_type()))), + DataType::Struct(ref fields) => assert!(fields + .iter() + .map(|f| f.data_type()) + .eq(values.iter().map(|f| f.data_type()))), _ => panic!("StructArray must be initialized with DataType::Struct"), }; let self_ = Self {