From 0bb1e56c01dde0586b0804f60e50a56724440d82 Mon Sep 17 00:00:00 2001 From: VasanthakumarV Date: Fri, 17 Sep 2021 15:05:59 +0530 Subject: [PATCH 1/5] Add extend methods for `MutableUtf8Array` --- src/array/utf8/mutable.rs | 197 ++++++++++++++++++++++++++------- tests/it/array/utf8/mutable.rs | 39 +++++++ 2 files changed, 199 insertions(+), 37 deletions(-) diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 80e1e292ae6..33d223bc583 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -224,6 +224,75 @@ impl> FromIterator> for MutableUtf8Array { } impl MutableUtf8Array { + /// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len. + /// This differs from `extended_trusted_len` which accepts iterator of optional values. + #[inline] + pub fn extend_trusted_len_values(&mut self, iterator: I) + where + P: AsRef, + I: TrustedLen, + { + unsafe { self.extend_trusted_len_values_unchecked(iterator) } + } + + /// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len. + /// This differs from `extended_trusted_len_unchecked` which accepts iterator of optional + /// values. + /// # Safety + /// The iterator must be trusted len. + #[inline] + pub unsafe fn extend_trusted_len_values_unchecked(&mut self, iterator: I) + where + P: AsRef, + I: Iterator, + { + let (_, upper) = iterator.size_hint(); + let additional = upper.expect("extend_trusted_len_values requires an upper limit"); + + extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator); + + if let Some(validity) = self.validity.as_mut() { + validity.extend_constant(additional, true); + } + } + + /// Extends the [`MutableUtf8Array`] from an iterator of trusted len. + #[inline] + pub fn extend_trusted_len(&mut self, iterator: I) + where + P: AsRef, + I: TrustedLen>, + { + unsafe { self.extend_trusted_len_unchecked(iterator) } + } + + /// Extends [`MutableUtf8Array`] from an iterator of trusted len. + /// #Safety + /// The iterator must be trusted len. + #[inline] + pub unsafe fn extend_trusted_len_unchecked(&mut self, iterator: I) + where + P: AsRef, + I: Iterator>, + { + if self.validity.is_none() { + self.validity = Some(MutableBitmap::from_trusted_len_iter( + std::iter::repeat(true).take(self.len()), + )); + } + + extend_from_trusted_len_iter( + &mut self.offsets, + &mut self.values, + &mut self.validity.as_mut().unwrap(), + iterator, + ); + + if self.validity.as_mut().unwrap().null_count() == 0 { + self.validity = None; + } + } + /// Creates a [`MutableUtf8Array`] from an iterator of trusted length. /// # Safety /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). @@ -377,37 +446,13 @@ where P: AsRef, I: Iterator>, { - let (_, upper) = iterator.size_hint(); - let len = upper.expect("trusted_len_unzip requires an upper limit"); - - let mut validity = MutableBitmap::with_capacity(len); - let mut offsets = MutableBuffer::::with_capacity(len + 1); + let mut offsets = MutableBuffer::::with_capacity(1); let mut values = MutableBuffer::::new(); + let mut validity = MutableBitmap::new(); - let mut length = O::default(); - let mut dst = offsets.as_mut_ptr(); - std::ptr::write(dst, length); - dst = dst.add(1); - for item in iterator { - if let Some(item) = item { - validity.push(true); - let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); - } else { - validity.push(false); - values.extend_from_slice(b""); - }; + offsets.push_unchecked(O::default()); - std::ptr::write(dst, length); - dst = dst.add(1); - } - assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - len + 1, - "Trusted iterator length was not accurately reported" - ); - offsets.set_len(len + 1); + extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator); let validity = if validity.null_count() > 0 { Some(validity) @@ -481,33 +526,111 @@ where O: Offset, P: AsRef, I: Iterator, +{ + let mut offsets = MutableBuffer::::with_capacity(1); + let mut values = MutableBuffer::::new(); + + offsets.push_unchecked(O::default()); + + extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator); + + (offsets, values) +} + +/// Populates `offsets` and `values` [`Buffer`] with information +/// extracted from the incoming iterator. +/// # Safety +/// The caller must ensure that `iterator` is [`TrustedLen`] +#[inline] +unsafe fn extend_from_trusted_len_values_iter( + offsets: &mut MutableBuffer, + values: &mut MutableBuffer, + iterator: I, +) where + O: Offset, + P: AsRef, + I: Iterator, { let (_, upper) = iterator.size_hint(); - let len = upper.expect("trusted_len_unzip requires an upper limit"); + let len = upper.expect("extend_from_trusted_len_iter_values requires an upper limit"); - let mut offsets = MutableBuffer::::with_capacity(len + 1); - let mut values = MutableBuffer::::new(); + offsets.reserve(len); + + let mut length = *offsets.last().unwrap(); - let mut length = O::default(); let mut dst = offsets.as_mut_ptr(); - std::ptr::write(dst, length); - dst = dst.add(1); + dst = dst.add(offsets.len()); + for item in iterator { let s = item.as_ref(); + length += O::from_usize(s.len()).unwrap(); + values.extend_from_slice(s.as_bytes()); + std::ptr::write(dst, length); + + dst = dst.add(1); + } + + assert_eq!( + dst.offset_from(offsets.as_ptr()) as usize, + offsets.len() + len, + "Trusted iterator length was not accurately reported" + ); + + offsets.set_len(offsets.len() + len); +} + +/// Populates `offsets`, `values`, and validity [`Buffer`] with information +/// extracted from the incoming iterator. +/// # Safety +/// The caller must ensure that `iterator` is [`TrustedLen`] +#[inline] +unsafe fn extend_from_trusted_len_iter( + offsets: &mut MutableBuffer, + values: &mut MutableBuffer, + validity: &mut MutableBitmap, + iterator: I, +) where + O: Offset, + P: AsRef, + I: Iterator>, +{ + let (_, upper) = iterator.size_hint(); + let len = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); + + offsets.reserve(len); + validity.reserve(len); + + let mut length = *offsets.last().unwrap(); + + let mut dst = offsets.as_mut_ptr(); + dst = dst.add(offsets.len()); + + for item in iterator { + if let Some(item) = item { + let s = item.as_ref(); + + length += O::from_usize(s.len()).unwrap(); + + values.extend_from_slice(s.as_bytes()); + validity.push_unchecked(true); + } else { + validity.push_unchecked(false); + }; std::ptr::write(dst, length); + dst = dst.add(1); } + assert_eq!( dst.offset_from(offsets.as_ptr()) as usize, - len + 1, + offsets.len() + len, "Trusted iterator length was not accurately reported" ); - offsets.set_len(len + 1); - (offsets, values) + offsets.set_len(offsets.len() + len); } /// Creates two [`MutableBuffer`]s from an iterator of `&str`. diff --git a/tests/it/array/utf8/mutable.rs b/tests/it/array/utf8/mutable.rs index c7ee553ee5e..260f90e79c6 100644 --- a/tests/it/array/utf8/mutable.rs +++ b/tests/it/array/utf8/mutable.rs @@ -45,3 +45,42 @@ fn wrong_data_type() { let values = MutableBuffer::from(b"abbb"); MutableUtf8Array::::from_data(DataType::Int8, offsets, values, None); } + +#[test] +fn test_extend_trusted_len_values() { + let mut array = MutableUtf8Array::::new(); + + array.extend_trusted_len_values(["hi", "there"].iter()); + array.extend_trusted_len_values(["hello"].iter()); + array.extend_trusted_len(vec![Some("again"), None].into_iter()); + + let array: Utf8Array = array.into(); + + assert_eq!(array.values().as_slice(), b"hitherehelloagain"); + assert_eq!(array.offsets().as_slice(), &[0, 2, 7, 12, 17, 17]); + assert_eq!( + array.validity(), + &Some(Bitmap::from_u8_slice(&[0b00001111], 5)) + ); +} + +#[test] +fn test_extend_trusted_len() { + let mut array = MutableUtf8Array::::new(); + + // TODO Understand why the following is not possible + //array.extend_trusted_len([Some("hi"), Some("there")].into_iter()); + + array.extend_trusted_len(vec![Some("hi"), Some("there")].into_iter()); + array.extend_trusted_len(vec![None, Some("hello")].into_iter()); + array.extend_trusted_len_values(["again"].iter()); + + let array: Utf8Array = array.into(); + + assert_eq!(array.values().as_slice(), b"hitherehelloagain"); + assert_eq!(array.offsets().as_slice(), &[0, 2, 7, 7, 12, 17]); + assert_eq!( + array.validity(), + &Some(Bitmap::from_u8_slice(&[0b00011011], 5)) + ); +} From 1744abc7b8f7f6c93846677ee5221e050982c1e3 Mon Sep 17 00:00:00 2001 From: VasanthakumarV Date: Fri, 17 Sep 2021 23:05:48 +0530 Subject: [PATCH 2/5] Avoid extra allocation on the offset buffer --- src/array/utf8/mutable.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 33d223bc583..0a2e7146f9f 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -527,7 +527,7 @@ where P: AsRef, I: Iterator, { - let mut offsets = MutableBuffer::::with_capacity(1); + let mut offsets = MutableBuffer::::with_capacity(1 + iterator.size_hint().1.unwrap()); let mut values = MutableBuffer::::new(); offsets.push_unchecked(O::default()); From 79be6461b380e1790f115a81cb3be14345be109a Mon Sep 17 00:00:00 2001 From: VasanthakumarV Date: Fri, 17 Sep 2021 23:06:49 +0530 Subject: [PATCH 3/5] Rename `len` as `additional` --- src/array/utf8/mutable.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 0a2e7146f9f..418cf260136 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -552,9 +552,9 @@ unsafe fn extend_from_trusted_len_values_iter( I: Iterator, { let (_, upper) = iterator.size_hint(); - let len = upper.expect("extend_from_trusted_len_iter_values requires an upper limit"); + let additional = upper.expect("extend_from_trusted_len_iter_values requires an upper limit"); - offsets.reserve(len); + offsets.reserve(additional); let mut length = *offsets.last().unwrap(); @@ -574,11 +574,11 @@ unsafe fn extend_from_trusted_len_values_iter( assert_eq!( dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + len, + offsets.len() + additional, "Trusted iterator length was not accurately reported" ); - offsets.set_len(offsets.len() + len); + offsets.set_len(offsets.len() + additional); } /// Populates `offsets`, `values`, and validity [`Buffer`] with information @@ -597,10 +597,10 @@ unsafe fn extend_from_trusted_len_iter( I: Iterator>, { let (_, upper) = iterator.size_hint(); - let len = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); + let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); - offsets.reserve(len); - validity.reserve(len); + offsets.reserve(additional); + validity.reserve(additional); let mut length = *offsets.last().unwrap(); @@ -626,11 +626,11 @@ unsafe fn extend_from_trusted_len_iter( assert_eq!( dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + len, + offsets.len() + additional, "Trusted iterator length was not accurately reported" ); - offsets.set_len(offsets.len() + len); + offsets.set_len(offsets.len() + additional); } /// Creates two [`MutableBuffer`]s from an iterator of `&str`. From cd4e3d4f15eb1d17c90c7a740d78ee7ab7e7b450 Mon Sep 17 00:00:00 2001 From: VasanthakumarV Date: Fri, 17 Sep 2021 23:07:10 +0530 Subject: [PATCH 4/5] Remove todo comments in test --- tests/it/array/utf8/mutable.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/it/array/utf8/mutable.rs b/tests/it/array/utf8/mutable.rs index 260f90e79c6..6afd6c666f2 100644 --- a/tests/it/array/utf8/mutable.rs +++ b/tests/it/array/utf8/mutable.rs @@ -68,9 +68,6 @@ fn test_extend_trusted_len_values() { fn test_extend_trusted_len() { let mut array = MutableUtf8Array::::new(); - // TODO Understand why the following is not possible - //array.extend_trusted_len([Some("hi"), Some("there")].into_iter()); - array.extend_trusted_len(vec![Some("hi"), Some("there")].into_iter()); array.extend_trusted_len(vec![None, Some("hello")].into_iter()); array.extend_trusted_len_values(["again"].iter()); From 2d1aa1183028c5cd1dd81e1a44f865872d664217 Mon Sep 17 00:00:00 2001 From: VasanthakumarV Date: Fri, 17 Sep 2021 23:28:54 +0530 Subject: [PATCH 5/5] Use `extend_constant` for `validity` initialization --- src/array/utf8/mutable.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 418cf260136..cbc3904e022 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -276,9 +276,9 @@ impl MutableUtf8Array { I: Iterator>, { if self.validity.is_none() { - self.validity = Some(MutableBitmap::from_trusted_len_iter( - std::iter::repeat(true).take(self.len()), - )); + let mut validity = MutableBitmap::new(); + validity.extend_constant(self.len(), true); + self.validity = Some(validity); } extend_from_trusted_len_iter(