diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 5b3422012fd..e4b69ddc371 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -120,7 +120,7 @@ impl Buffer { /// Returns a new [`Buffer`] that is a slice of this buffer starting at `offset`. /// Doing so allows the same memory region to be shared between buffers. /// # Panics - /// Panics iff `offset` is larger than `len`. + /// Panics iff `offset + length` is larger than `len`. #[inline] pub fn sliced(self, offset: usize, length: usize) -> Self { assert!( diff --git a/src/io/parquet/write/binary/nested.rs b/src/io/parquet/write/binary/nested.rs index 1e9e45517eb..950ea4190ca 100644 --- a/src/io/parquet/write/binary/nested.rs +++ b/src/io/parquet/write/binary/nested.rs @@ -4,7 +4,7 @@ use parquet2::{encoding::Encoding, page::DataPage}; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::{slice_nested_leaf, Nested}; +use crate::io::parquet::write::Nested; use crate::{ array::{Array, BinaryArray}, error::Result, @@ -22,34 +22,21 @@ where { let is_optional = is_nullable(&type_.field_info); - // we slice the leaf by the offsets as dremel only computes lengths and thus - // does NOT take the starting offset into account. - // By slicing the leaf array we also don't write too many values. - let (start, len) = slice_nested_leaf(nested); - - let mut nested = nested.to_vec(); - let array = array.clone().sliced(start, len); - if let Some(Nested::Primitive(_, _, c)) = nested.last_mut() { - *c = len; - } else { - unreachable!("") - } - let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = - nested::write_rep_and_def(options.version, &nested, &mut buffer)?; + nested::write_rep_and_def(options.version, nested, &mut buffer)?; - encode_plain(&array, is_optional, &mut buffer); + encode_plain(array, is_optional, &mut buffer); let statistics = if options.write_statistics { - Some(build_statistics(&array, type_.clone())) + Some(build_statistics(array, type_.clone())) } else { None }; utils::build_plain_page( buffer, - nested::num_values(&nested), + nested::num_values(nested), nested[0].len(), array.null_count(), repetition_levels_byte_length, diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 6e10a19b164..a87dfd54975 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -80,13 +80,13 @@ pub fn slice_nested_leaf(nested: &[Nested]) -> (usize, usize) { for nested in nested.iter().rev() { match nested { Nested::LargeList(l_nested) => { - let start = *l_nested.offsets.first().unwrap(); - let end = *l_nested.offsets.last().unwrap(); + let start = *l_nested.offsets.first(); + let end = *l_nested.offsets.last(); return (start as usize, (end - start) as usize); } Nested::List(l_nested) => { - let start = *l_nested.offsets.first().unwrap(); - let end = *l_nested.offsets.last().unwrap(); + let start = *l_nested.offsets.first(); + let end = *l_nested.offsets.last(); return (start as usize, (end - start) as usize); } Nested::Primitive(_, _, len) => out = (0, *len), @@ -158,137 +158,124 @@ pub fn can_encode(data_type: &DataType, encoding: Encoding) -> bool { } /// Slices the [`Array`] to `Box` and `Vec`. -pub fn slice_parquet_array<'a>( - array: &'a dyn Array, - nested: &'a [Nested<'a>], - offset: usize, - length: usize, -) -> (Box, Vec>) { - let mut nested = nested.to_vec(); - - let mut is_nested = false; +pub fn slice_parquet_array( + primitive_array: &mut dyn Array, + nested: &mut [Nested], + mut current_offset: usize, + mut current_length: usize, +) { for nested in nested.iter_mut() { match nested { Nested::LargeList(l_nested) => { - is_nested = true; - // the slice is a bit awkward because we always want the latest value to compute the next length; - l_nested.offsets = &l_nested.offsets - [offset..offset + std::cmp::min(length + 1, l_nested.offsets.len())]; + l_nested.offsets.slice(current_offset, current_length + 1); + if let Some(validity) = l_nested.validity.as_mut() { + validity.slice(current_offset, current_length) + }; + + current_length = l_nested.offsets.range() as usize; + current_offset = *l_nested.offsets.first() as usize; } Nested::List(l_nested) => { - is_nested = true; - l_nested.offsets = &l_nested.offsets - [offset..offset + std::cmp::min(length + 1, l_nested.offsets.len())]; + l_nested.offsets.slice(current_offset, current_length + 1); + if let Some(validity) = l_nested.validity.as_mut() { + validity.slice(current_offset, current_length) + }; + + current_length = l_nested.offsets.range() as usize; + current_offset = *l_nested.offsets.first() as usize; + } + Nested::Struct(validity, _, length) => { + *length = current_length; + if let Some(validity) = validity.as_mut() { + validity.slice(current_offset, current_length) + }; + } + Nested::Primitive(validity, _, length) => { + *length = current_length; + if let Some(validity) = validity.as_mut() { + validity.slice(current_offset, current_length) + }; + primitive_array.slice(current_offset, current_length); } - _ => {} } } - if is_nested { - (array.to_boxed(), nested) - } else { - (array.to_boxed().sliced(offset, length), nested) - } } /// Get the length of [`Array`] that should be sliced. -pub fn get_max_length(array: &dyn Array, nested: &[Nested]) -> usize { - // the inner nested structure that - // dictates how often the primitive should be repeated - for nested in nested.iter().rev() { +pub fn get_max_length(nested: &[Nested]) -> usize { + let mut length = 0; + for nested in nested.iter() { match nested { - Nested::LargeList(l_nested) => return l_nested.offsets.len() - 1, - Nested::List(l_nested) => return l_nested.offsets.len() - 1, + Nested::LargeList(l_nested) => length += l_nested.offsets.range() as usize, + Nested::List(l_nested) => length += l_nested.offsets.range() as usize, _ => {} } } - array.len() + length } /// Returns an iterator of [`Page`]. -#[allow(clippy::needless_collect)] pub fn array_to_pages( - array: &dyn Array, + primitive_array: &dyn Array, type_: ParquetPrimitiveType, nested: &[Nested], options: WriteOptions, encoding: Encoding, ) -> Result>> { - // maximum page size is 2^31 e.g. i32::MAX - // we split at 2^31 - 2^25 to err on the safe side - // we also check for an array.len > 3 to prevent infinite recursion - // still have to figure out how to deal with values that are i32::MAX size, such as very large - // strings or a list column with many elements - - let array_byte_size = estimated_bytes_size(array); - if array_byte_size >= (2u32.pow(31) - 2u32.pow(25)) as usize && array.len() > 3 { - let length = get_max_length(array, nested); - let split_at = length / 2; - let (sub_array_left, subnested_left) = slice_parquet_array(array, nested, 0, split_at); - let (sub_array_right, subnested_right) = - slice_parquet_array(array, nested, split_at, length - split_at); - - Ok(DynIter::new( - array_to_pages( - sub_array_left.as_ref(), - type_.clone(), - subnested_left.as_ref(), - options, - encoding, - )? - .chain(array_to_pages( - sub_array_right.as_ref(), + if let DataType::Dictionary(key_type, _, _) = primitive_array.data_type().to_logical_type() { + return match_integer_type!(key_type, |$T| { + dictionary::array_to_pages::<$T>( + primitive_array.as_any().downcast_ref().unwrap(), type_, - subnested_right.as_ref(), + &nested, options, encoding, - )?), - )) + ) + }); + }; + + let nested = nested.to_vec(); + let primitive_array = primitive_array.to_boxed(); + + let number_of_rows = nested[0].len(); + + // note: this is not correct if the array is sliced - the estimation should happen on the + // primitive after sliced for parquet + let byte_size = estimated_bytes_size(primitive_array.as_ref()); + + const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; + let max_page_size = options.data_pagesize_limit.unwrap_or(DEFAULT_PAGE_SIZE); + let max_page_size = max_page_size.min(2usize.pow(31) - 2usize.pow(25)); // allowed maximum page size + let bytes_per_row = if number_of_rows == 0 { + 0 } else { - match array.data_type() { - DataType::Dictionary(key_type, _, _) => { - match_integer_type!(key_type, |$T| { - dictionary::array_to_pages::<$T>( - array.as_any().downcast_ref().unwrap(), - type_, - nested, - options, - encoding, - ) - }) - } - _ => { - const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; - let page_size = options.data_pagesize_limit.unwrap_or(DEFAULT_PAGE_SIZE); - let bytes_per_row = - ((array_byte_size as f64) / ((array.len() + 1) as f64)) as usize; - let rows_per_page = (page_size / (bytes_per_row + 1)).max(1); - - let length = get_max_length(array, nested); - let vs: Vec> = (0..length) - .step_by(rows_per_page) - .map(|offset| { - let length = if offset + rows_per_page > length { - length - offset - } else { - rows_per_page - }; - - let (sub_array, subnested) = - slice_parquet_array(array, nested, offset, length); - array_to_page( - sub_array.as_ref(), - type_.clone(), - &subnested, - options, - encoding, - ) - }) - .collect(); - - Ok(DynIter::new(vs.into_iter())) - } - } - } + ((byte_size as f64) / (number_of_rows as f64)) as usize + }; + let rows_per_page = (max_page_size / (bytes_per_row + 1)).max(1); + + let pages = (0..number_of_rows) + .step_by(rows_per_page) + .map(move |offset| { + let length = if offset + rows_per_page > number_of_rows { + number_of_rows - offset + } else { + rows_per_page + }; + + let mut right_array = primitive_array.clone(); + let mut right_nested = nested.clone(); + slice_parquet_array(right_array.as_mut(), &mut right_nested, offset, length); + + array_to_page( + right_array.as_ref(), + type_.clone(), + &right_nested, + options, + encoding, + ) + }); + + Ok(DynIter::new(pages)) } /// Converts an [`Array`] to a [`CompressedPage`] based on options, descriptor and `encoding`. diff --git a/src/io/parquet/write/nested/def.rs b/src/io/parquet/write/nested/def.rs index 2b958cfcb9a..b8576098467 100644 --- a/src/io/parquet/write/nested/def.rs +++ b/src/io/parquet/write/nested/def.rs @@ -9,7 +9,7 @@ trait DebugIter: Iterator + std::fmt::Debug {} impl + std::fmt::Debug> DebugIter for A {} fn single_iter<'a>( - validity: Option<&'a Bitmap>, + validity: &'a Option, is_optional: bool, length: usize, ) -> Box { @@ -26,23 +26,23 @@ fn single_iter<'a>( } } -fn single_list_iter<'a, O: Offset>(nested: &ListNested<'a, O>) -> Box { - match (nested.is_optional, nested.validity) { +fn single_list_iter<'a, O: Offset>(nested: &'a ListNested) -> Box { + match (nested.is_optional, &nested.validity) { (false, _) => Box::new( std::iter::repeat(0u32) - .zip(to_length(nested.offsets)) + .zip(to_length(&nested.offsets)) .map(|(a, b)| (a + (b != 0) as u32, b)), ) as Box, (true, None) => Box::new( std::iter::repeat(1u32) - .zip(to_length(nested.offsets)) + .zip(to_length(&nested.offsets)) .map(|(a, b)| (a + (b != 0) as u32, b)), ) as Box, (true, Some(validity)) => Box::new( validity .iter() .map(|x| (x as u32)) - .zip(to_length(nested.offsets)) + .zip(to_length(&nested.offsets)) .map(|(a, b)| (a + (b != 0) as u32, b)), ) as Box, } @@ -53,12 +53,12 @@ fn iter<'a>(nested: &'a [Nested]) -> Vec> { .iter() .map(|nested| match nested { Nested::Primitive(validity, is_optional, length) => { - single_iter(*validity, *is_optional, *length) + single_iter(validity, *is_optional, *length) } Nested::List(nested) => single_list_iter(nested), Nested::LargeList(nested) => single_list_iter(nested), Nested::Struct(validity, is_optional, length) => { - single_iter(*validity, *is_optional, *length) + single_iter(validity, *is_optional, *length) } }) .collect() @@ -184,11 +184,10 @@ mod tests { fn struct_optional() { let b = [ true, false, true, true, false, true, false, false, true, true, - ] - .into(); + ]; let nested = vec![ Nested::Struct(None, true, 10), - Nested::Primitive(Some(&b), true, 10), + Nested::Primitive(Some(b.into()), true, 10), ]; let expected = vec![2, 1, 2, 2, 1, 2, 1, 1, 2, 2]; @@ -200,7 +199,7 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2], + offsets: vec![0, 2].try_into().unwrap(), validity: None, }), Nested::Primitive(None, true, 2), @@ -214,11 +213,10 @@ mod tests { fn struct_optional_1() { let b = [ true, false, true, true, false, true, false, false, true, true, - ] - .into(); + ]; let nested = vec![ Nested::Struct(None, true, 10), - Nested::Primitive(Some(&b), true, 10), + Nested::Primitive(Some(b.into()), true, 10), ]; let expected = vec![2, 1, 2, 2, 1, 2, 1, 1, 2, 2]; @@ -242,7 +240,7 @@ mod tests { // [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] Nested::List(ListNested { is_optional: false, - offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + offsets: vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(), validity: None, }), Nested::Primitive(None, false, 12), @@ -256,22 +254,21 @@ mod tests { fn l1_optional_optional() { // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] - let v0 = [true, false, true, true, true, true, false, true].into(); + let v0 = [true, false, true, true, true, true, false, true]; let v1 = [ true, true, //[0, 1] true, false, true, //[2, None, 3] true, true, true, //[4, 5, 6] true, true, true, //[7, 8, 9] true, //[10] - ] - .into(); + ]; let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], - validity: Some(&v0), + offsets: vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(), + validity: Some(v0.into()), }), - Nested::Primitive(Some(&v1), true, 12), + Nested::Primitive(Some(v1.into()), true, 12), ]; let expected = vec![3u32, 3, 0, 3, 2, 3, 3, 3, 3, 1, 3, 3, 3, 0, 3]; @@ -295,12 +292,12 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: false, - offsets: &[0, 2, 4], + offsets: vec![0, 2, 4].try_into().unwrap(), validity: None, }), Nested::List(ListNested { is_optional: false, - offsets: &[0, 3, 7, 8, 10], + offsets: vec![0, 3, 7, 8, 10].try_into().unwrap(), validity: None, }), Nested::Primitive(None, false, 10), @@ -312,7 +309,7 @@ mod tests { #[test] fn l2_optional_required_required() { - let a = [true, false, true, true].into(); + let a = [true, false, true, true]; /* [ [ @@ -330,12 +327,12 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 2, 2, 5], - validity: Some(&a), + offsets: vec![0, 2, 2, 2, 5].try_into().unwrap(), + validity: Some(a.into()), }), Nested::List(ListNested { is_optional: false, - offsets: &[0, 3, 7, 8, 8, 10], + offsets: vec![0, 3, 7, 8, 8, 10].try_into().unwrap(), validity: None, }), Nested::Primitive(None, false, 10), @@ -347,8 +344,8 @@ mod tests { #[test] fn l2_optional_optional_required() { - let a = [true, false, true].into(); - let b = [true, true, true, true, false].into(); + let a = [true, false, true]; + let b = [true, true, true, true, false]; /* [ [ @@ -366,13 +363,13 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 2, 5], - validity: Some(&a), + offsets: vec![0, 2, 2, 5].try_into().unwrap(), + validity: Some(a.into()), }), Nested::List(ListNested { is_optional: true, - offsets: &[0, 3, 7, 8, 8, 8], - validity: Some(&b), + offsets: vec![0, 3, 7, 8, 8, 8].try_into().unwrap(), + validity: Some(b.into()), }), Nested::Primitive(None, false, 8), ]; @@ -383,9 +380,9 @@ mod tests { #[test] fn l2_optional_optional_optional() { - let a = [true, false, true].into(); - let b = [true, true, true, false].into(); - let c = [true, true, true, true, false, true, true, true].into(); + let a = [true, false, true]; + let b = [true, true, true, false]; + let c = [true, true, true, true, false, true, true, true]; /* [ [ @@ -402,15 +399,15 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 2, 4], - validity: Some(&a), + offsets: vec![0, 2, 2, 4].try_into().unwrap(), + validity: Some(a.into()), }), Nested::List(ListNested { is_optional: true, - offsets: &[0, 3, 7, 8, 8], - validity: Some(&b), + offsets: vec![0, 3, 7, 8, 8].try_into().unwrap(), + validity: Some(b.into()), }), - Nested::Primitive(Some(&c), true, 8), + Nested::Primitive(Some(c.into()), true, 8), ]; let expected = vec![5, 5, 5, 5, 4, 5, 5, 0, 5, 2]; @@ -431,21 +428,19 @@ mod tests { fn nested_list_struct_nullable() { let a = [ true, true, true, false, true, false, false, false, true, true, true, true, - ] - .into(); + ]; let b = [ true, true, true, false, true, true, true, true, true, true, true, true, - ] - .into(); - let c = [true, false, true, true, true, true, false, true].into(); + ]; + let c = [true, false, true, true, true, true, false, true]; let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], - validity: Some(&c), + offsets: vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(), + validity: Some(c.into()), }), - Nested::Struct(Some(&b), true, 12), - Nested::Primitive(Some(&a), true, 12), + Nested::Struct(Some(b.into()), true, 12), + Nested::Primitive(Some(a.into()), true, 12), ]; let expected = vec![4, 4, 0, 4, 2, 4, 3, 3, 3, 1, 4, 4, 4, 0, 4]; @@ -454,12 +449,12 @@ mod tests { #[test] fn nested_list_struct_nullable1() { - let c = [true, false].into(); + let c = [true, false]; let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 1, 1], - validity: Some(&c), + offsets: vec![0, 1, 1].try_into().unwrap(), + validity: Some(c.into()), }), Nested::Struct(None, true, 1), Nested::Primitive(None, true, 1), @@ -471,19 +466,18 @@ mod tests { #[test] fn nested_struct_list_nullable() { - let a = [true, false, true, true, true, true, false, true].into(); + let a = [true, false, true, true, true, true, false, true]; let b = [ true, true, true, false, true, true, true, true, true, true, true, true, - ] - .into(); + ]; let nested = vec![ Nested::Struct(None, true, 12), Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], - validity: Some(&a), + offsets: vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(), + validity: Some(a.into()), }), - Nested::Primitive(Some(&b), true, 12), + Nested::Primitive(Some(b.into()), true, 12), ]; let expected = vec![4, 4, 1, 4, 3, 4, 4, 4, 4, 2, 4, 4, 4, 1, 4]; @@ -492,13 +486,13 @@ mod tests { #[test] fn nested_struct_list_nullable1() { - let a = [true, true, false].into(); + let a = [true, true, false]; let nested = vec![ Nested::Struct(None, true, 3), Nested::List(ListNested { is_optional: true, - offsets: &[0, 1, 1, 1], - validity: Some(&a), + offsets: vec![0, 1, 1, 1].try_into().unwrap(), + validity: Some(a.into()), }), Nested::Primitive(None, true, 1), ]; @@ -515,23 +509,23 @@ mod tests { ] */ - let a = [true].into(); - let b = [true, false].into(); - let c = [true, false].into(); - let d = [true].into(); + let a = [true]; + let b = [true, false]; + let c = [true, false]; + let d = [true]; let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2], - validity: Some(&a), + offsets: vec![0, 2].try_into().unwrap(), + validity: Some(a.into()), }), - Nested::Struct(Some(&b), true, 2), + Nested::Struct(Some(b.into()), true, 2), Nested::List(ListNested { is_optional: true, - offsets: &[0, 1, 1], - validity: Some(&c), + offsets: vec![0, 1, 1].try_into().unwrap(), + validity: Some(c.into()), }), - Nested::Primitive(Some(&d), true, 1), + Nested::Primitive(Some(d.into()), true, 1), ]; /* 0 6 @@ -559,29 +553,29 @@ mod tests { [{"a": []}], ] */ - let a = [true, false, true, true, true, true, false, true].into(); + let a = [true, false, true, true, true, true, false, true]; let b = [ true, true, true, false, true, true, true, true, true, true, true, true, - ] - .into(); + ]; let c = [ true, true, true, false, true, false, false, false, true, true, true, true, - ] - .into(); - let d = [true, true, true, true, true, false, true, true].into(); + ]; + let d = [true, true, true, true, true, false, true, true]; let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], - validity: Some(&a), + offsets: vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(), + validity: Some(a.into()), }), - Nested::Struct(Some(&b), true, 12), + Nested::Struct(Some(b.into()), true, 12), Nested::List(ListNested { is_optional: true, - offsets: &[0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8, 8], - validity: Some(&c), + offsets: vec![0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8, 8] + .try_into() + .unwrap(), + validity: Some(c.into()), }), - Nested::Primitive(Some(&d), true, 8), + Nested::Primitive(Some(d.into()), true, 8), ]; let expected = vec![6, 6, 0, 6, 2, 6, 3, 3, 3, 1, 6, 5, 6, 6, 0, 4]; diff --git a/src/io/parquet/write/nested/rep.rs b/src/io/parquet/write/nested/rep.rs index c8dc59e7fdc..b1bca7bab33 100644 --- a/src/io/parquet/write/nested/rep.rs +++ b/src/io/parquet/write/nested/rep.rs @@ -10,9 +10,11 @@ fn iter<'a>(nested: &'a [Nested]) -> Vec> { .iter() .filter_map(|nested| match nested { Nested::Primitive(_, _, _) => None, - Nested::List(nested) => Some(Box::new(to_length(nested.offsets)) as Box), + Nested::List(nested) => { + Some(Box::new(to_length(&nested.offsets)) as Box) + } Nested::LargeList(nested) => { - Some(Box::new(to_length(nested.offsets)) as Box) + Some(Box::new(to_length(&nested.offsets)) as Box) } Nested::Struct(_, _, _) => None, }) @@ -171,7 +173,7 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: false, - offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + offsets: vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(), validity: None, }), Nested::Primitive(None, false, 12), @@ -186,12 +188,12 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: false, - offsets: &[0, 2, 2, 4], + offsets: vec![0, 2, 2, 4].try_into().unwrap(), validity: None, }), Nested::List(ListNested { is_optional: false, - offsets: &[0, 3, 7, 8, 10], + offsets: vec![0, 3, 7, 8, 10].try_into().unwrap(), validity: None, }), Nested::Primitive(None, false, 10), @@ -211,7 +213,7 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 1, 2], + offsets: vec![0, 1, 2].try_into().unwrap(), validity: None, }), Nested::Struct(None, true, 2), @@ -227,13 +229,13 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 3], + offsets: vec![0, 2, 3].try_into().unwrap(), validity: None, }), Nested::Struct(None, true, 3), Nested::List(ListNested { is_optional: true, - offsets: &[0, 3, 6, 7], + offsets: vec![0, 3, 6, 7].try_into().unwrap(), validity: None, }), Nested::Primitive(None, true, 7), @@ -252,7 +254,7 @@ mod tests { Nested::Struct(None, true, 1), Nested::List(ListNested { is_optional: true, - offsets: &[0, 4], + offsets: vec![0, 4].try_into().unwrap(), validity: None, }), Nested::Primitive(None, true, 4), @@ -267,12 +269,12 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: false, - offsets: &[0, 1, 1, 3, 5, 5, 8, 8, 9], + offsets: vec![0, 1, 1, 3, 5, 5, 8, 8, 9].try_into().unwrap(), validity: None, }), Nested::List(ListNested { is_optional: false, - offsets: &[0, 2, 4, 5, 7, 8, 9, 10, 11, 12], + offsets: vec![0, 2, 4, 5, 7, 8, 9, 10, 11, 12].try_into().unwrap(), validity: None, }), Nested::Primitive(None, false, 12), @@ -300,13 +302,13 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + offsets: vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(), validity: None, }), Nested::Struct(None, true, 12), Nested::List(ListNested { is_optional: true, - offsets: &[0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8], + offsets: vec![0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8].try_into().unwrap(), validity: None, }), Nested::Primitive(None, true, 8), @@ -327,13 +329,13 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 1], + offsets: vec![0, 1].try_into().unwrap(), validity: None, }), Nested::Struct(None, true, 12), Nested::List(ListNested { is_optional: true, - offsets: &[0, 0], + offsets: vec![0, 0].try_into().unwrap(), validity: None, }), Nested::Primitive(None, true, 0), @@ -348,13 +350,13 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0, 1, 1], + offsets: vec![0, 1, 1].try_into().unwrap(), validity: None, }), Nested::Struct(None, true, 12), Nested::List(ListNested { is_optional: true, - offsets: &[0, 0], + offsets: vec![0, 0].try_into().unwrap(), validity: None, }), Nested::Primitive(None, true, 0), diff --git a/src/io/parquet/write/pages.rs b/src/io/parquet/write/pages.rs index e7ddf8991c4..5012c0323cf 100644 --- a/src/io/parquet/write/pages.rs +++ b/src/io/parquet/write/pages.rs @@ -6,7 +6,7 @@ use crate::array::{ListArray, StructArray}; use crate::bitmap::Bitmap; use crate::datatypes::PhysicalType; use crate::io::parquet::read::schema::is_nullable; -use crate::offset::Offset; +use crate::offset::{Offset, OffsetsBuffer}; use crate::{ array::Array, error::{Error, Result}, @@ -15,14 +15,14 @@ use crate::{ use super::{array_to_pages, Encoding, WriteOptions}; #[derive(Debug, Clone, PartialEq)] -pub struct ListNested<'a, O: Offset> { +pub struct ListNested { pub is_optional: bool, - pub offsets: &'a [O], - pub validity: Option<&'a Bitmap>, + pub offsets: OffsetsBuffer, + pub validity: Option, } -impl<'a, O: Offset> ListNested<'a, O> { - pub fn new(offsets: &'a [O], validity: Option<&'a Bitmap>, is_optional: bool) -> Self { +impl ListNested { + pub fn new(offsets: OffsetsBuffer, validity: Option, is_optional: bool) -> Self { Self { is_optional, offsets, @@ -33,43 +33,43 @@ impl<'a, O: Offset> ListNested<'a, O> { /// Descriptor of nested information of a field #[derive(Debug, Clone, PartialEq)] -pub enum Nested<'a> { +pub enum Nested { /// a primitive (leaf or parquet column) /// bitmap, _, length - Primitive(Option<&'a Bitmap>, bool, usize), + Primitive(Option, bool, usize), /// a list - List(ListNested<'a, i32>), + List(ListNested), /// a list - LargeList(ListNested<'a, i64>), + LargeList(ListNested), /// a struct - Struct(Option<&'a Bitmap>, bool, usize), + Struct(Option, bool, usize), } -impl Nested<'_> { +impl Nested { /// Returns the length (number of rows) of the element pub fn len(&self) -> usize { match self { Nested::Primitive(_, _, length) => *length, - Nested::List(nested) => nested.offsets.len() - 1, - Nested::LargeList(nested) => nested.offsets.len() - 1, + Nested::List(nested) => nested.offsets.len(), + Nested::LargeList(nested) => nested.offsets.len(), Nested::Struct(_, _, len) => *len, } } } /// Constructs the necessary `Vec>` to write the rep and def levels of `array` to parquet -pub fn to_nested<'a>(array: &'a dyn Array, type_: &ParquetType) -> Result>>> { +pub fn to_nested(array: &dyn Array, type_: &ParquetType) -> Result>> { let mut nested = vec![]; to_nested_recursive(array, type_, &mut nested, vec![])?; Ok(nested) } -fn to_nested_recursive<'a>( - array: &'a dyn Array, +fn to_nested_recursive( + array: &dyn Array, type_: &ParquetType, - nested: &mut Vec>>, - mut parents: Vec>, + nested: &mut Vec>, + mut parents: Vec, ) -> Result<()> { let is_optional = is_nullable(type_.get_field_info()); @@ -85,7 +85,11 @@ fn to_nested_recursive<'a>( )); }; - parents.push(Nested::Struct(array.validity(), is_optional, array.len())); + parents.push(Nested::Struct( + array.validity().cloned(), + is_optional, + array.len(), + )); for (type_, array) in fields.iter().zip(array.values()) { to_nested_recursive(array.as_ref(), type_, nested, parents.clone())?; @@ -108,8 +112,8 @@ fn to_nested_recursive<'a>( }; parents.push(Nested::List(ListNested::new( - array.offsets().buffer(), - array.validity(), + array.offsets().clone(), + array.validity().cloned(), is_optional, ))); to_nested_recursive(array.values().as_ref(), type_, nested, parents)?; @@ -131,15 +135,15 @@ fn to_nested_recursive<'a>( }; parents.push(Nested::LargeList(ListNested::new( - array.offsets().buffer(), - array.validity(), + array.offsets().clone(), + array.validity().cloned(), is_optional, ))); to_nested_recursive(array.values().as_ref(), type_, nested, parents)?; } _ => { parents.push(Nested::Primitive( - array.validity(), + array.validity().cloned(), is_optional, array.len(), )); @@ -290,11 +294,11 @@ mod tests { a, vec![ vec![ - Nested::Struct(Some(&Bitmap::from([true, true, false, true])), true, 4), + Nested::Struct(Some(Bitmap::from([true, true, false, true])), true, 4), Nested::Primitive(None, false, 4), ], vec![ - Nested::Struct(Some(&Bitmap::from([true, true, false, true])), true, 4), + Nested::Struct(Some(Bitmap::from([true, true, false, true])), true, 4), Nested::Primitive(None, false, 4), ], ] @@ -379,25 +383,25 @@ mod tests { // a.b.b vec![ Nested::Struct(None, false, 4), - Nested::Struct(Some(&Bitmap::from([true, true, false, true])), true, 4), + Nested::Struct(Some(Bitmap::from([true, true, false, true])), true, 4), Nested::Primitive(None, false, 4), ], // a.b.c vec![ Nested::Struct(None, false, 4), - Nested::Struct(Some(&Bitmap::from([true, true, false, true])), true, 4), + Nested::Struct(Some(Bitmap::from([true, true, false, true])), true, 4), Nested::Primitive(None, false, 4), ], // a.c.b vec![ Nested::Struct(None, false, 4), - Nested::Struct(Some(&Bitmap::from([true, true, false, true])), true, 4), + Nested::Struct(Some(Bitmap::from([true, true, false, true])), true, 4), Nested::Primitive(None, false, 4), ], // a.c.c vec![ Nested::Struct(None, false, 4), - Nested::Struct(Some(&Bitmap::from([true, true, false, true])), true, 4), + Nested::Struct(Some(Bitmap::from([true, true, false, true])), true, 4), Nested::Primitive(None, false, 4), ], ] @@ -487,19 +491,19 @@ mod tests { vec![ Nested::List(ListNested:: { is_optional: false, - offsets: &[0, 2, 4], + offsets: vec![0, 2, 4].try_into().unwrap(), validity: None, }), - Nested::Struct(Some(&Bitmap::from([true, true, false, true])), true, 4), + Nested::Struct(Some(Bitmap::from([true, true, false, true])), true, 4), Nested::Primitive(None, false, 4), ], vec![ Nested::List(ListNested:: { is_optional: false, - offsets: &[0, 2, 4], + offsets: vec![0, 2, 4].try_into().unwrap(), validity: None, }), - Nested::Struct(Some(&Bitmap::from([true, true, false, true])), true, 4), + Nested::Struct(Some(Bitmap::from([true, true, false, true])), true, 4), Nested::Primitive(None, false, 4), ], ] diff --git a/src/io/parquet/write/primitive/nested.rs b/src/io/parquet/write/primitive/nested.rs index c8bf625dc25..5814b60204c 100644 --- a/src/io/parquet/write/primitive/nested.rs +++ b/src/io/parquet/write/primitive/nested.rs @@ -7,7 +7,7 @@ use super::super::utils; use super::super::WriteOptions; use super::basic::{build_statistics, encode_plain}; use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::{slice_nested_leaf, Nested}; +use crate::io::parquet::write::Nested; use crate::{ array::{Array, PrimitiveArray}, error::Result, @@ -29,27 +29,14 @@ where let mut buffer = vec![]; - // we slice the leaf by the offsets as dremel only computes lengths and thus - // does NOT take the starting offset into account. - // By slicing the leaf array we also don't write too many values. - let (start, len) = slice_nested_leaf(nested); - - let mut nested = nested.to_vec(); - let array = array.clone().sliced(start, len); - if let Some(Nested::Primitive(_, _, c)) = nested.last_mut() { - *c = len; - } else { - unreachable!("") - } - let (repetition_levels_byte_length, definition_levels_byte_length) = - nested::write_rep_and_def(options.version, &nested, &mut buffer)?; + nested::write_rep_and_def(options.version, nested, &mut buffer)?; - let buffer = encode_plain(&array, is_optional, buffer); + let buffer = encode_plain(array, is_optional, buffer); let statistics = if options.write_statistics { Some(serialize_statistics(&build_statistics( - &array, + array, type_.clone(), ))) } else { @@ -58,7 +45,7 @@ where utils::build_plain_page( buffer, - nested::num_values(&nested), + nested::num_values(nested), nested[0].len(), array.null_count(), repetition_levels_byte_length, diff --git a/src/io/parquet/write/utf8/nested.rs b/src/io/parquet/write/utf8/nested.rs index e7fe4dc5e0e..2792ef35712 100644 --- a/src/io/parquet/write/utf8/nested.rs +++ b/src/io/parquet/write/utf8/nested.rs @@ -4,7 +4,7 @@ use parquet2::{encoding::Encoding, page::DataPage}; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::{slice_nested_leaf, Nested}; +use crate::io::parquet::write::Nested; use crate::{ array::{Array, Utf8Array}, error::Result, @@ -21,34 +21,22 @@ where O: Offset, { let is_optional = is_nullable(&type_.field_info); - // we slice the leaf by the offsets as dremel only computes lengths and thus - // does NOT take the starting offset into account. - // By slicing the leaf array we also don't write too many values. - let (start, len) = slice_nested_leaf(nested); - - let mut nested = nested.to_vec(); - let array = array.clone().sliced(start, len); - if let Some(Nested::Primitive(_, _, c)) = nested.last_mut() { - *c = len; - } else { - unreachable!("") - } let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = - nested::write_rep_and_def(options.version, &nested, &mut buffer)?; + nested::write_rep_and_def(options.version, nested, &mut buffer)?; - encode_plain(&array, is_optional, &mut buffer); + encode_plain(array, is_optional, &mut buffer); let statistics = if options.write_statistics { - Some(build_statistics(&array, type_.clone())) + Some(build_statistics(array, type_.clone())) } else { None }; utils::build_plain_page( buffer, - nested::num_values(&nested), + nested::num_values(nested), nested[0].len(), array.null_count(), repetition_levels_byte_length, diff --git a/src/offset.rs b/src/offset.rs index 52d8e6c9672..2532084ef03 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -438,6 +438,14 @@ impl OffsetsBuffer { (start, end) } + /// Slices this [`OffsetsBuffer`]. + /// # Panics + /// Panics iff `offset + length` is larger than `len`. + #[inline] + pub fn slice(&mut self, offset: usize, length: usize) { + self.0.slice(offset, length); + } + /// Slices this [`OffsetsBuffer`] starting at `offset`. /// # Safety /// The caller must ensure `offset + length <= self.len()` @@ -520,3 +528,12 @@ impl TryFrom> for Offsets { )) } } + +impl std::ops::Deref for OffsetsBuffer { + type Target = [O]; + + #[inline] + fn deref(&self) -> &[O] { + self.0.as_slice() + } +} diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 4a7cc7fc99b..905d7c6dc04 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -9,7 +9,6 @@ use arrow2::{ io::parquet::read as p_read, io::parquet::read::statistics::*, io::parquet::write::*, - offset::Offset, types::{days_ms, NativeType}, }; @@ -1576,7 +1575,7 @@ fn arrow_type() -> Result<()> { fn data>( mut iter: I, inner_is_nullable: bool, -) -> ListArray { +) -> Box { // [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] let data = vec![ Some(vec![Some(iter.next().unwrap()), Some(iter.next().unwrap())]), @@ -1606,12 +1605,12 @@ fn data>( inner_is_nullable, ); array.try_extend(data).unwrap(); - array.into() + array.into_box() } -fn list_array_generic( +fn assert_array_roundtrip( is_nullable: bool, - array: ListArray, + array: Box, limit: Option, ) -> Result<()> { let schema = Schema::from(vec![Field::new( @@ -1619,22 +1618,22 @@ fn list_array_generic( array.data_type().clone(), is_nullable, )]); - let chunk = Chunk::try_new(vec![array.boxed()])?; + let chunk = Chunk::try_new(vec![array])?; assert_roundtrip(schema, chunk, limit) } fn test_list_array_required_required(limit: Option) -> Result<()> { - list_array_generic(false, data(0..12i8, false), limit)?; - list_array_generic(false, data(0..12i16, false), limit)?; - list_array_generic(false, data(0..12i32, false), limit)?; - list_array_generic(false, data(0..12i64, false), limit)?; - list_array_generic(false, data(0..12u8, false), limit)?; - list_array_generic(false, data(0..12u16, false), limit)?; - list_array_generic(false, data(0..12u32, false), limit)?; - list_array_generic(false, data(0..12u64, false), limit)?; - list_array_generic(false, data((0..12).map(|x| (x as f32) * 1.0), false), limit)?; - list_array_generic( + assert_array_roundtrip(false, data(0..12i8, false), limit)?; + assert_array_roundtrip(false, data(0..12i16, false), limit)?; + assert_array_roundtrip(false, data(0..12i32, false), limit)?; + assert_array_roundtrip(false, data(0..12i64, false), limit)?; + assert_array_roundtrip(false, data(0..12u8, false), limit)?; + assert_array_roundtrip(false, data(0..12u16, false), limit)?; + assert_array_roundtrip(false, data(0..12u32, false), limit)?; + assert_array_roundtrip(false, data(0..12u64, false), limit)?; + assert_array_roundtrip(false, data((0..12).map(|x| (x as f32) * 1.0), false), limit)?; + assert_array_roundtrip( false, data((0..12).map(|x| (x as f64) * 1.0f64), false), limit, @@ -1648,17 +1647,17 @@ fn list_array_required_required() -> Result<()> { #[test] fn list_array_optional_optional() -> Result<()> { - list_array_generic(true, data(0..12, true), None) + assert_array_roundtrip(true, data(0..12, true), None) } #[test] fn list_array_required_optional() -> Result<()> { - list_array_generic(true, data(0..12, false), None) + assert_array_roundtrip(true, data(0..12, false), None) } #[test] fn list_array_optional_required() -> Result<()> { - list_array_generic(false, data(0..12, true), None) + assert_array_roundtrip(false, data(0..12, true), None) } #[test] @@ -1671,7 +1670,7 @@ fn list_utf8() -> Result<()> { let mut array = MutableListArray::::new_with_field(MutableUtf8Array::::new(), "item", true); array.try_extend(data).unwrap(); - list_array_generic(false, array.into(), None) + assert_array_roundtrip(false, array.into_box(), None) } #[test] @@ -1684,7 +1683,7 @@ fn list_large_utf8() -> Result<()> { let mut array = MutableListArray::::new_with_field(MutableUtf8Array::::new(), "item", true); array.try_extend(data).unwrap(); - list_array_generic(false, array.into(), None) + assert_array_roundtrip(false, array.into_box(), None) } #[test] @@ -1697,7 +1696,41 @@ fn list_binary() -> Result<()> { let mut array = MutableListArray::::new_with_field(MutableBinaryArray::::new(), "item", true); array.try_extend(data).unwrap(); - list_array_generic(false, array.into(), None) + assert_array_roundtrip(false, array.into_box(), None) +} + +#[test] +fn list_slice() -> Result<()> { + let data = vec![ + Some(vec![None, Some(2)]), + Some(vec![Some(3), Some(4)]), + Some(vec![Some(5), Some(6)]), + ]; + let mut array = MutableListArray::::new_with_field( + MutablePrimitiveArray::::new(), + "item", + true, + ); + array.try_extend(data).unwrap(); + let a: ListArray = array.into(); + let a = a.sliced(2, 1); + assert_array_roundtrip(false, a.boxed(), None) +} + +#[test] +fn struct_slice() -> Result<()> { + let a = pyarrow_nested_nullable("struct_list_nullable"); + + let a = a.sliced(2, 1); + assert_array_roundtrip(true, a, None) +} + +#[test] +fn list_struct_slice() -> Result<()> { + let a = pyarrow_nested_nullable("list_struct_nullable"); + + let a = a.sliced(2, 1); + assert_array_roundtrip(true, a, None) } #[test] @@ -1710,7 +1743,7 @@ fn large_list_large_binary() -> Result<()> { let mut array = MutableListArray::::new_with_field(MutableBinaryArray::::new(), "item", true); array.try_extend(data).unwrap(); - list_array_generic(false, array.into(), None) + assert_array_roundtrip(false, array.into_box(), None) } #[test] @@ -1726,7 +1759,7 @@ fn list_utf8_nullable() -> Result<()> { let mut array = MutableListArray::::new_with_field(MutableUtf8Array::::new(), "item", true); array.try_extend(data).unwrap(); - list_array_generic(true, array.into(), None) + assert_array_roundtrip(true, array.into_box(), None) } #[test] @@ -1745,7 +1778,7 @@ fn list_int_nullable() -> Result<()> { true, ); array.try_extend(data).unwrap(); - list_array_generic(true, array.into(), None) + assert_array_roundtrip(true, array.into_box(), None) } #[test]