diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index a80e69b68af..1bb262e5439 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -158,6 +158,34 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: None, [""], ] + + list_struct_nullable = [ + [{"a": "a"}, {"a": "b"}], + None, + [{"a": "b"}, None, {"a": "b"}], + [{"a": None}, {"a": None}, {"a": None}], + [], + [{"a": "d"}, {"a": "d"}, {"a": "d"}], + None, + [{"a": "e"}], + ] + + struct_list_nullable = pa.StructArray.from_arrays( + [pa.array(string)], + fields=[("a", pa.list_(pa.utf8()))], + ) + + list_struct_list_nullable = [ + [{"a": ["a"]}, {"a": ["b"]}], + None, + [{"a": ["b"]}, None, {"a": ["b"]}], + [{"a": None}, {"a": None}, {"a": None}], + [], + [{"a": ["d"]}, {"a": [None]}, {"a": ["c", "d"]}], + None, + [{"a": []}], + ] + fields = [ pa.field("list_int64", pa.list_(pa.int64())), pa.field("list_int64_required", pa.list_(pa.field("item", pa.int64(), False))), @@ -180,6 +208,18 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: pa.field( "list_nested_inner_required_required_i64", pa.list_(pa.list_(pa.int64())) ), + pa.field( + "list_struct_nullable", + pa.list_(pa.struct([("a", pa.utf8())])), + ), + pa.field( + "struct_list_nullable", + pa.struct([("a", pa.list_(pa.utf8()))]), + ), + pa.field( + "list_struct_list_nullable", + pa.list_(pa.struct([("a", pa.list_(pa.utf8()))])), + ), ] schema = pa.schema(fields) return ( @@ -195,6 +235,9 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_nested_i64": items_nested, "list_nested_inner_required_i64": items_required_nested, "list_nested_inner_required_required_i64": items_required_nested_2, + "list_struct_nullable": list_struct_nullable, + "struct_list_nullable": struct_list_nullable, + "list_struct_list_nullable": list_struct_list_nullable, }, schema, f"nested_nullable_10.parquet", @@ -246,7 +289,9 @@ def case_struct() -> Tuple[dict, pa.Schema, str]: struct_nullable = pa.StructArray.from_arrays( [pa.array(string), pa.array(boolean)], fields=struct_fields, - mask=pa.array([False, False, True, False, False, False, False, False, False, False]), + mask=pa.array( + [False, False, True, False, False, False, False, False, False, False] + ), ) return ( @@ -260,7 +305,20 @@ def case_struct() -> Tuple[dict, pa.Schema, str]: "struct_struct_nullable": pa.StructArray.from_arrays( [struct, pa.array(boolean)], names=["f1", "f2"], - mask=pa.array([False, False, True, False, False, False, False, False, False, False]), + mask=pa.array( + [ + False, + False, + True, + False, + False, + False, + False, + False, + False, + False, + ] + ), ), }, schema, @@ -271,30 +329,48 @@ def case_struct() -> Tuple[dict, pa.Schema, str]: def case_nested_edge(): simple = [[0, 1]] null = [None] + empty = [[]] struct_list_nullable = pa.StructArray.from_arrays( [pa.array([["a", "b", None, "c"]])], - fields=[ - ("f1", pa.list_(pa.utf8())), - ], + fields=[("f1", pa.list_(pa.utf8()))], ) + list_struct_list_nullable = pa.ListArray.from_arrays([0, 1], struct_list_nullable) + fields = [ pa.field("simple", pa.list_(pa.int64())), pa.field("null", pa.list_(pa.field("item", pa.int64(), True))), + pa.field("empty", pa.list_(pa.field("item", pa.int64(), True))), pa.field( - "struct_list_nullable", - pa.struct([ - ("f1", pa.list_(pa.utf8())), - ]), - ) + "struct_list_nullable", + pa.struct( + [("f1", pa.list_(pa.utf8()))], + ), + ), + pa.field( + "list_struct_list_nullable", + pa.list_( + pa.field( + "item", + pa.struct( + [ + ("f1", pa.list_(pa.utf8())), + ] + ), + True, + ) + ), + ), ] schema = pa.schema(fields) return ( { "simple": simple, "null": null, + "empty": empty, "struct_list_nullable": struct_list_nullable, + "list_struct_list_nullable": list_struct_list_nullable, }, schema, f"nested_edge_nullable_10.parquet", diff --git a/src/io/parquet/write/binary/nested.rs b/src/io/parquet/write/binary/nested.rs index 08c21d616f9..fd2fad7cf74 100644 --- a/src/io/parquet/write/binary/nested.rs +++ b/src/io/parquet/write/binary/nested.rs @@ -27,9 +27,17 @@ where // By slicing the leaf array we also don't write too many values. let (start, len) = slice_nested_leaf(nested); + let mut nested = nested.to_vec(); + let array = array.slice(start, len); + if let Some(Nested::Primitive(_, _, c)) = nested.last_mut() { + *c = len; + } else { + unreachable!("") + } + let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = - nested::write_rep_and_def(options.version, nested, &mut buffer, start)?; + nested::write_rep_and_def(options.version, &nested, &mut buffer)?; let array = array.slice(start, len); encode_plain(&array, is_optional, &mut buffer); @@ -42,7 +50,7 @@ where utils::build_plain_page( buffer, - nested::num_values(nested), + nested::num_values(&nested), nested[0].len(), array.null_count(), repetition_levels_byte_length, diff --git a/src/io/parquet/write/boolean/nested.rs b/src/io/parquet/write/boolean/nested.rs index 2b8d430ac4d..0e6ce9f5718 100644 --- a/src/io/parquet/write/boolean/nested.rs +++ b/src/io/parquet/write/boolean/nested.rs @@ -23,10 +23,18 @@ pub fn array_to_page( // By slicing the leaf array we also don't write too many values. let (start, len) = slice_nested_leaf(nested); + let mut nested = nested.to_vec(); + let array = array.slice(start, len); + if let Some(Nested::Primitive(_, _, c)) = nested.last_mut() { + *c = len; + } else { + unreachable!("") + } + let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = - nested::write_rep_and_def(options.version, nested, &mut buffer, start)?; - let array = array.slice(start, len); + nested::write_rep_and_def(options.version, &nested, &mut buffer)?; + encode_plain(&array, is_optional, &mut buffer)?; let statistics = if options.write_statistics { @@ -37,7 +45,7 @@ pub fn array_to_page( utils::build_plain_page( buffer, - nested::num_values(nested), + nested::num_values(&nested), nested[0].len(), array.null_count(), repetition_levels_byte_length, diff --git a/src/io/parquet/write/dictionary.rs b/src/io/parquet/write/dictionary.rs index 38f4e2113cf..8cbe6c766e6 100644 --- a/src/io/parquet/write/dictionary.rs +++ b/src/io/parquet/write/dictionary.rs @@ -75,7 +75,6 @@ fn serialize_levels( nested: &[Nested], options: WriteOptions, buffer: &mut Vec, - offset: usize, ) -> Result<(usize, usize)> { if nested.len() == 1 { let is_optional = is_nullable(&type_.field_info); @@ -83,7 +82,7 @@ fn serialize_levels( let definition_levels_byte_length = buffer.len(); Ok((0, definition_levels_byte_length)) } else { - nested::write_rep_and_def(options.version, nested, buffer, offset) + nested::write_rep_and_def(options.version, nested, buffer) } } @@ -115,23 +114,29 @@ fn serialize_keys( let validity = normalized_validity(array); let (start, len) = slice_nested_leaf(nested); + let mut nested = nested.to_vec(); + let array = array.slice(start, len); + if let Some(Nested::Primitive(_, _, c)) = nested.last_mut() { + *c = len; + } else { + unreachable!("") + } + let (repetition_levels_byte_length, definition_levels_byte_length) = serialize_levels( validity.as_ref(), array.len(), &type_, - nested, + &nested, options, &mut buffer, - start, )?; - let array = array.slice(start, len); serialize_keys_values(&array, validity.as_ref(), &mut buffer)?; let (num_values, num_rows) = if nested.len() == 1 { (array.len(), array.len()) } else { - (nested::num_values(nested), nested[0].len()) + (nested::num_values(&nested), nested[0].len()) }; utils::build_plain_page( diff --git a/src/io/parquet/write/nested/def.rs b/src/io/parquet/write/nested/def.rs index 395f73b913e..2b958cfcb9a 100644 --- a/src/io/parquet/write/nested/def.rs +++ b/src/io/parquet/write/nested/def.rs @@ -28,20 +28,22 @@ fn single_iter<'a>( fn single_list_iter<'a, O: Offset>(nested: &ListNested<'a, O>) -> Box { match (nested.is_optional, nested.validity) { - (false, _) => { - Box::new(std::iter::repeat(1u32).zip(to_length(nested.offsets))) as Box - } - (true, None) => { - Box::new(std::iter::repeat(2u32).zip(to_length(nested.offsets))) as Box - } + (false, _) => Box::new( + std::iter::repeat(0u32) + .zip(to_length(nested.offsets)) + .map(|(a, b)| (a + (b != 0) as u32, b)), + ) as Box, + (true, None) => Box::new( + std::iter::repeat(1u32) + .zip(to_length(nested.offsets)) + .map(|(a, b)| (a + (b != 0) as u32, b)), + ) as Box, (true, Some(validity)) => Box::new( validity .iter() - // lists have 2 groups, so - // True => 2 - // False => 1 - .map(|x| (x as u32) + 1) - .zip(to_length(nested.offsets)), + .map(|x| (x as u32)) + .zip(to_length(nested.offsets)) + .map(|(a, b)| (a + (b != 0) as u32, b)), ) as Box, } } @@ -67,7 +69,6 @@ fn iter<'a>(nested: &'a [Nested]) -> Vec> { pub struct DefLevelsIter<'a> { // iterators of validities and lengths. E.g. [[[None,b,c], None], None] -> [[(true, 2), (false, 0)], [(true, 3), (false, 0)], [(false, 1), (true, 1), (true, 1)]] iter: Vec>, - primitive_validity: Box, // vector containing the remaining number of values of each iterator. // e.g. the iters [[2, 2], [3, 4, 1, 2]] after the first iteration will return [2, 3], // and remaining will be [2, 3]. @@ -86,21 +87,15 @@ pub struct DefLevelsIter<'a> { } impl<'a> DefLevelsIter<'a> { - pub fn new(nested: &'a [Nested], offset: usize) -> Self { + pub fn new(nested: &'a [Nested]) -> Self { let remaining_values = num_values(nested); - let mut primitive_validity = iter(&nested[nested.len() - 1..]).pop().unwrap(); - if offset > 0 { - primitive_validity.nth(offset - 1); - } - - let iter = iter(&nested[..nested.len() - 1]); - let remaining = std::iter::repeat(0).take(iter.len()).collect(); - let validity = std::iter::repeat(0).take(iter.len()).collect(); + let iter = iter(nested); + let remaining = vec![0; iter.len()]; + let validity = vec![0; iter.len()]; Self { iter, - primitive_validity, remaining, validity, total: 0, @@ -114,28 +109,16 @@ impl<'a> Iterator for DefLevelsIter<'a> { type Item = u32; fn next(&mut self) -> Option { - if *self.remaining.last().unwrap() > 0 { - *self.remaining.last_mut().unwrap() -= 1; - - let primitive = self.primitive_validity.next()?.0; - let r = Some(self.total + primitive); - - for level in 0..self.current_level - 1 { - let level = self.remaining.len() - level - 1; - if self.remaining[level] == 0 { - self.current_level -= 1; - self.total -= self.validity[level]; - self.remaining[level.saturating_sub(1)] -= 1; - } - } - if self.remaining[0] == 0 { - self.current_level -= 1; - self.total -= self.validity[0]; - } + if self.remaining_values == 0 { + return None; + } + + if self.remaining.is_empty() { self.remaining_values -= 1; - return r; + return Some(0); } + let mut empty_contrib = 0u32; for ((iter, remaining), validity) in self .iter .iter_mut() @@ -145,15 +128,38 @@ impl<'a> Iterator for DefLevelsIter<'a> { { let (is_valid, length): (u32, usize) = iter.next()?; *validity = is_valid; + self.total += is_valid; + + *remaining = length; if length == 0 { - self.remaining_values -= 1; - return Some(self.total + is_valid / 2); + *validity = 0; + self.total -= is_valid; + empty_contrib = is_valid; + break; } - *remaining = length; self.current_level += 1; - self.total += is_valid; } - self.next() + + // track + if let Some(x) = self.remaining.get_mut(self.current_level.saturating_sub(1)) { + *x = x.saturating_sub(1) + } + + let r = Some(self.total + empty_contrib); + + for index in (1..self.current_level).rev() { + if self.remaining[index] == 0 { + self.current_level -= 1; + self.remaining[index - 1] -= 1; + self.total -= self.validity[index]; + } + } + if self.remaining[0] == 0 { + self.current_level = self.current_level.saturating_sub(1); + self.total -= self.validity[0]; + } + self.remaining_values -= 1; + r } fn size_hint(&self) -> (usize, Option) { @@ -167,7 +173,7 @@ mod tests { use super::*; fn test(nested: Vec, expected: Vec) { - let mut iter = DefLevelsIter::new(&nested, 0); + let mut iter = DefLevelsIter::new(&nested); assert_eq!(iter.size_hint().0, expected.len()); let result = iter.by_ref().collect::>(); assert_eq!(result, expected); @@ -176,9 +182,10 @@ mod tests { #[test] fn struct_optional() { - let b = Bitmap::from([ + let b = [ true, false, true, true, false, true, false, false, true, true, - ]); + ] + .into(); let nested = vec![ Nested::Struct(None, true, 10), Nested::Primitive(Some(&b), true, 10), @@ -188,11 +195,27 @@ mod tests { test(nested, expected) } + #[test] + fn nested_edge_simple() { + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2], + validity: None, + }), + Nested::Primitive(None, true, 2), + ]; + let expected = vec![3, 3]; + + test(nested, expected) + } + #[test] fn struct_optional_1() { - let b = Bitmap::from([ + let b = [ true, false, true, true, false, true, false, false, true, true, - ]); + ] + .into(); let nested = vec![ Nested::Struct(None, true, 10), Nested::Primitive(Some(&b), true, 10), @@ -217,7 +240,7 @@ mod tests { fn l1_required_required() { let nested = vec![ // [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], validity: None, @@ -233,16 +256,17 @@ mod tests { fn l1_optional_optional() { // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] - let v0 = Bitmap::from([true, false, true, true, true, true, false, true]); - let v1 = Bitmap::from([ + let v0 = [true, false, true, true, true, true, false, true].into(); + let v1 = [ true, true, //[0, 1] true, false, true, //[2, None, 3] true, true, true, //[4, 5, 6] true, true, true, //[7, 8, 9] true, //[10] - ]); + ] + .into(); let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], validity: Some(&v0), @@ -256,18 +280,30 @@ mod tests { #[test] fn l2_required_required_required() { + /* + [ + [ + [1,2,3], + [4,5,6,7], + ], + [ + [8], + [9, 10] + ] + ] + */ let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 2, 4], validity: None, }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 3, 7, 8, 10], validity: None, }), - Nested::Primitive(None, false, 12), + Nested::Primitive(None, false, 10), ]; let expected = vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2]; @@ -276,20 +312,33 @@ mod tests { #[test] fn l2_optional_required_required() { - let a = Bitmap::from([true, false, true, true]); - // e.g. [[[1,2,3], [4,5,6,7]], None, [[8], [], [9, 10]]] + let a = [true, false, true, true].into(); + /* + [ + [ + [1,2,3], + [4,5,6,7], + ], + None, + [ + [8], + [], + [9, 10] + ] + ] + */ let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 2, 2, 2, 5], validity: Some(&a), }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 3, 7, 8, 8, 10], validity: None, }), - Nested::Primitive(None, false, 12), + Nested::Primitive(None, false, 10), ]; let expected = vec![3, 3, 3, 3, 3, 3, 3, 0, 1, 3, 2, 3, 3]; @@ -298,21 +347,34 @@ mod tests { #[test] fn l2_optional_optional_required() { - let a = Bitmap::from([true, false, true]); - let b = Bitmap::from([true, true, true, true, false]); - // e.g. [[[1,2,3], [4,5,6,7]], None, [[8], [], None]] + let a = [true, false, true].into(); + let b = [true, true, true, true, false].into(); + /* + [ + [ + [1,2,3], + [4,5,6,7], + ], + None, + [ + [8], + [], + None, + ], + ] + */ let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 2, 2, 5], validity: Some(&a), }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 3, 7, 8, 8, 8], validity: Some(&b), }), - Nested::Primitive(None, false, 12), + Nested::Primitive(None, false, 8), ]; let expected = vec![4, 4, 4, 4, 4, 4, 4, 0, 4, 3, 2]; @@ -321,25 +383,208 @@ mod tests { #[test] fn l2_optional_optional_optional() { - let a = Bitmap::from([true, false, true]); - let b = Bitmap::from([true, true, true, false]); - let c = Bitmap::from([true, true, true, true, false, true, true, true]); - // e.g. [[[1,2,3], [4,None,6,7]], None, [[8], None]] + let a = [true, false, true].into(); + let b = [true, true, true, false].into(); + let c = [true, true, true, true, false, true, true, true].into(); + /* + [ + [ + [1,2,3], + [4,None,6,7], + ], + None, + [ + [8], + None, + ], + ] + */ let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 2, 2, 4], validity: Some(&a), }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 3, 7, 8, 8], validity: Some(&b), }), - Nested::Primitive(Some(&c), true, 12), + Nested::Primitive(Some(&c), true, 8), ]; let expected = vec![5, 5, 5, 5, 4, 5, 5, 0, 5, 2]; test(nested, expected) } + + /* + [{"a": "a"}, {"a": "b"}], + None, + [{"a": "b"}, None, {"a": "b"}], + [{"a": None}, {"a": None}, {"a": None}], + [], + [{"a": "d"}, {"a": "d"}, {"a": "d"}], + None, + [{"a": "e"}], + */ + #[test] + fn nested_list_struct_nullable() { + let a = [ + true, true, true, false, true, false, false, false, true, true, true, true, + ] + .into(); + let b = [ + true, true, true, false, true, true, true, true, true, true, true, true, + ] + .into(); + let c = [true, false, true, true, true, true, false, true].into(); + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + validity: Some(&c), + }), + Nested::Struct(Some(&b), true, 12), + Nested::Primitive(Some(&a), true, 12), + ]; + let expected = vec![4, 4, 0, 4, 2, 4, 3, 3, 3, 1, 4, 4, 4, 0, 4]; + + test(nested, expected) + } + + #[test] + fn nested_list_struct_nullable1() { + let c = [true, false].into(); + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 1], + validity: Some(&c), + }), + Nested::Struct(None, true, 1), + Nested::Primitive(None, true, 1), + ]; + let expected = vec![4, 0]; + + test(nested, expected) + } + + #[test] + fn nested_struct_list_nullable() { + let a = [true, false, true, true, true, true, false, true].into(); + let b = [ + true, true, true, false, true, true, true, true, true, true, true, true, + ] + .into(); + let nested = vec![ + Nested::Struct(None, true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + validity: Some(&a), + }), + Nested::Primitive(Some(&b), true, 12), + ]; + let expected = vec![4, 4, 1, 4, 3, 4, 4, 4, 4, 2, 4, 4, 4, 1, 4]; + + test(nested, expected) + } + + #[test] + fn nested_struct_list_nullable1() { + let a = [true, true, false].into(); + let nested = vec![ + Nested::Struct(None, true, 3), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 1, 1], + validity: Some(&a), + }), + Nested::Primitive(None, true, 1), + ]; + let expected = vec![4, 2, 1]; + + test(nested, expected) + } + + #[test] + fn nested_list_struct_list_nullable1() { + /* + [ + [{"a": ["b"]}, None], + ] + */ + + let a = [true].into(); + let b = [true, false].into(); + let c = [true, false].into(); + let d = [true].into(); + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2], + validity: Some(&a), + }), + Nested::Struct(Some(&b), true, 2), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 1], + validity: Some(&c), + }), + Nested::Primitive(Some(&d), true, 1), + ]; + /* + 0 6 + 1 6 + 0 0 + 0 6 + 1 2 + */ + let expected = vec![6, 2]; + + test(nested, expected) + } + + #[test] + fn nested_list_struct_list_nullable() { + /* + [ + [{"a": ["a"]}, {"a": ["b"]}], + None, + [{"a": ["b"]}, None, {"a": ["b"]}], + [{"a": None}, {"a": None}, {"a": None}], + [], + [{"a": ["d"]}, {"a": [None]}, {"a": ["c", "d"]}], + None, + [{"a": []}], + ] + */ + let a = [true, false, true, true, true, true, false, true].into(); + let b = [ + true, true, true, false, true, true, true, true, true, true, true, true, + ] + .into(); + let c = [ + true, true, true, false, true, false, false, false, true, true, true, true, + ] + .into(); + let d = [true, true, true, true, true, false, true, true].into(); + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + validity: Some(&a), + }), + Nested::Struct(Some(&b), true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8, 8], + validity: Some(&c), + }), + Nested::Primitive(Some(&d), true, 8), + ]; + let expected = vec![6, 6, 0, 6, 2, 6, 3, 3, 3, 1, 6, 5, 6, 6, 0, 4]; + + test(nested, expected) + } } diff --git a/src/io/parquet/write/nested/mod.rs b/src/io/parquet/write/nested/mod.rs index 3629f0a6220..49951ec7fb1 100644 --- a/src/io/parquet/write/nested/mod.rs +++ b/src/io/parquet/write/nested/mod.rs @@ -53,19 +53,14 @@ fn write_rep_levels(buffer: &mut Vec, nested: &[Nested], version: Version) - } /// writes the rep levels to a `Vec`. -fn write_def_levels( - buffer: &mut Vec, - nested: &[Nested], - version: Version, - offset: usize, -) -> Result<()> { +fn write_def_levels(buffer: &mut Vec, nested: &[Nested], version: Version) -> Result<()> { let max_level = max_def_level(nested) as i16; if max_level == 0 { return Ok(()); } let num_bits = get_bit_width(max_level); - let levels = def::DefLevelsIter::new(nested, offset); + let levels = def::DefLevelsIter::new(nested); match version { Version::V1 => write_levels_v1(buffer, move |buffer: &mut Vec| { @@ -111,14 +106,11 @@ pub fn write_rep_and_def( page_version: Version, nested: &[Nested], buffer: &mut Vec, - // needed to take offset the validity iterator in case - // the list was sliced. - offset: usize, ) -> Result<(usize, usize)> { write_rep_levels(buffer, nested, page_version)?; let repetition_levels_byte_length = buffer.len(); - write_def_levels(buffer, nested, page_version, offset)?; + write_def_levels(buffer, nested, page_version)?; let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length; Ok((repetition_levels_byte_length, definition_levels_byte_length)) diff --git a/src/io/parquet/write/nested/rep.rs b/src/io/parquet/write/nested/rep.rs index c7c5d15eb8e..c8dc59e7fdc 100644 --- a/src/io/parquet/write/nested/rep.rs +++ b/src/io/parquet/write/nested/rep.rs @@ -8,45 +8,33 @@ impl + std::fmt::Debug> DebugIter for A {} fn iter<'a>(nested: &'a [Nested]) -> Vec> { nested .iter() - .enumerate() - .filter_map(|(i, nested)| match nested { + .filter_map(|nested| match nested { Nested::Primitive(_, _, _) => None, Nested::List(nested) => Some(Box::new(to_length(nested.offsets)) as Box), Nested::LargeList(nested) => { Some(Box::new(to_length(nested.offsets)) as Box) } - Nested::Struct(_, _, length) => { - // only return 1, 1, 1, (x len) if struct is outer structure. - // otherwise treat as leaf - if i == 0 { - Some(Box::new(std::iter::repeat(1usize).take(*length)) as Box) - } else { - None - } - } + Nested::Struct(_, _, _) => None, }) .collect() } pub fn num_values(nested: &[Nested]) -> usize { - let iterators = iter(nested); - let depth = iterators.len(); + let pr = match nested.last().unwrap() { + Nested::Primitive(_, _, len) => *len, + _ => todo!(), + }; - iterators + iter(nested) .into_iter() .enumerate() - .map(|(index, lengths)| { - if index == depth - 1 { - lengths - .map(|length| if length == 0 { 1 } else { length }) - .sum::() - } else { - lengths - .map(|length| usize::from(length == 0)) - .sum::() - } + .map(|(_, lengths)| { + lengths + .map(|length| if length == 0 { 1 } else { 0 }) + .sum::() }) - .sum() + .sum::() + + pr } /// Iterator adapter of parquet / dremel repetition levels @@ -76,7 +64,7 @@ impl<'a> RepLevelsIter<'a> { let remaining_values = num_values(nested); let iter = iter(nested); - let remaining = std::iter::repeat(0).take(iter.len()).collect(); + let remaining = vec![0; iter.len()]; Self { iter, @@ -92,28 +80,14 @@ impl<'a> Iterator for RepLevelsIter<'a> { type Item = u32; fn next(&mut self) -> Option { - if *self.remaining.last().unwrap() > 0 { - *self.remaining.last_mut().unwrap() -= 1; - - let total = self.total; - self.total = 0; - let r = Some((self.current_level - total) as u32); - - for level in 0..self.current_level - 1 { - let level = self.remaining.len() - level - 1; - if self.remaining[level] == 0 { - self.current_level -= 1; - self.remaining[level.saturating_sub(1)] -= 1; - } - } - if self.remaining[0] == 0 { - self.current_level -= 1; - } + if self.remaining_values == 0 { + return None; + } + if self.remaining.is_empty() { self.remaining_values -= 1; - return r; + return Some(0); } - self.total = 0; for (iter, remaining) in self .iter .iter_mut() @@ -121,15 +95,34 @@ impl<'a> Iterator for RepLevelsIter<'a> { .skip(self.current_level) { let length: usize = iter.next()?; + *remaining = length; if length == 0 { - self.remaining_values -= 1; - return Some(self.current_level as u32); + break; } - *remaining = length; self.current_level += 1; self.total += 1; } - self.next() + + // track + if let Some(x) = self.remaining.get_mut(self.current_level.saturating_sub(1)) { + *x = x.saturating_sub(1) + } + let r = Some((self.current_level - self.total) as u32); + + // update + for index in (1..self.current_level).rev() { + if self.remaining[index] == 0 { + self.current_level -= 1; + self.remaining[index - 1] -= 1; + } + } + if self.remaining[0] == 0 { + self.current_level = self.current_level.saturating_sub(1); + } + self.total = 0; + self.remaining_values -= 1; + + r } fn size_hint(&self) -> (usize, Option) { @@ -147,8 +140,7 @@ mod tests { fn test(nested: Vec, expected: Vec) { let mut iter = RepLevelsIter::new(&nested); assert_eq!(iter.size_hint().0, expected.len()); - let result = iter.by_ref().collect::>(); - assert_eq!(result, expected); + assert_eq!(iter.by_ref().collect::>(), expected); assert_eq!(iter.size_hint().0, 0); } @@ -177,14 +169,13 @@ mod tests { #[test] fn l1() { let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], validity: None, }), Nested::Primitive(None, false, 12), ]; - let expected = vec![0u32, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0]; test(nested, expected) @@ -193,12 +184,12 @@ mod tests { #[test] fn l2() { let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 2, 2, 4], validity: None, }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 3, 7, 8, 10], validity: None, @@ -220,7 +211,7 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0i32, 1, 2], + offsets: &[0, 1, 2], validity: None, }), Nested::Struct(None, true, 2), @@ -236,13 +227,13 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0i32, 2, 3], + offsets: &[0, 2, 3], validity: None, }), Nested::Struct(None, true, 3), Nested::List(ListNested { is_optional: true, - offsets: &[0i32, 3, 6, 7], + offsets: &[0, 3, 6, 7], validity: None, }), Nested::Primitive(None, true, 7), @@ -251,4 +242,127 @@ mod tests { test(nested, expected) } + + #[test] + fn struct_list_optional() { + /* + {"f1": ["a", "b", None, "c"]} + */ + let nested = vec![ + Nested::Struct(None, true, 1), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 4], + validity: None, + }), + Nested::Primitive(None, true, 4), + ]; + let expected = vec![0, 1, 1, 1]; + + test(nested, expected) + } + + #[test] + fn l2_other() { + let nested = vec![ + Nested::List(ListNested { + is_optional: false, + offsets: &[0, 1, 1, 3, 5, 5, 8, 8, 9], + validity: None, + }), + Nested::List(ListNested { + is_optional: false, + offsets: &[0, 2, 4, 5, 7, 8, 9, 10, 11, 12], + validity: None, + }), + Nested::Primitive(None, false, 12), + ]; + let expected = vec![0, 2, 0, 0, 2, 1, 0, 2, 1, 0, 0, 1, 1, 0, 0]; + + test(nested, expected) + } + + #[test] + fn list_struct_list_1() { + /* + [ + [{"a": ["a"]}, {"a": ["b"]}], + [], + [{"a": ["b"]}, None, {"a": ["b"]}], + [{"a": []}, {"a": []}, {"a": []}], + [], + [{"a": ["d"]}, {"a": ["a"]}, {"a": ["c", "d"]}], + [], + [{"a": []}], + ] + // reps: [0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0] + */ + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + validity: None, + }), + Nested::Struct(None, true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8], + validity: None, + }), + Nested::Primitive(None, true, 8), + ]; + let expected = vec![0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 2, 0]; + + test(nested, expected) + } + + #[test] + fn list_struct_list_2() { + /* + [ + [{"a": []}], + ] + // reps: [0] + */ + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1], + validity: None, + }), + Nested::Struct(None, true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 0], + validity: None, + }), + Nested::Primitive(None, true, 0), + ]; + let expected = vec![0]; + + test(nested, expected) + } + + #[test] + fn list_struct_list_3() { + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 1], + validity: None, + }), + Nested::Struct(None, true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 0], + validity: None, + }), + Nested::Primitive(None, true, 0), + ]; + let expected = vec![0, 0]; + // [1, 0], [0] + // pick last + + test(nested, expected) + } } diff --git a/src/io/parquet/write/primitive/nested.rs b/src/io/parquet/write/primitive/nested.rs index 1a9a8cf7d75..6c7bf59f5a2 100644 --- a/src/io/parquet/write/primitive/nested.rs +++ b/src/io/parquet/write/primitive/nested.rs @@ -34,10 +34,17 @@ where // By slicing the leaf array we also don't write too many values. let (start, len) = slice_nested_leaf(nested); + let mut nested = nested.to_vec(); + let array = array.slice(start, len); + if let Some(Nested::Primitive(_, _, c)) = nested.last_mut() { + *c = len; + } else { + unreachable!("") + } + let (repetition_levels_byte_length, definition_levels_byte_length) = - nested::write_rep_and_def(options.version, nested, &mut buffer, start)?; + nested::write_rep_and_def(options.version, &nested, &mut buffer)?; - let array = array.slice(start, len); let buffer = encode_plain(&array, is_optional, buffer); let statistics = if options.write_statistics { @@ -51,7 +58,7 @@ where utils::build_plain_page( buffer, - nested::num_values(nested), + nested::num_values(&nested), nested[0].len(), array.null_count(), repetition_levels_byte_length, diff --git a/src/io/parquet/write/utf8/nested.rs b/src/io/parquet/write/utf8/nested.rs index 7e8d018c957..1054de6e311 100644 --- a/src/io/parquet/write/utf8/nested.rs +++ b/src/io/parquet/write/utf8/nested.rs @@ -26,11 +26,18 @@ where // By slicing the leaf array we also don't write too many values. let (start, len) = slice_nested_leaf(nested); + let mut nested = nested.to_vec(); + let array = array.slice(start, len); + if let Some(Nested::Primitive(_, _, c)) = nested.last_mut() { + *c = len; + } else { + unreachable!("") + } + let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = - nested::write_rep_and_def(options.version, nested, &mut buffer, start)?; + nested::write_rep_and_def(options.version, &nested, &mut buffer)?; - let array = array.slice(start, len); encode_plain(&array, is_optional, &mut buffer); let statistics = if options.write_statistics { @@ -41,7 +48,7 @@ where utils::build_plain_page( buffer, - nested::num_values(nested), + nested::num_values(&nested), nested[0].len(), array.null_count(), repetition_levels_byte_length, diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 70173101761..f42aa6be36f 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -22,6 +22,19 @@ mod write_async; type ArrayStats = (Box, Statistics); +fn new_struct( + arrays: Vec>, + names: Vec, + validity: Option, +) -> StructArray { + let fields = names + .into_iter() + .zip(arrays.iter()) + .map(|(n, a)| Field::new(n, a.data_type().clone(), true)) + .collect(); + StructArray::new(DataType::Struct(fields), arrays, validity) +} + pub fn read_column(mut reader: R, column: &str) -> Result { let metadata = p_read::read_metadata(&mut reader)?; let schema = p_read::infer_schema(&metadata)?; @@ -70,8 +83,18 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { let array: ListArray = a.into(); Box::new(array) } + "empty" => { + // [None] + let data: [Option>>; 0] = []; + let mut a = MutableListArray::>::new(); + a.try_extend(data).unwrap(); + let array: ListArray = a.into(); + Box::new(array) + } "struct_list_nullable" => { - // [["a", "b", None, "c"]] + // [ + // {"f1": ["a", "b", None, "c"]} + // ] let a = ListArray::::new( DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), vec![0, 4].try_into().unwrap(), @@ -85,6 +108,20 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { ) .boxed() } + "list_struct_list_nullable" => { + let values = pyarrow_nested_edge("struct_list_nullable"); + ListArray::::new( + DataType::List(Box::new(Field::new( + "item", + values.data_type().clone(), + true, + ))), + vec![0, 1].try_into().unwrap(), + values, + None, + ) + .boxed() + } _ => todo!(), } } @@ -171,7 +208,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { [""], ] */ - "list_utf8" => Box::new(Utf8Array::::from([ + "list_utf8" => Utf8Array::::from([ Some("Hello".to_string()), Some("bbb".to_string()), Some("aa".to_string()), @@ -184,7 +221,8 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { Some("bbb".to_string()), Some("bbb".to_string()), Some("".to_string()), - ])), + ]) + .boxed(), "list_large_binary" => Box::new(BinaryArray::::from([ Some(b"Hello".to_vec()), Some(b"bbb".to_vec()), @@ -202,6 +240,105 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "list_nested_i64" | "list_nested_inner_required_i64" | "list_nested_inner_required_required_i64" => Box::new(NullArray::new(DataType::Null, 1)), + "struct_list_nullable" => pyarrow_nested_nullable("list_utf8"), + "list_struct_nullable" => { + let array = Utf8Array::::from([ + Some("a"), + Some("b"), + // + Some("b"), + None, + Some("b"), + // + None, + None, + None, + // + Some("d"), + Some("d"), + Some("d"), + // + Some("e"), + ]) + .boxed(); + new_struct( + vec![array], + vec!["a".to_string()], + Some( + [ + true, true, // + true, false, true, // + true, true, true, // + true, true, true, // + true, + ] + .into(), + ), + ) + .boxed() + } + "list_struct_list_nullable" => { + /* + [ + [{"a": ["a"]}, {"a": ["b"]}], + None, + [{"a": ["b"]}, None, {"a": ["b"]}], + [{"a": None}, {"a": None}, {"a": None}], + [], + [{"a": ["d"]}, {"a": [None]}, {"a": ["c", "d"]}], + None, + [{"a": []}], + ] + */ + let array = Utf8Array::::from([ + Some("a"), + Some("b"), + // + Some("b"), + Some("b"), + // + Some("d"), + None, + Some("c"), + Some("d"), + ]) + .boxed(); + + let array = ListArray::::new( + DataType::List(Box::new(Field::new( + "item", + array.data_type().clone(), + true, + ))), + vec![0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8, 8] + .try_into() + .unwrap(), + array, + Some( + [ + true, true, true, false, true, false, false, false, true, true, true, true, + ] + .into(), + ), + ) + .boxed(); + + new_struct( + vec![array], + vec!["a".to_string()], + Some( + [ + true, true, // + true, false, true, // + true, true, true, // + true, true, true, // + true, + ] + .into(), + ), + ) + .boxed() + } other => unreachable!("{}", other), }; @@ -272,6 +409,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { let array: ListArray = a.into(); Box::new(array) } + "struct_list_nullable" => new_struct(vec![values], vec!["a".to_string()], None).boxed(), _ => { let field = match column { "list_int64" => Field::new("item", DataType::Int64, true), @@ -280,6 +418,8 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "list_bool" => Field::new("item", DataType::Boolean, true), "list_utf8" => Field::new("item", DataType::Utf8, true), "list_large_binary" => Field::new("item", DataType::LargeBinary, true), + "list_struct_nullable" => Field::new("item", values.data_type().clone(), true), + "list_struct_list_nullable" => Field::new("item", values.data_type().clone(), true), other => unreachable!("{}", other), }; @@ -664,6 +804,116 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { ) .boxed(), }, + "list_struct_nullable" => Statistics { + distinct_count: new_list( + new_struct( + vec![UInt64Array::from([None]).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + null_count: new_list( + new_struct( + vec![UInt64Array::from([Some(4)]).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + min_value: new_list( + new_struct( + vec![Utf8Array::::from_slice(["a"]).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + max_value: new_list( + new_struct( + vec![Utf8Array::::from_slice(["e"]).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + }, + "list_struct_list_nullable" => Statistics { + distinct_count: new_list( + new_struct( + vec![new_list(UInt64Array::from([None]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + null_count: new_list( + new_struct( + vec![new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + min_value: new_list( + new_struct( + vec![new_list(Utf8Array::::from_slice(["a"]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + max_value: new_list( + new_struct( + vec![new_list(Utf8Array::::from_slice(["d"]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + }, + "struct_list_nullable" => Statistics { + distinct_count: new_struct( + vec![new_list(UInt64Array::from([None]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + null_count: new_struct( + vec![new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + min_value: new_struct( + vec![new_list(Utf8Array::::from_slice([""]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + max_value: new_struct( + vec![new_list(Utf8Array::::from_slice(["ccc"]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + }, other => todo!("{}", other), } } @@ -700,9 +950,9 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { min_value: new_list(Box::new(Int64Array::from([Some(0)]))).boxed(), max_value: new_list(Box::new(Int64Array::from([Some(1)]))).boxed(), }, - "null" => Statistics { + "null" | "empty" => Statistics { distinct_count: new_list(UInt64Array::from([None]).boxed()).boxed(), - null_count: new_list(UInt64Array::from([Some(1)]).boxed()).boxed(), + null_count: new_list(UInt64Array::from([Some(0)]).boxed()).boxed(), min_value: new_list(Box::new(Int64Array::from([None]))).boxed(), max_value: new_list(Box::new(Int64Array::from([None]))).boxed(), }, @@ -726,6 +976,34 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { names, )), }, + "list_struct_list_nullable" => Statistics { + distinct_count: new_list( + new_struct( + vec![new_list(Box::new(UInt64Array::from([None]))).boxed()], + names.clone(), + ) + .boxed(), + ) + .boxed(), + null_count: new_list( + new_struct( + vec![new_list(Box::new(UInt64Array::from([Some(1)]))).boxed()], + names.clone(), + ) + .boxed(), + ) + .boxed(), + min_value: new_list(Box::new(new_struct( + vec![new_list(Box::new(Utf8Array::::from_slice(["a"]))).boxed()], + names.clone(), + ))) + .boxed(), + max_value: new_list(Box::new(new_struct( + vec![new_list(Box::new(Utf8Array::::from_slice(["c"]))).boxed()], + names, + ))) + .boxed(), + }, _ => unreachable!(), } } @@ -798,14 +1076,8 @@ pub fn pyarrow_struct(column: &str) -> Box { } pub fn pyarrow_struct_statistics(column: &str) -> Statistics { - let new_struct = |arrays: Vec>, names: Vec| { - let fields = names - .into_iter() - .zip(arrays.iter()) - .map(|(n, a)| Field::new(n, a.data_type().clone(), true)) - .collect(); - StructArray::new(DataType::Struct(fields), arrays, None) - }; + let new_struct = + |arrays: Vec>, names: Vec| new_struct(arrays, names, None); let names = vec!["f1".to_string(), "f2".to_string()]; diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index a07c394bc4c..93a9e428185 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -64,7 +64,12 @@ fn test_pyarrow_integration( "list_bool", "list_nested_inner_required_required_i64", "list_nested_inner_required_i64", - "struct_nullable", // it counts null struct items as nulls + // pyarrow counts null struct items as nulls + "struct_nullable", + "list_struct_nullable", + "list_struct_list_nullable", + "struct_list_nullable", + "null", // pyarrow reports an incorrect min/max for MapArray "map", "map_nullable", @@ -346,6 +351,21 @@ fn v2_nested_nested_required_required() -> Result<()> { ) } +#[test] +fn v1_nested_list_struct_nullable() -> Result<()> { + test_pyarrow_integration("list_struct_nullable", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_struct_list_nullable() -> Result<()> { + test_pyarrow_integration("struct_list_nullable", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_list_struct_list_nullable() -> Result<()> { + test_pyarrow_integration("list_struct_list_nullable", 1, "nested", false, false, None) +} + #[test] fn v1_decimal_9_nullable() -> Result<()> { test_pyarrow_integration("decimal_9", 1, "basic", false, false, None) @@ -467,20 +487,32 @@ fn v1_struct_struct_optional() -> Result<()> { } #[test] -fn v1_nested_edge_1() -> Result<()> { +fn v1_nested_edge_simple() -> Result<()> { test_pyarrow_integration("simple", 1, "nested_edge", false, false, None) } #[test] -fn v1_nested_edge_2() -> Result<()> { +fn v1_nested_edge_null() -> Result<()> { test_pyarrow_integration("null", 1, "nested_edge", false, false, None) } #[test] -fn v1_nested_edge_3() -> Result<()> { +fn v1_nested_edge_struct_list_nullable() -> Result<()> { test_pyarrow_integration("struct_list_nullable", 1, "nested_edge", false, false, None) } +#[test] +fn v1_nested_edge_list_struct_list_nullable() -> Result<()> { + test_pyarrow_integration( + "list_struct_list_nullable", + 1, + "nested_edge", + false, + false, + None, + ) +} + #[test] fn v1_map() -> Result<()> { test_pyarrow_integration("map", 1, "map", false, true, None) diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index ef5a18888d2..d110298bc68 100644 --- a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -37,6 +37,10 @@ fn round_trip_opt_stats( pyarrow_required_statistics(column), ), "struct" => (pyarrow_struct(column), pyarrow_struct_statistics(column)), + "nested_edge" => ( + pyarrow_nested_edge(column), + pyarrow_nested_edge_statistics(column), + ), _ => unreachable!(), }; @@ -386,6 +390,42 @@ fn list_nested_inner_required_required_i64() -> Result<()> { ) } +#[test] +fn list_struct_nullable() -> Result<()> { + round_trip_opt_stats( + "list_struct_nullable", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn v1_nested_struct_list_nullable() -> Result<()> { + round_trip_opt_stats( + "struct_list_nullable", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn v1_nested_list_struct_list_nullable() -> Result<()> { + round_trip_opt_stats( + "list_struct_list_nullable", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + #[test] fn utf8_optional_v2_delta() -> Result<()> { round_trip( @@ -585,3 +625,47 @@ fn struct_v2() -> Result<()> { vec![Encoding::Plain, Encoding::Plain], ) } + +#[test] +fn nested_edge_simple() -> Result<()> { + round_trip( + "simple", + "nested_edge", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + ) +} + +#[test] +fn nested_edge_null() -> Result<()> { + round_trip( + "null", + "nested_edge", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + ) +} + +#[test] +fn v1_nested_edge_struct_list_nullable() -> Result<()> { + round_trip( + "struct_list_nullable", + "nested_edge", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + ) +} + +#[test] +fn nested_edge_list_struct_list_nullable() -> Result<()> { + round_trip( + "list_struct_list_nullable", + "nested_edge", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + ) +}