diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index 6d3c97eff87..279567c8643 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -170,6 +170,22 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: [{"a": "e"}], ] + struct_list_nullable = pa.StructArray.from_arrays( + [pa.array(string)], + fields=[("a", pa.list_(pa.utf8()))], + ) + + list_struct_list_nullable = [ + [{"a": ["a"]}, {"a": ["b"]}], + None, + [{"a": ["b"]}, None, {"a": ["b"]}], + [{"a": None}, {"a": None}, {"a": None}], + [], + [{"a": ["d"]}, {"a": [None]}, {"a": ["c", "d"]}], + None, + [{"a": []}], + ] + fields = [ pa.field("list_int64", pa.list_(pa.int64())), pa.field("list_int64_required", pa.list_(pa.field("item", pa.int64(), False))), @@ -196,6 +212,14 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_struct_nullable", pa.list_(pa.struct([("a", pa.utf8())])), ), + pa.field( + "struct_list_nullable", + pa.struct([("a", pa.list_(pa.utf8()))]), + ), + pa.field( + "list_struct_list_nullable", + pa.list_(pa.struct([("a", pa.list_(pa.utf8()))])), + ), ] schema = pa.schema(fields) return ( @@ -212,6 +236,8 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_nested_inner_required_i64": items_required_nested, "list_nested_inner_required_required_i64": items_required_nested_2, "list_struct_nullable": list_struct_nullable, + "struct_list_nullable": struct_list_nullable, + "list_struct_list_nullable": list_struct_list_nullable, }, schema, f"nested_nullable_10.parquet", @@ -263,7 +289,9 @@ def case_struct() -> Tuple[dict, pa.Schema, str]: struct_nullable = pa.StructArray.from_arrays( [pa.array(string), pa.array(boolean)], fields=struct_fields, - mask=pa.array([False, False, True, False, False, False, False, False, False, False]), + mask=pa.array( + [False, False, True, False, False, False, False, False, False, False] + ), ) return ( @@ -277,7 +305,20 @@ def case_struct() -> Tuple[dict, pa.Schema, str]: "struct_struct_nullable": pa.StructArray.from_arrays( [struct, pa.array(boolean)], names=["f1", "f2"], - mask=pa.array([False, False, True, False, False, False, False, False, False, False]), + mask=pa.array( + [ + False, + False, + True, + False, + False, + False, + False, + False, + False, + False, + ] + ), ), }, schema, @@ -288,30 +329,48 @@ def case_struct() -> Tuple[dict, pa.Schema, str]: def case_nested_edge(): simple = [[0, 1]] null = [None] + empty = [[]] struct_list_nullable = pa.StructArray.from_arrays( [pa.array([["a", "b", None, "c"]])], - fields=[ - ("f1", pa.list_(pa.utf8())), - ], + fields=[("f1", pa.list_(pa.utf8()))], ) + list_struct_list_nullable = pa.ListArray.from_arrays([0, 1], struct_list_nullable) + fields = [ pa.field("simple", pa.list_(pa.int64())), pa.field("null", pa.list_(pa.field("item", pa.int64(), True))), + pa.field("empty", pa.list_(pa.field("item", pa.int64(), True))), + pa.field( + "struct_list_nullable", + pa.struct( + [("f1", pa.list_(pa.utf8()))], + ), + ), pa.field( - "struct_list_nullable", - pa.struct([ - ("f1", pa.list_(pa.utf8())), - ]), - ) + "list_struct_list_nullable", + pa.list_( + pa.field( + "item", + pa.struct( + [ + ("f1", pa.list_(pa.utf8())), + ] + ), + True, + ) + ), + ), ] schema = pa.schema(fields) return ( { "simple": simple, "null": null, + "empty": empty, "struct_list_nullable": struct_list_nullable, + "list_struct_list_nullable": list_struct_list_nullable, }, schema, f"nested_edge_nullable_10.parquet", @@ -413,7 +472,7 @@ def case_benches_required(size): # for read benchmarks -for i in range(10, 22, 2): +for i in range(22, 22, 2): # two pages (dict) write_pyarrow(case_benches(2**i), 1, True, False, None) # single page diff --git a/src/io/parquet/write/nested/def.rs b/src/io/parquet/write/nested/def.rs index 395f73b913e..f3cb804feff 100644 --- a/src/io/parquet/write/nested/def.rs +++ b/src/io/parquet/write/nested/def.rs @@ -28,20 +28,22 @@ fn single_iter<'a>( fn single_list_iter<'a, O: Offset>(nested: &ListNested<'a, O>) -> Box { match (nested.is_optional, nested.validity) { - (false, _) => { - Box::new(std::iter::repeat(1u32).zip(to_length(nested.offsets))) as Box - } - (true, None) => { - Box::new(std::iter::repeat(2u32).zip(to_length(nested.offsets))) as Box - } + (false, _) => Box::new( + std::iter::repeat(0u32) + .zip(to_length(nested.offsets)) + .map(|(a, b)| (a + (b != 0) as u32, b)), + ) as Box, + (true, None) => Box::new( + std::iter::repeat(1u32) + .zip(to_length(nested.offsets)) + .map(|(a, b)| (a + (b != 0) as u32, b)), + ) as Box, (true, Some(validity)) => Box::new( validity .iter() - // lists have 2 groups, so - // True => 2 - // False => 1 - .map(|x| (x as u32) + 1) - .zip(to_length(nested.offsets)), + .map(|x| (x as u32)) + .zip(to_length(nested.offsets)) + .map(|(a, b)| (a + (b != 0) as u32, b)), ) as Box, } } @@ -114,28 +116,16 @@ impl<'a> Iterator for DefLevelsIter<'a> { type Item = u32; fn next(&mut self) -> Option { - if *self.remaining.last().unwrap() > 0 { - *self.remaining.last_mut().unwrap() -= 1; - - let primitive = self.primitive_validity.next()?.0; - let r = Some(self.total + primitive); - - for level in 0..self.current_level - 1 { - let level = self.remaining.len() - level - 1; - if self.remaining[level] == 0 { - self.current_level -= 1; - self.total -= self.validity[level]; - self.remaining[level.saturating_sub(1)] -= 1; - } - } - if self.remaining[0] == 0 { - self.current_level -= 1; - self.total -= self.validity[0]; - } + if self.remaining_values == 0 { + return None; + } + + if self.remaining.is_empty() { self.remaining_values -= 1; - return r; + return Some(0); } + let mut empty_contrib = 0u32; for ((iter, remaining), validity) in self .iter .iter_mut() @@ -145,15 +135,44 @@ impl<'a> Iterator for DefLevelsIter<'a> { { let (is_valid, length): (u32, usize) = iter.next()?; *validity = is_valid; + self.total += is_valid; + + *remaining = length; if length == 0 { - self.remaining_values -= 1; - return Some(self.total + is_valid / 2); + *validity = 0; + self.total -= is_valid; + empty_contrib = is_valid; + break; } - *remaining = length; self.current_level += 1; - self.total += is_valid; } - self.next() + + // track + if let Some(x) = self.remaining.get_mut(self.current_level.saturating_sub(1)) { + *x = x.saturating_sub(1) + } + + let primitive = if self.current_level == self.remaining.len() { + self.primitive_validity.next()?.0 + } else { + 0 + }; + let r = Some(self.total + empty_contrib + primitive); + + for level in 0..self.current_level.saturating_sub(1) { + let level = self.remaining.len() - level - 1; + if self.remaining[level] == 0 { + self.current_level -= 1; + self.remaining[level - 1] -= 1; + self.total -= self.validity[level]; + } + } + if self.remaining[0] == 0 { + self.current_level = self.current_level.saturating_sub(1); + self.total -= self.validity[0]; + } + self.remaining_values -= 1; + r } fn size_hint(&self) -> (usize, Option) { @@ -176,9 +195,10 @@ mod tests { #[test] fn struct_optional() { - let b = Bitmap::from([ + let b = [ true, false, true, true, false, true, false, false, true, true, - ]); + ] + .into(); let nested = vec![ Nested::Struct(None, true, 10), Nested::Primitive(Some(&b), true, 10), @@ -188,11 +208,27 @@ mod tests { test(nested, expected) } + #[test] + fn nested_edge_simple() { + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2], + validity: None, + }), + Nested::Primitive(None, true, 2), + ]; + let expected = vec![3, 3]; + + test(nested, expected) + } + #[test] fn struct_optional_1() { - let b = Bitmap::from([ + let b = [ true, false, true, true, false, true, false, false, true, true, - ]); + ] + .into(); let nested = vec![ Nested::Struct(None, true, 10), Nested::Primitive(Some(&b), true, 10), @@ -217,7 +253,7 @@ mod tests { fn l1_required_required() { let nested = vec![ // [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], validity: None, @@ -233,16 +269,17 @@ mod tests { fn l1_optional_optional() { // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] - let v0 = Bitmap::from([true, false, true, true, true, true, false, true]); - let v1 = Bitmap::from([ + let v0 = [true, false, true, true, true, true, false, true].into(); + let v1 = [ true, true, //[0, 1] true, false, true, //[2, None, 3] true, true, true, //[4, 5, 6] true, true, true, //[7, 8, 9] true, //[10] - ]); + ] + .into(); let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], validity: Some(&v0), @@ -256,18 +293,30 @@ mod tests { #[test] fn l2_required_required_required() { + /* + [ + [ + [1,2,3], + [4,5,6,7], + ], + [ + [8], + [9, 10] + ] + ] + */ let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 2, 4], validity: None, }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 3, 7, 8, 10], validity: None, }), - Nested::Primitive(None, false, 12), + Nested::Primitive(None, false, 10), ]; let expected = vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2]; @@ -276,20 +325,33 @@ mod tests { #[test] fn l2_optional_required_required() { - let a = Bitmap::from([true, false, true, true]); - // e.g. [[[1,2,3], [4,5,6,7]], None, [[8], [], [9, 10]]] + let a = [true, false, true, true].into(); + /* + [ + [ + [1,2,3], + [4,5,6,7], + ], + None, + [ + [8], + [], + [9, 10] + ] + ] + */ let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 2, 2, 2, 5], validity: Some(&a), }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 3, 7, 8, 8, 10], validity: None, }), - Nested::Primitive(None, false, 12), + Nested::Primitive(None, false, 10), ]; let expected = vec![3, 3, 3, 3, 3, 3, 3, 0, 1, 3, 2, 3, 3]; @@ -298,21 +360,34 @@ mod tests { #[test] fn l2_optional_optional_required() { - let a = Bitmap::from([true, false, true]); - let b = Bitmap::from([true, true, true, true, false]); - // e.g. [[[1,2,3], [4,5,6,7]], None, [[8], [], None]] + let a = [true, false, true].into(); + let b = [true, true, true, true, false].into(); + /* + [ + [ + [1,2,3], + [4,5,6,7], + ], + None, + [ + [8], + [], + None, + ], + ] + */ let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 2, 2, 5], validity: Some(&a), }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 3, 7, 8, 8, 8], validity: Some(&b), }), - Nested::Primitive(None, false, 12), + Nested::Primitive(None, false, 8), ]; let expected = vec![4, 4, 4, 4, 4, 4, 4, 0, 4, 3, 2]; @@ -321,25 +396,127 @@ mod tests { #[test] fn l2_optional_optional_optional() { - let a = Bitmap::from([true, false, true]); - let b = Bitmap::from([true, true, true, false]); - let c = Bitmap::from([true, true, true, true, false, true, true, true]); - // e.g. [[[1,2,3], [4,None,6,7]], None, [[8], None]] + let a = [true, false, true].into(); + let b = [true, true, true, false].into(); + let c = [true, true, true, true, false, true, true, true].into(); + /* + [ + [ + [1,2,3], + [4,None,6,7], + ], + None, + [ + [8], + None, + ], + ] + */ let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 2, 2, 4], validity: Some(&a), }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: true, offsets: &[0, 3, 7, 8, 8], validity: Some(&b), }), - Nested::Primitive(Some(&c), true, 12), + Nested::Primitive(Some(&c), true, 8), ]; let expected = vec![5, 5, 5, 5, 4, 5, 5, 0, 5, 2]; test(nested, expected) } + + /* + [{"a": "a"}, {"a": "b"}], + None, + [{"a": "b"}, None, {"a": "b"}], + [{"a": None}, {"a": None}, {"a": None}], + [], + [{"a": "d"}, {"a": "d"}, {"a": "d"}], + None, + [{"a": "e"}], + */ + #[test] + fn nested_list_struct_nullable() { + let a = [ + true, true, true, false, true, false, false, false, true, true, true, true, + ] + .into(); + let b = [ + true, true, true, false, true, true, true, true, true, true, true, true, + ] + .into(); + let c = [true, false, true, true, true, true, false, true].into(); + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + validity: Some(&c), + }), + Nested::Struct(Some(&b), true, 12), + Nested::Primitive(Some(&a), true, 12), + ]; + let expected = vec![4, 4, 0, 4, 2, 4, 3, 3, 3, 1, 4, 4, 4, 0, 4]; + + test(nested, expected) + } + + #[test] + fn nested_list_struct_nullable1() { + let c = [true, false].into(); + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 1], + validity: Some(&c), + }), + Nested::Struct(None, true, 1), + Nested::Primitive(None, true, 1), + ]; + let expected = vec![4, 0]; + + test(nested, expected) + } + + #[test] + fn nested_struct_list_nullable() { + let a = [true, false, true, true, true, true, false, true].into(); + let b = [ + true, true, true, false, true, true, true, true, true, true, true, true, + ] + .into(); + let nested = vec![ + Nested::Struct(None, true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + validity: Some(&a), + }), + Nested::Primitive(Some(&b), true, 12), + ]; + let expected = vec![4, 4, 1, 4, 3, 4, 4, 4, 4, 2, 4, 4, 4, 1, 4]; + + test(nested, expected) + } + + #[test] + fn nested_struct_list_nullable1() { + let a = [true, true, false].into(); + let nested = vec![ + Nested::Struct(None, true, 3), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 1, 1], + validity: Some(&a), + }), + Nested::Primitive(None, true, 1), + ]; + let expected = vec![4, 2, 1]; + + test(nested, expected) + } } diff --git a/src/io/parquet/write/nested/rep.rs b/src/io/parquet/write/nested/rep.rs index c7c5d15eb8e..6d88bbd1637 100644 --- a/src/io/parquet/write/nested/rep.rs +++ b/src/io/parquet/write/nested/rep.rs @@ -8,45 +8,33 @@ impl + std::fmt::Debug> DebugIter for A {} fn iter<'a>(nested: &'a [Nested]) -> Vec> { nested .iter() - .enumerate() - .filter_map(|(i, nested)| match nested { + .filter_map(|nested| match nested { Nested::Primitive(_, _, _) => None, Nested::List(nested) => Some(Box::new(to_length(nested.offsets)) as Box), Nested::LargeList(nested) => { Some(Box::new(to_length(nested.offsets)) as Box) } - Nested::Struct(_, _, length) => { - // only return 1, 1, 1, (x len) if struct is outer structure. - // otherwise treat as leaf - if i == 0 { - Some(Box::new(std::iter::repeat(1usize).take(*length)) as Box) - } else { - None - } - } + Nested::Struct(_, _, _) => None, }) .collect() } pub fn num_values(nested: &[Nested]) -> usize { - let iterators = iter(nested); - let depth = iterators.len(); + let pr = match nested.last().unwrap() { + Nested::Primitive(_, _, len) => *len, + _ => todo!(), + }; - iterators + iter(nested) .into_iter() .enumerate() - .map(|(index, lengths)| { - if index == depth - 1 { - lengths - .map(|length| if length == 0 { 1 } else { length }) - .sum::() - } else { - lengths - .map(|length| usize::from(length == 0)) - .sum::() - } + .map(|(_, lengths)| { + lengths + .map(|length| if length == 0 { 1 } else { 0 }) + .sum::() }) - .sum() + .sum::() + + pr } /// Iterator adapter of parquet / dremel repetition levels @@ -76,7 +64,7 @@ impl<'a> RepLevelsIter<'a> { let remaining_values = num_values(nested); let iter = iter(nested); - let remaining = std::iter::repeat(0).take(iter.len()).collect(); + let remaining = vec![0; iter.len()]; Self { iter, @@ -92,44 +80,50 @@ impl<'a> Iterator for RepLevelsIter<'a> { type Item = u32; fn next(&mut self) -> Option { - if *self.remaining.last().unwrap() > 0 { - *self.remaining.last_mut().unwrap() -= 1; - - let total = self.total; - self.total = 0; - let r = Some((self.current_level - total) as u32); - - for level in 0..self.current_level - 1 { - let level = self.remaining.len() - level - 1; - if self.remaining[level] == 0 { - self.current_level -= 1; - self.remaining[level.saturating_sub(1)] -= 1; - } - } - if self.remaining[0] == 0 { - self.current_level -= 1; - } + if self.remaining_values == 0 { + return None; + } + if self.remaining.is_empty() { self.remaining_values -= 1; - return r; + return Some(0); } - self.total = 0; for (iter, remaining) in self .iter .iter_mut() .zip(self.remaining.iter_mut()) .skip(self.current_level) { - let length: usize = iter.next()?; + let length: usize = iter.next().unwrap_or_default(); + *remaining = length; if length == 0 { - self.remaining_values -= 1; - return Some(self.current_level as u32); + break; } - *remaining = length; self.current_level += 1; self.total += 1; } - self.next() + + // track + if let Some(x) = self.remaining.get_mut(self.current_level.saturating_sub(1)) { + *x = x.saturating_sub(1) + } + let r = Some((self.current_level - self.total) as u32); + + // update + for level in 0..self.current_level.saturating_sub(1) { + let level = self.remaining.len() - level - 1; + if self.remaining[level] == 0 { + self.current_level -= 1; + self.remaining[level - 1] -= 1; + } + } + if self.remaining[0] == 0 { + self.current_level = self.current_level.saturating_sub(1); + } + self.total = 0; + self.remaining_values -= 1; + + r } fn size_hint(&self) -> (usize, Option) { @@ -147,8 +141,7 @@ mod tests { fn test(nested: Vec, expected: Vec) { let mut iter = RepLevelsIter::new(&nested); assert_eq!(iter.size_hint().0, expected.len()); - let result = iter.by_ref().collect::>(); - assert_eq!(result, expected); + assert_eq!(iter.by_ref().collect::>(), expected); assert_eq!(iter.size_hint().0, 0); } @@ -177,13 +170,36 @@ mod tests { #[test] fn l1() { let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], validity: None, }), Nested::Primitive(None, false, 12), ]; + // [2, 0, 3, 3, 0, 3, 0, 1] + // 1) + // a) 2 => remaining = [2], level = 1, total = 1 + // b) return 0 + // c) remaining = [1], total = 0 + // 2) + // a) + // b) return 1 + // c) remaining = [0], => 1 + + // + /* + [ + [False, True], + [], + [True, True, False], + [True, False, True], + [], + [False, False, False], + [], + [True], + ] + */ let expected = vec![0u32, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0]; @@ -193,12 +209,12 @@ mod tests { #[test] fn l2() { let nested = vec![ - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 2, 2, 4], validity: None, }), - Nested::List(ListNested:: { + Nested::List(ListNested { is_optional: false, offsets: &[0, 3, 7, 8, 10], validity: None, @@ -220,7 +236,7 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0i32, 1, 2], + offsets: &[0, 1, 2], validity: None, }), Nested::Struct(None, true, 2), @@ -236,13 +252,13 @@ mod tests { let nested = vec![ Nested::List(ListNested { is_optional: true, - offsets: &[0i32, 2, 3], + offsets: &[0, 2, 3], validity: None, }), Nested::Struct(None, true, 3), Nested::List(ListNested { is_optional: true, - offsets: &[0i32, 3, 6, 7], + offsets: &[0, 3, 6, 7], validity: None, }), Nested::Primitive(None, true, 7), @@ -251,4 +267,145 @@ mod tests { test(nested, expected) } + + #[test] + fn struct_list_optional() { + /* + {"f1": ["a", "b", None, "c"]} + */ + let nested = vec![ + Nested::Struct(None, true, 1), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 4], + validity: None, + }), + Nested::Primitive(None, true, 4), + ]; + let expected = vec![0, 1, 1, 1]; + + test(nested, expected) + } + + #[test] + fn l2_other() { + let nested = vec![ + Nested::List(ListNested { + is_optional: false, + offsets: &[0, 1, 1, 3, 5, 5, 8, 8, 9], + validity: None, + }), + Nested::List(ListNested { + is_optional: false, + offsets: &[0, 2, 4, 5, 7, 8, 9, 10, 11, 12], + validity: None, + }), + Nested::Primitive(None, false, 12), + ]; + let expected = vec![0, 2, 0, 0, 2, 1, 0, 2, 1, 0, 0, 1, 1, 0, 0]; + + test(nested, expected) + } + + #[test] + fn list_struct_list_1() { + /* + [ + [{"a": ["a"]}, {"a": ["b"]}], + [], + [{"a": ["b"]}, None, {"a": ["b"]}], + [{"a": []}, {"a": []}, {"a": []}], + [], + [{"a": ["d"]}, {"a": ["a"]}, {"a": ["c", "d"]}], + [], + [{"a": []}], + ] + // reps: [0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0] + */ + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 2, 2, 5, 8, 8, 11, 11, 12], + validity: None, + }), + Nested::Struct(None, true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8], + validity: None, + }), + Nested::Primitive(None, true, 9), + ]; + let expected = vec![0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0]; + + test(nested, expected) + } + + #[test] + fn list_struct_list_2() { + /* + [ + [{"a": []}], + ] + // reps: [0] + */ + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1], + validity: None, + }), + Nested::Struct(None, true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 0], + validity: None, + }), + Nested::Primitive(None, true, 0), + ]; + let expected = vec![0]; + + test(nested, expected) + } + + #[test] + fn list_struct_list_3() { + /* + [ + [{"a": []}], + [], + ] + // reps: [0, 0] + + // lengths: [[1, 0], [0]] + // 1) + // a1) 1 => remaining = [1, 0], level = 1, total = 1 + // a2) 0 => remaining = [1, 0], level = 1, total = 1 + // b) return 0 + // c) remaining = [1, 0], level = 1, total = 0 + // 2) + // a) 0 => remaining = [1, 0], level = 1, total = 0 + // b) return 1 + // c) remaining = [0], => 1 + */ + let nested = vec![ + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 1, 1], + validity: None, + }), + Nested::Struct(None, true, 12), + Nested::List(ListNested { + is_optional: true, + offsets: &[0, 0], + validity: None, + }), + Nested::Primitive(None, true, 0), + ]; + let expected = vec![0, 0]; + // [1, 0], [0] + // pick last + + test(nested, expected) + } } diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index b3a73454849..f42aa6be36f 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -83,8 +83,18 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { let array: ListArray = a.into(); Box::new(array) } + "empty" => { + // [None] + let data: [Option>>; 0] = []; + let mut a = MutableListArray::>::new(); + a.try_extend(data).unwrap(); + let array: ListArray = a.into(); + Box::new(array) + } "struct_list_nullable" => { - // [["a", "b", None, "c"]] + // [ + // {"f1": ["a", "b", None, "c"]} + // ] let a = ListArray::::new( DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), vec![0, 4].try_into().unwrap(), @@ -98,6 +108,20 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { ) .boxed() } + "list_struct_list_nullable" => { + let values = pyarrow_nested_edge("struct_list_nullable"); + ListArray::::new( + DataType::List(Box::new(Field::new( + "item", + values.data_type().clone(), + true, + ))), + vec![0, 1].try_into().unwrap(), + values, + None, + ) + .boxed() + } _ => todo!(), } } @@ -216,6 +240,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "list_nested_i64" | "list_nested_inner_required_i64" | "list_nested_inner_required_required_i64" => Box::new(NullArray::new(DataType::Null, 1)), + "struct_list_nullable" => pyarrow_nested_nullable("list_utf8"), "list_struct_nullable" => { let array = Utf8Array::::from([ Some("a"), @@ -252,6 +277,68 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { ) .boxed() } + "list_struct_list_nullable" => { + /* + [ + [{"a": ["a"]}, {"a": ["b"]}], + None, + [{"a": ["b"]}, None, {"a": ["b"]}], + [{"a": None}, {"a": None}, {"a": None}], + [], + [{"a": ["d"]}, {"a": [None]}, {"a": ["c", "d"]}], + None, + [{"a": []}], + ] + */ + let array = Utf8Array::::from([ + Some("a"), + Some("b"), + // + Some("b"), + Some("b"), + // + Some("d"), + None, + Some("c"), + Some("d"), + ]) + .boxed(); + + let array = ListArray::::new( + DataType::List(Box::new(Field::new( + "item", + array.data_type().clone(), + true, + ))), + vec![0, 1, 2, 3, 3, 4, 4, 4, 4, 5, 6, 8, 8] + .try_into() + .unwrap(), + array, + Some( + [ + true, true, true, false, true, false, false, false, true, true, true, true, + ] + .into(), + ), + ) + .boxed(); + + new_struct( + vec![array], + vec!["a".to_string()], + Some( + [ + true, true, // + true, false, true, // + true, true, true, // + true, true, true, // + true, + ] + .into(), + ), + ) + .boxed() + } other => unreachable!("{}", other), }; @@ -322,6 +409,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { let array: ListArray = a.into(); Box::new(array) } + "struct_list_nullable" => new_struct(vec![values], vec!["a".to_string()], None).boxed(), _ => { let field = match column { "list_int64" => Field::new("item", DataType::Int64, true), @@ -331,6 +419,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "list_utf8" => Field::new("item", DataType::Utf8, true), "list_large_binary" => Field::new("item", DataType::LargeBinary, true), "list_struct_nullable" => Field::new("item", values.data_type().clone(), true), + "list_struct_list_nullable" => Field::new("item", values.data_type().clone(), true), other => unreachable!("{}", other), }; @@ -757,6 +846,74 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { ) .boxed(), }, + "list_struct_list_nullable" => Statistics { + distinct_count: new_list( + new_struct( + vec![new_list(UInt64Array::from([None]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + null_count: new_list( + new_struct( + vec![new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + min_value: new_list( + new_struct( + vec![new_list(Utf8Array::::from_slice(["a"]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + max_value: new_list( + new_struct( + vec![new_list(Utf8Array::::from_slice(["d"]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + true, + ) + .boxed(), + }, + "struct_list_nullable" => Statistics { + distinct_count: new_struct( + vec![new_list(UInt64Array::from([None]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + null_count: new_struct( + vec![new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + min_value: new_struct( + vec![new_list(Utf8Array::::from_slice([""]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + max_value: new_struct( + vec![new_list(Utf8Array::::from_slice(["ccc"]).boxed(), true).boxed()], + vec!["a".to_string()], + None, + ) + .boxed(), + }, other => todo!("{}", other), } } @@ -793,9 +950,9 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { min_value: new_list(Box::new(Int64Array::from([Some(0)]))).boxed(), max_value: new_list(Box::new(Int64Array::from([Some(1)]))).boxed(), }, - "null" => Statistics { + "null" | "empty" => Statistics { distinct_count: new_list(UInt64Array::from([None]).boxed()).boxed(), - null_count: new_list(UInt64Array::from([Some(1)]).boxed()).boxed(), + null_count: new_list(UInt64Array::from([Some(0)]).boxed()).boxed(), min_value: new_list(Box::new(Int64Array::from([None]))).boxed(), max_value: new_list(Box::new(Int64Array::from([None]))).boxed(), }, @@ -819,6 +976,34 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { names, )), }, + "list_struct_list_nullable" => Statistics { + distinct_count: new_list( + new_struct( + vec![new_list(Box::new(UInt64Array::from([None]))).boxed()], + names.clone(), + ) + .boxed(), + ) + .boxed(), + null_count: new_list( + new_struct( + vec![new_list(Box::new(UInt64Array::from([Some(1)]))).boxed()], + names.clone(), + ) + .boxed(), + ) + .boxed(), + min_value: new_list(Box::new(new_struct( + vec![new_list(Box::new(Utf8Array::::from_slice(["a"]))).boxed()], + names.clone(), + ))) + .boxed(), + max_value: new_list(Box::new(new_struct( + vec![new_list(Box::new(Utf8Array::::from_slice(["c"]))).boxed()], + names, + ))) + .boxed(), + }, _ => unreachable!(), } } diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 271227d4a04..93a9e428185 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -67,6 +67,9 @@ fn test_pyarrow_integration( // pyarrow counts null struct items as nulls "struct_nullable", "list_struct_nullable", + "list_struct_list_nullable", + "struct_list_nullable", + "null", // pyarrow reports an incorrect min/max for MapArray "map", "map_nullable", @@ -349,10 +352,20 @@ fn v2_nested_nested_required_required() -> Result<()> { } #[test] -fn v1_list_of_struct() -> Result<()> { +fn v1_nested_list_struct_nullable() -> Result<()> { test_pyarrow_integration("list_struct_nullable", 1, "nested", false, false, None) } +#[test] +fn v1_nested_struct_list_nullable() -> Result<()> { + test_pyarrow_integration("struct_list_nullable", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_list_struct_list_nullable() -> Result<()> { + test_pyarrow_integration("list_struct_list_nullable", 1, "nested", false, false, None) +} + #[test] fn v1_decimal_9_nullable() -> Result<()> { test_pyarrow_integration("decimal_9", 1, "basic", false, false, None) @@ -474,20 +487,32 @@ fn v1_struct_struct_optional() -> Result<()> { } #[test] -fn v1_nested_edge_1() -> Result<()> { +fn v1_nested_edge_simple() -> Result<()> { test_pyarrow_integration("simple", 1, "nested_edge", false, false, None) } #[test] -fn v1_nested_edge_2() -> Result<()> { +fn v1_nested_edge_null() -> Result<()> { test_pyarrow_integration("null", 1, "nested_edge", false, false, None) } #[test] -fn v1_nested_edge_3() -> Result<()> { +fn v1_nested_edge_struct_list_nullable() -> Result<()> { test_pyarrow_integration("struct_list_nullable", 1, "nested_edge", false, false, None) } +#[test] +fn v1_nested_edge_list_struct_list_nullable() -> Result<()> { + test_pyarrow_integration( + "list_struct_list_nullable", + 1, + "nested_edge", + false, + false, + None, + ) +} + #[test] fn v1_map() -> Result<()> { test_pyarrow_integration("map", 1, "map", false, true, None) diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index ef5a18888d2..04383a10a2c 100644 --- a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -37,6 +37,10 @@ fn round_trip_opt_stats( pyarrow_required_statistics(column), ), "struct" => (pyarrow_struct(column), pyarrow_struct_statistics(column)), + "nested_edge" => ( + pyarrow_nested_edge(column), + pyarrow_nested_edge_statistics(column), + ), _ => unreachable!(), }; @@ -386,6 +390,30 @@ fn list_nested_inner_required_required_i64() -> Result<()> { ) } +#[test] +fn list_struct_nullable() -> Result<()> { + round_trip_opt_stats( + "list_struct_nullable", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn v1_nested_struct_list_nullable() -> Result<()> { + round_trip_opt_stats( + "struct_list_nullable", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + #[test] fn utf8_optional_v2_delta() -> Result<()> { round_trip( @@ -585,3 +613,47 @@ fn struct_v2() -> Result<()> { vec![Encoding::Plain, Encoding::Plain], ) } + +#[test] +fn nested_edge_simple() -> Result<()> { + round_trip( + "simple", + "nested_edge", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + ) +} + +#[test] +fn nested_edge_null() -> Result<()> { + round_trip( + "null", + "nested_edge", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + ) +} + +#[test] +fn v1_nested_edge_struct_list_nullable() -> Result<()> { + round_trip( + "struct_list_nullable", + "nested_edge", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + ) +} + +#[test] +fn nested_edge_list_struct_list_nullable() -> Result<()> { + round_trip( + "list_struct_list_nullable", + "nested_edge", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + ) +}