From d67dc121d2b26135294db0a6e64c7c344cb69030 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 4 Jul 2022 14:12:41 +0000 Subject: [PATCH] Fixed error in nested stats --- src/io/parquet/read/statistics/mod.rs | 125 +++++++++++++++++++------- tests/it/io/parquet/mod.rs | 120 +++++++++++++++---------- 2 files changed, 165 insertions(+), 80 deletions(-) diff --git a/src/io/parquet/read/statistics/mod.rs b/src/io/parquet/read/statistics/mod.rs index ff9449f0be9..d27123fc51e 100644 --- a/src/io/parquet/read/statistics/mod.rs +++ b/src/io/parquet/read/statistics/mod.rs @@ -34,8 +34,12 @@ use super::get_field_columns; /// Enum of a count statistics #[derive(Debug, PartialEq)] pub enum Count { - /// simple arrays (every type not a Struct) have a count of UInt64 + /// simple arrays have a count of UInt64 Single(UInt64Array), + /// list arrays have a count as a list of UInt64 + List(ListArray), + /// list arrays have a count as a list of UInt64 + LargeList(ListArray), /// struct arrays have a count as a struct of UInt64 Struct(StructArray), /// map arrays have a count as a map of UInt64 @@ -88,6 +92,24 @@ impl From for Statistics { .unwrap() .clone(); Count::Map(a) + } else if let PhysicalType::List = s.null_count.data_type().to_physical_type() { + let a = s + .null_count + .as_box() + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Count::List(a) + } else if let PhysicalType::LargeList = s.null_count.data_type().to_physical_type() { + let a = s + .null_count + .as_box() + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Count::LargeList(a) } else { let a = s .null_count @@ -98,35 +120,54 @@ impl From for Statistics { .clone(); Count::Single(a) }; - let distinct_count = - if let PhysicalType::Struct = s.distinct_count.data_type().to_physical_type() { - let a = s - .distinct_count - .as_box() - .as_any() - .downcast_ref::() - .unwrap() - .clone(); - Count::Struct(a) - } else if let PhysicalType::Map = s.null_count.data_type().to_physical_type() { - let a = s - .null_count - .as_box() - .as_any() - .downcast_ref::() - .unwrap() - .clone(); - Count::Map(a) - } else { - let a = s - .distinct_count - .as_box() - .as_any() - .downcast_ref::() - .unwrap() - .clone(); - Count::Single(a) - }; + let distinct_count = if let PhysicalType::Struct = + s.distinct_count.data_type().to_physical_type() + { + let a = s + .distinct_count + .as_box() + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Count::Struct(a) + } else if let PhysicalType::Map = s.distinct_count.data_type().to_physical_type() { + let a = s + .distinct_count + .as_box() + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Count::Map(a) + } else if let PhysicalType::List = s.distinct_count.data_type().to_physical_type() { + let a = s + .distinct_count + .as_box() + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Count::List(a) + } else if let PhysicalType::LargeList = s.distinct_count.data_type().to_physical_type() { + let a = s + .distinct_count + .as_box() + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Count::LargeList(a) + } else { + let a = s + .distinct_count + .as_box() + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Count::Single(a) + }; Self { null_count, distinct_count, @@ -198,6 +239,18 @@ fn create_dt(data_type: &DataType) -> DataType { Box::new(Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)), *ordered, ) + } else if let DataType::List(f) = data_type.to_logical_type() { + DataType::List(Box::new(Field::new( + &f.name, + create_dt(&f.data_type), + f.is_nullable, + ))) + } else if let DataType::LargeList(f) = data_type.to_logical_type() { + DataType::LargeList(Box::new(Field::new( + &f.name, + create_dt(&f.data_type), + f.is_nullable, + ))) } else { DataType::UInt64 } @@ -301,12 +354,20 @@ fn push( .as_mut_any() .downcast_mut::() .unwrap(); + let distinct_count = distinct_count + .as_mut_any() + .downcast_mut::() + .unwrap(); + let null_count = null_count + .as_mut_any() + .downcast_mut::() + .unwrap(); return push( stats, min.inner.as_mut(), max.inner.as_mut(), - distinct_count, - null_count, + distinct_count.inner.as_mut(), + null_count.inner.as_mut(), ); } Dictionary(_, _, _) => { diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index f43a1736e41..197cf20be56 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -563,7 +563,7 @@ pub fn pyarrow_required_statistics(column: &str) -> Statistics { pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { let new_list = |array: Box, nullable: bool| { - Box::new(ListArray::::new( + ListArray::::new( DataType::List(Box::new(Field::new( "item", array.data_type().clone(), @@ -572,69 +572,93 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { vec![0, array.len() as i32].into(), array, None, - )) as Box + ) }; match column { "list_int16" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single(UInt64Array::from([Some(1)])), - min_value: new_list(Box::new(Int16Array::from_slice([0])), true), - max_value: new_list(Box::new(Int16Array::from_slice([10])), true), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)), + null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)), + min_value: new_list(Box::new(Int16Array::from_slice([0])), true).boxed(), + max_value: new_list(Box::new(Int16Array::from_slice([10])), true).boxed(), }, "list_bool" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single(UInt64Array::from([Some(1)])), - min_value: new_list(Box::new(BooleanArray::from_slice([false])), true), - max_value: new_list(Box::new(BooleanArray::from_slice([true])), true), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)), + null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)), + min_value: new_list(Box::new(BooleanArray::from_slice([false])), true).boxed(), + max_value: new_list(Box::new(BooleanArray::from_slice([true])), true).boxed(), }, "list_utf8" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single([Some(1)].into()), - min_value: new_list(Box::new(Utf8Array::::from_slice([""])), true), - max_value: new_list(Box::new(Utf8Array::::from_slice(["ccc"])), true), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)), + null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)), + min_value: new_list(Box::new(Utf8Array::::from_slice([""])), true).boxed(), + max_value: new_list(Box::new(Utf8Array::::from_slice(["ccc"])), true).boxed(), }, "list_large_binary" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single([Some(1)].into()), - min_value: new_list(Box::new(BinaryArray::::from_slice([b""])), true), - max_value: new_list(Box::new(BinaryArray::::from_slice([b"ccc"])), true), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)), + null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)), + min_value: new_list(Box::new(BinaryArray::::from_slice([b""])), true).boxed(), + max_value: new_list(Box::new(BinaryArray::::from_slice([b"ccc"])), true).boxed(), }, "list_int64" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single([Some(1)].into()), - min_value: new_list(Box::new(Int64Array::from_slice([0])), true), - max_value: new_list(Box::new(Int64Array::from_slice([10])), true), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)), + null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)), + min_value: new_list(Box::new(Int64Array::from_slice([0])), true).boxed(), + max_value: new_list(Box::new(Int64Array::from_slice([10])), true).boxed(), }, "list_int64_required" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single([Some(1)].into()), - min_value: new_list(Box::new(Int64Array::from_slice([0])), false), - max_value: new_list(Box::new(Int64Array::from_slice([10])), false), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)), + null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)), + min_value: new_list(Box::new(Int64Array::from_slice([0])), false).boxed(), + max_value: new_list(Box::new(Int64Array::from_slice([10])), false).boxed(), }, "list_int64_required_required" | "list_int64_optional_required" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single([Some(0)].into()), - min_value: new_list(Box::new(Int64Array::from_slice([0])), false), - max_value: new_list(Box::new(Int64Array::from_slice([10])), false), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), false)), + null_count: Count::List(new_list(UInt64Array::from([Some(0)]).boxed(), false)), + min_value: new_list(Box::new(Int64Array::from_slice([0])), false).boxed(), + max_value: new_list(Box::new(Int64Array::from_slice([10])), false).boxed(), }, "list_nested_i64" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single([Some(2)].into()), - min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true), - max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)), + null_count: Count::List(new_list(UInt64Array::from([Some(2)]).boxed(), true)), + min_value: new_list( + new_list(Box::new(Int64Array::from_slice([0])), true).boxed(), + true, + ) + .boxed(), + max_value: new_list( + new_list(Box::new(Int64Array::from_slice([10])), true).boxed(), + true, + ) + .boxed(), }, "list_nested_inner_required_required_i64" => Statistics { distinct_count: Count::Single(UInt64Array::from([None])), null_count: Count::Single([Some(0)].into()), - min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true), - max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true), + min_value: new_list( + new_list(Box::new(Int64Array::from_slice([0])), true).boxed(), + true, + ) + .boxed(), + max_value: new_list( + new_list(Box::new(Int64Array::from_slice([10])), true).boxed(), + true, + ) + .boxed(), }, "list_nested_inner_required_i64" => Statistics { distinct_count: Count::Single(UInt64Array::from([None])), null_count: Count::Single([Some(0)].into()), - min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true), - max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true), + min_value: new_list( + new_list(Box::new(Int64Array::from_slice([0])), true).boxed(), + true, + ) + .boxed(), + max_value: new_list( + new_list(Box::new(Int64Array::from_slice([10])), true).boxed(), + true, + ) + .boxed(), }, other => todo!("{}", other), } @@ -642,7 +666,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { let new_list = |array: Box| { - Box::new(ListArray::::new( + ListArray::::new( DataType::List(Box::new(Field::new( "item", array.data_type().clone(), @@ -651,21 +675,21 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { vec![0, array.len() as i32].into(), array, None, - )) + ) }; match column { "simple" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single(UInt64Array::from([Some(0)])), - min_value: new_list(Box::new(Int64Array::from([Some(0)]))), - max_value: new_list(Box::new(Int64Array::from([Some(1)]))), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed())), + null_count: Count::List(new_list(UInt64Array::from([Some(0)]).boxed())), + min_value: new_list(Box::new(Int64Array::from([Some(0)]))).boxed(), + max_value: new_list(Box::new(Int64Array::from([Some(1)]))).boxed(), }, "null" => Statistics { - distinct_count: Count::Single(UInt64Array::from([None])), - null_count: Count::Single(UInt64Array::from([Some(1)])), - min_value: new_list(Box::new(Int64Array::from([None]))), - max_value: new_list(Box::new(Int64Array::from([None]))), + distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed())), + null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed())), + min_value: new_list(Box::new(Int64Array::from([None]))).boxed(), + max_value: new_list(Box::new(Int64Array::from([None]))).boxed(), }, _ => unreachable!(), }