From 1c5bfc7159a4660657e31d4f866eeda705773f5e Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 23 Oct 2021 10:36:54 +0000 Subject: [PATCH] Added support for nested lists. --- src/io/parquet/read/mod.rs | 5 ++++ src/io/parquet/read/nested_utils.rs | 46 +++++++++++++++++++++++------ tests/it/io/parquet/mod.rs | 18 +++++++++++ tests/it/io/parquet/read.rs | 9 +++--- 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs index 92f4d899f0e..e85f8c63637 100644 --- a/src/io/parquet/read/mod.rs +++ b/src/io/parquet/read/mod.rs @@ -214,6 +214,11 @@ fn page_iter_to_array_nested< LargeBinary | LargeUtf8 => { binary::iter_to_array_nested::(iter, metadata, data_type) } + List(ref inner) => { + let (values, mut nested) = + page_iter_to_array_nested(iter, metadata, inner.data_type().clone())?; + Ok((create_list(data_type, &mut nested, values)?.into(), nested)) + } other => Err(ArrowError::NotYetImplemented(format!( "Reading {:?} from parquet still not implemented", other diff --git a/src/io/parquet/read/nested_utils.rs b/src/io/parquet/read/nested_utils.rs index 04037f05ba7..e69b355819a 100644 --- a/src/io/parquet/read/nested_utils.rs +++ b/src/io/parquet/read/nested_utils.rs @@ -110,17 +110,45 @@ pub fn extend_offsets( R: Iterator, D: Iterator, { - assert_eq!(max_rep, 1); - let mut values_count = 0; + let mut values_count = vec![0; nested.len()]; + let mut prev_def: u32 = 0; + let mut is_first = true; + rep_levels.zip(def_levels).for_each(|(rep, def)| { - if rep == 0 { - nested[0].push(values_count, def != 0); + let mut closures = max_rep - rep; + if prev_def <= 1 { + closures = 1; + }; + if is_first { + closures = max_rep; + is_first = false; } + + nested + .iter_mut() + .zip(values_count.iter()) + .enumerate() + .skip(rep as usize) + .take((rep + closures) as usize) + .for_each(|(depth, (nested, length))| { + let is_null = (def - rep) as usize == depth && depth == rep as usize; + nested.push(*length, !is_null); + }); + if def == max_def || (is_nullable && def == max_def - 1) { - values_count += 1; + values_count[1] += 1; } + if rep <= 1 && def > 1 { + values_count[0] += 1; + } + prev_def = def; }); - nested[0].close(values_count); + nested + .iter_mut() + .zip(values_count.iter()) + .for_each(|(nested, length)| { + nested.close(*length); + }); } pub fn is_nullable(type_: &ParquetType, container: &mut Vec) { @@ -164,12 +192,12 @@ pub fn init_nested(base_type: &ParquetType, capacity: usize) -> (Vec], + nested: &mut Vec>, values: Arc, ) -> Result> { Ok(match data_type { DataType::List(_) => { - let (offsets, validity) = nested[0].inner(); + let (offsets, validity) = nested.pop().unwrap().inner(); let offsets = Buffer::::from_trusted_len_iter(offsets.iter().map(|x| *x as i32)); Box::new(ListArray::::from_data( @@ -177,7 +205,7 @@ pub fn create_list( )) } DataType::LargeList(_) => { - let (offsets, validity) = nested[0].inner(); + let (offsets, validity) = nested.pop().unwrap().inner(); Box::new(ListArray::::from_data( data_type, offsets, values, validity, diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index e82135fd812..6404b1228b8 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -139,6 +139,7 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { Some(b"bbb".to_vec()), Some(b"".to_vec()), ])), + 7 => Arc::new(NullArray::from_data(DataType::Null, 1)), _ => unreachable!(), }; @@ -169,6 +170,23 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { data_type, offsets, values, None, )) } + 7 => { + let data = [ + Some(vec![Some(vec![Some(0), Some(1)])]), + None, + Some(vec![Some(vec![Some(2), None]), Some(vec![Some(3)])]), + Some(vec![Some(vec![Some(4), Some(5)]), Some(vec![Some(6)])]), + Some(vec![]), + Some(vec![Some(vec![Some(7)]), None, Some(vec![Some(9)])]), + None, + Some(vec![Some(vec![Some(10)])]), + ]; + let mut a = + MutableListArray::>>::new(); + a.try_extend(data).unwrap(); + let array: ListArray = a.into(); + Box::new(array) + } _ => unreachable!(), } } diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index bfad685eb49..a92e8d39d00 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -232,6 +232,11 @@ fn v1_nested_large_binary() -> Result<()> { test_pyarrow_integration(6, 1, "nested", false, false) } +#[test] +fn v2_nested_nested() -> Result<()> { + test_pyarrow_integration(7, 1, "nested", false, false) +} + #[test] fn v1_decimal_9_nullable() -> Result<()> { test_pyarrow_integration(7, 1, "basic", false, false) @@ -291,10 +296,6 @@ fn v2_decimal_26_nullable() -> Result<()> { fn v2_decimal_26_required() -> Result<()> { test_pyarrow_integration(8, 2, "basic", false, true) } -/*#[test] -fn v2_nested_nested() { - let _ = test_pyarrow_integration(7, 1, "nested",false, false); -}*/ #[test] fn all_types() -> Result<()> {