diff --git a/arrow-parquet-integration-testing/src/main.rs b/arrow-parquet-integration-testing/src/main.rs index d99c7888aa8..eef4acc9cb0 100644 --- a/arrow-parquet-integration-testing/src/main.rs +++ b/arrow-parquet-integration-testing/src/main.rs @@ -175,15 +175,15 @@ fn main() -> Result<()> { .fields .iter() .map(|x| match x.data_type() { - DataType::Dictionary(..) => Encoding::RleDictionary, + DataType::Dictionary(..) => vec![Encoding::RleDictionary], DataType::Utf8 | DataType::LargeUtf8 => { - if args.encoding_utf8 == EncodingScheme::Delta { + vec![if args.encoding_utf8 == EncodingScheme::Delta { Encoding::DeltaLengthByteArray } else { Encoding::Plain - } + }] } - _ => Encoding::Plain, + _ => vec![Encoding::Plain], }) .collect(); diff --git a/src/doc/lib.md b/src/doc/lib.md index efc89876e51..e27e0ff2c37 100644 --- a/src/doc/lib.md +++ b/src/doc/lib.md @@ -51,7 +51,7 @@ fn main() -> Result<()> { vec![Ok(chunk)].into_iter(), &schema, options, - vec![Encoding::Plain, Encoding::Plain], + vec![vec![Encoding::Plain], vec![Encoding::Plain]], )?; // anything implementing `std::io::Write` works diff --git a/src/io/parquet/write/binary/nested.rs b/src/io/parquet/write/binary/nested.rs index 321b61241d5..198e1d2156c 100644 --- a/src/io/parquet/write/binary/nested.rs +++ b/src/io/parquet/write/binary/nested.rs @@ -22,11 +22,8 @@ where let is_optional = is_nullable(&type_.field_info); let mut buffer = vec![]; - nested::write_rep_levels(&mut buffer, &nested, options.version)?; - let repetition_levels_byte_length = buffer.len(); - - nested::write_def_levels(&mut buffer, &nested, options.version)?; - let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length; + let (repetition_levels_byte_length, definition_levels_byte_length) = + nested::write_rep_and_def(options.version, &nested, &mut buffer)?; encode_plain(array, is_optional, &mut buffer); diff --git a/src/io/parquet/write/boolean/nested.rs b/src/io/parquet/write/boolean/nested.rs index d8a156f6c8d..4bd741ab52a 100644 --- a/src/io/parquet/write/boolean/nested.rs +++ b/src/io/parquet/write/boolean/nested.rs @@ -19,11 +19,8 @@ pub fn array_to_page( let is_optional = is_nullable(&type_.field_info); let mut buffer = vec![]; - nested::write_rep_levels(&mut buffer, &nested, options.version)?; - let repetition_levels_byte_length = buffer.len(); - - nested::write_def_levels(&mut buffer, &nested, options.version)?; - let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length; + let (repetition_levels_byte_length, definition_levels_byte_length) = + nested::write_rep_and_def(options.version, &nested, &mut buffer)?; encode_plain(array, is_optional, &mut buffer)?; diff --git a/src/io/parquet/write/nested/mod.rs b/src/io/parquet/write/nested/mod.rs index 51e94a57fc5..39a3f4eddf5 100644 --- a/src/io/parquet/write/nested/mod.rs +++ b/src/io/parquet/write/nested/mod.rs @@ -28,19 +28,26 @@ fn write_levels_v1) -> Result<()>>( } /// writes the rep levels to a `Vec`. -pub fn write_rep_levels(buffer: &mut Vec, nested: &[Nested], version: Version) -> Result<()> { - let num_bits = get_bit_width(max_rep_level(nested) as i16) as u8; +fn write_rep_levels(buffer: &mut Vec, nested: &[Nested], version: Version) -> Result<()> { + let max_level = max_rep_level(nested) as i16; + if max_level == 0 { + return Ok(()); + } + let num_bits = get_bit_width(max_level) as u8; + + let levels = rep::RepLevelsIter::new(nested); + + let mut buffer1 = vec![]; + encode_u32(&mut buffer1, rep::RepLevelsIter::new(nested), num_bits).unwrap(); match version { Version::V1 => { write_levels_v1(buffer, |buffer: &mut Vec| { - let levels = rep::RepLevelsIter::new(nested); encode_u32(buffer, levels, num_bits)?; Ok(()) })?; } Version::V2 => { - let levels = rep::RepLevelsIter::new(nested); encode_u32(buffer, levels, num_bits)?; } } @@ -49,11 +56,18 @@ pub fn write_rep_levels(buffer: &mut Vec, nested: &[Nested], version: Versio } /// writes the rep levels to a `Vec`. -pub fn write_def_levels(buffer: &mut Vec, nested: &[Nested], version: Version) -> Result<()> { - let num_bits = get_bit_width(max_def_level(nested) as i16) as u8; +fn write_def_levels(buffer: &mut Vec, nested: &[Nested], version: Version) -> Result<()> { + let max_level = max_def_level(nested) as i16; + if max_level == 0 { + return Ok(()); + } + let num_bits = get_bit_width(max_level) as u8; let levels = def::DefLevelsIter::new(nested); + let mut buffer1 = vec![]; + encode_u32(&mut buffer1, def::DefLevelsIter::new(nested), num_bits).unwrap(); + match version { Version::V1 => write_levels_v1(buffer, move |buffer: &mut Vec| { encode_u32(buffer, levels, num_bits)?; @@ -92,3 +106,17 @@ fn to_length( .windows(2) .map(|w| w[1].to_usize() - w[0].to_usize()) } + +pub fn write_rep_and_def( + page_version: Version, + nested: &[Nested], + buffer: &mut Vec, +) -> Result<(usize, usize)> { + write_rep_levels(buffer, nested, page_version)?; + let repetition_levels_byte_length = buffer.len(); + + write_def_levels(buffer, nested, page_version)?; + let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length; + + Ok((repetition_levels_byte_length, definition_levels_byte_length)) +} diff --git a/src/io/parquet/write/nested/rep.rs b/src/io/parquet/write/nested/rep.rs index c239fcd5f30..4281a6301e1 100644 --- a/src/io/parquet/write/nested/rep.rs +++ b/src/io/parquet/write/nested/rep.rs @@ -147,6 +147,17 @@ mod tests { #[test] fn struct_required() { + let nested = vec![ + Nested::Struct(None, false, 10), + Nested::Primitive(None, true, 10), + ]; + let expected = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + + test(nested, expected) + } + + #[test] + fn struct_optional() { let nested = vec![ Nested::Struct(None, true, 10), Nested::Primitive(None, true, 10), diff --git a/src/io/parquet/write/primitive/nested.rs b/src/io/parquet/write/primitive/nested.rs index 2742650a73a..9bd13184ca4 100644 --- a/src/io/parquet/write/primitive/nested.rs +++ b/src/io/parquet/write/primitive/nested.rs @@ -28,11 +28,8 @@ where let is_optional = is_nullable(&type_.field_info); let mut buffer = vec![]; - nested::write_rep_levels(&mut buffer, &nested, options.version)?; - let repetition_levels_byte_length = buffer.len(); - - nested::write_def_levels(&mut buffer, &nested, options.version)?; - let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length; + let (repetition_levels_byte_length, definition_levels_byte_length) = + nested::write_rep_and_def(options.version, &nested, &mut buffer)?; encode_plain(array, is_optional, &mut buffer); diff --git a/src/io/parquet/write/sink.rs b/src/io/parquet/write/sink.rs index d9cfe7b4166..de4fa11da26 100644 --- a/src/io/parquet/write/sink.rs +++ b/src/io/parquet/write/sink.rs @@ -30,7 +30,7 @@ use super::{Encoding, SchemaDescriptor, WriteOptions}; /// let schema = Schema::from(vec![ /// Field::new("values", DataType::Int32, true), /// ]); -/// let encoding = vec![Encoding::Plain]; +/// let encoding = vec![vec![Encoding::Plain]]; /// let options = WriteOptions { /// write_statistics: true, /// compression: CompressionOptions::Uncompressed, diff --git a/src/io/parquet/write/utf8/nested.rs b/src/io/parquet/write/utf8/nested.rs index bc96f07d2ed..7a6e0ee05be 100644 --- a/src/io/parquet/write/utf8/nested.rs +++ b/src/io/parquet/write/utf8/nested.rs @@ -22,11 +22,8 @@ where let is_optional = is_nullable(&type_.field_info); let mut buffer = vec![]; - nested::write_rep_levels(&mut buffer, &nested, options.version)?; - let repetition_levels_byte_length = buffer.len(); - - nested::write_def_levels(&mut buffer, &nested, options.version)?; - let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length; + let (repetition_levels_byte_length, definition_levels_byte_length) = + nested::write_rep_and_def(options.version, &nested, &mut buffer)?; encode_plain(array, is_optional, &mut buffer); diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 5216ad0436b..57b6cbf155d 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -31,7 +31,6 @@ fn test_pyarrow_integration( let mut file = File::open(path).unwrap(); let (array, statistics) = read_column(&mut file, column)?; - println!("{:?}", array); let expected = match (type_, required) { ("basic", true) => pyarrow_required(column), diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index fe288779add..2c41aac5fa2 100644 --- a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -509,7 +509,7 @@ fn decimal_26_required_v2() -> Result<()> { } #[test] -fn struct_() -> Result<()> { +fn struct_v1() -> Result<()> { round_trip( "struct", "struct", @@ -518,3 +518,14 @@ fn struct_() -> Result<()> { vec![Encoding::Plain, Encoding::Plain], ) } + +#[test] +fn struct_v2() -> Result<()> { + round_trip( + "struct", + "struct", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain, Encoding::Plain], + ) +}