Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added support to write nested parquet (#1007)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored May 27, 2022
1 parent b6a516c commit 2ed756a
Show file tree
Hide file tree
Showing 20 changed files with 1,488 additions and 713 deletions.
8 changes: 4 additions & 4 deletions arrow-parquet-integration-testing/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,15 +175,15 @@ fn main() -> Result<()> {
.fields
.iter()
.map(|x| match x.data_type() {
DataType::Dictionary(..) => Encoding::RleDictionary,
DataType::Dictionary(..) => vec![Encoding::RleDictionary],
DataType::Utf8 | DataType::LargeUtf8 => {
if args.encoding_utf8 == EncodingScheme::Delta {
vec![if args.encoding_utf8 == EncodingScheme::Delta {
Encoding::DeltaLengthByteArray
} else {
Encoding::Plain
}
}]
}
_ => Encoding::Plain,
_ => vec![Encoding::Plain],
})
.collect();

Expand Down
2 changes: 1 addition & 1 deletion benches/write_parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ fn write(array: &dyn Array, encoding: Encoding) -> Result<()> {
vec![Ok(columns)].into_iter(),
&schema,
options,
vec![encoding],
vec![vec![encoding]],
)?;

let writer = vec![];
Expand Down
8 changes: 6 additions & 2 deletions examples/parquet_write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,12 @@ fn write_batch(path: &str, schema: Schema, columns: Chunk<Arc<dyn Array>>) -> Re

let iter = vec![Ok(columns)];

let row_groups =
RowGroupIterator::try_new(iter.into_iter(), &schema, options, vec![Encoding::Plain])?;
let row_groups = RowGroupIterator::try_new(
iter.into_iter(),
&schema,
options,
vec![vec![Encoding::Plain]],
)?;

// Create a new empty file
let file = File::create(path)?;
Expand Down
2 changes: 1 addition & 1 deletion src/doc/lib.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ fn main() -> Result<()> {
vec![Ok(chunk)].into_iter(),
&schema,
options,
vec![Encoding::Plain, Encoding::Plain],
vec![vec![Encoding::Plain], vec![Encoding::Plain]],
)?;

// anything implementing `std::io::Write` works
Expand Down
21 changes: 8 additions & 13 deletions src/io/parquet/write/binary/nested.rs
Original file line number Diff line number Diff line change
@@ -1,34 +1,29 @@
use parquet2::schema::types::PrimitiveType;
use parquet2::{encoding::Encoding, page::DataPage};

use super::super::{levels, utils, WriteOptions};
use super::super::{nested, utils, WriteOptions};
use super::basic::{build_statistics, encode_plain};
use crate::io::parquet::read::schema::is_nullable;
use crate::io::parquet::write::Nested;
use crate::{
array::{Array, BinaryArray, Offset},
error::Result,
};

pub fn array_to_page<O, OO>(
pub fn array_to_page<O>(
array: &BinaryArray<O>,
options: WriteOptions,
type_: PrimitiveType,
nested: levels::NestedInfo<OO>,
nested: Vec<Nested>,
) -> Result<DataPage>
where
OO: Offset,
O: Offset,
{
let is_optional = is_nullable(&type_.field_info);

let validity = array.validity();

let mut buffer = vec![];
levels::write_rep_levels(&mut buffer, &nested, options.version)?;
let repetition_levels_byte_length = buffer.len();

levels::write_def_levels(&mut buffer, &nested, validity, is_optional, options.version)?;
let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length;
let (repetition_levels_byte_length, definition_levels_byte_length) =
nested::write_rep_and_def(options.version, &nested, &mut buffer)?;

encode_plain(array, is_optional, &mut buffer);

Expand All @@ -40,8 +35,8 @@ where

utils::build_plain_page(
buffer,
levels::num_values(nested.offsets()),
nested.offsets().len().saturating_sub(1),
nested::num_values(&nested),
nested[0].len(),
array.null_count(),
repetition_levels_byte_length,
definition_levels_byte_length,
Expand Down
27 changes: 10 additions & 17 deletions src/io/parquet/write/boolean/nested.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
use parquet2::schema::types::PrimitiveType;
use parquet2::{encoding::Encoding, page::DataPage};

use super::super::{levels, utils, WriteOptions};
use super::super::{nested, utils, WriteOptions};
use super::basic::{build_statistics, encode_plain};
use crate::io::parquet::read::schema::is_nullable;
use crate::io::parquet::write::Nested;
use crate::{
array::{Array, BooleanArray, Offset},
array::{Array, BooleanArray},
error::Result,
};

pub fn array_to_page<O>(
pub fn array_to_page(
array: &BooleanArray,
options: WriteOptions,
type_: PrimitiveType,
nested: levels::NestedInfo<O>,
) -> Result<DataPage>
where
O: Offset,
{
nested: Vec<Nested>,
) -> Result<DataPage> {
let is_optional = is_nullable(&type_.field_info);

let validity = array.validity();

let mut buffer = vec![];
levels::write_rep_levels(&mut buffer, &nested, options.version)?;
let repetition_levels_byte_length = buffer.len();

levels::write_def_levels(&mut buffer, &nested, validity, is_optional, options.version)?;
let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length;
let (repetition_levels_byte_length, definition_levels_byte_length) =
nested::write_rep_and_def(options.version, &nested, &mut buffer)?;

encode_plain(array, is_optional, &mut buffer)?;

Expand All @@ -39,8 +32,8 @@ where

utils::build_plain_page(
buffer,
levels::num_values(nested.offsets()),
nested.offsets().len().saturating_sub(1),
nested::num_values(&nested),
nested[0].len(),
array.null_count(),
repetition_levels_byte_length,
definition_levels_byte_length,
Expand Down
Loading

0 comments on commit 2ed756a

Please sign in to comment.