From bb98a1d0273f1710a0c25a461e803210f82ebd13 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Thu, 9 Dec 2021 16:05:41 +0000 Subject: [PATCH] Improved performance. --- src/io/parquet/read/binary/basic.rs | 6 +++++- src/io/parquet/read/utils.rs | 6 ++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/io/parquet/read/binary/basic.rs b/src/io/parquet/read/binary/basic.rs index 5d7d0ced562..96b5b21adb0 100644 --- a/src/io/parquet/read/binary/basic.rs +++ b/src/io/parquet/read/binary/basic.rs @@ -214,7 +214,7 @@ fn read_plain_optional( pub(super) fn read_plain_required( buffer: &[u8], - _length: usize, + additional: usize, offsets: &mut MutableBuffer, values: &mut MutableBuffer, ) { @@ -222,11 +222,15 @@ pub(super) fn read_plain_required( let values_iterator = utils::BinaryIter::new(buffer); + // each value is stored as a 4-byte length prefix followed by that many bytes of data => reserve only the data bytes (buffer length minus 4 bytes per value). + values.reserve(buffer.len() - 4 * additional); + let a = values.capacity(); for value in values_iterator { last_offset += O::from_usize(value.len()).unwrap(); values.extend_from_slice(value); offsets.push(last_offset); } + debug_assert_eq!(a, values.capacity()); } pub(super) fn extend_from_page( diff --git a/src/io/parquet/read/utils.rs b/src/io/parquet/read/utils.rs index 5659f2d6440..85b40a95a6b 100644 --- a/src/io/parquet/read/utils.rs +++ b/src/io/parquet/read/utils.rs @@ -1,4 +1,6 @@ -use parquet2::encoding::{get_length, Encoding}; +use std::convert::TryInto; + +use parquet2::encoding::Encoding; use parquet2::metadata::ColumnDescriptor; use parquet2::page::{split_buffer as _split_buffer, DataPage, DataPageHeader}; @@ -22,7 +24,7 @@ impl<'a> Iterator for BinaryIter<'a> { if self.values.is_empty() { return None; } - let length = get_length(self.values) as usize; + let length = u32::from_le_bytes(self.values[0..4].try_into().unwrap()) as usize; self.values = &self.values[4..]; let result = &self.values[..length]; self.values = &self.values[length..];