From 7d18273450fc1190a835a61f61a632f54dab0517 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 1 Aug 2022 06:22:47 +0000 Subject: [PATCH] Simpler and added docs --- .../parquet/read/deserialize/binary/basic.rs | 2 +- .../parquet/read/deserialize/binary/utils.rs | 8 +++---- .../parquet/read/deserialize/boolean/basic.rs | 2 +- .../deserialize/fixed_size_binary/basic.rs | 2 +- .../parquet/read/deserialize/nested_utils.rs | 2 +- .../read/deserialize/primitive/basic.rs | 2 +- src/io/parquet/read/deserialize/utils.rs | 22 +++++++++++++------ 7 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/io/parquet/read/deserialize/binary/basic.rs b/src/io/parquet/read/deserialize/binary/basic.rs index cd4a7805bbb..37440c9ba7f 100644 --- a/src/io/parquet/read/deserialize/binary/basic.rs +++ b/src/io/parquet/read/deserialize/binary/basic.rs @@ -221,7 +221,7 @@ impl TraitBinaryArray for Utf8Array { } } -impl<'a, O: Offset> DecodedState<'a> for (Binary, MutableBitmap) { +impl DecodedState for (Binary, MutableBitmap) { fn len(&self) -> usize { self.0.len() } diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs index 0cd6bb5e31a..eac2b806643 100644 --- a/src/io/parquet/read/deserialize/binary/utils.rs +++ b/src/io/parquet/read/deserialize/binary/utils.rs @@ -107,10 +107,10 @@ impl<'a> Iterator for BinaryIter<'a> { if self.values.is_empty() { return None; } - let length = u32::from_le_bytes(self.values[0..4].try_into().unwrap()) as usize; - self.values = &self.values[4..]; - let result = &self.values[..length]; - self.values = &self.values[length..]; + let (length, remaining) = self.values.split_at(4); + let length = u32::from_le_bytes(length.try_into().unwrap()) as usize; + let (result, remaining) = remaining.split_at(length); + self.values = remaining; Some(result) } } diff --git a/src/io/parquet/read/deserialize/boolean/basic.rs b/src/io/parquet/read/deserialize/boolean/basic.rs index 
e74cc50af5e..d12bff3eced 100644 --- a/src/io/parquet/read/deserialize/boolean/basic.rs +++ b/src/io/parquet/read/deserialize/boolean/basic.rs @@ -100,7 +100,7 @@ impl<'a> utils::PageState<'a> for State<'a> { } } -impl<'a> DecodedState<'a> for (MutableBitmap, MutableBitmap) { +impl DecodedState for (MutableBitmap, MutableBitmap) { fn len(&self) -> usize { self.0.len() } diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs index 0f9eb508158..e8de6cbf3f8 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs @@ -150,7 +150,7 @@ struct BinaryDecoder { size: usize, } -impl<'a> DecodedState<'a> for (FixedSizeBinary, MutableBitmap) { +impl DecodedState for (FixedSizeBinary, MutableBitmap) { fn len(&self) -> usize { self.0.len() } diff --git a/src/io/parquet/read/deserialize/nested_utils.rs b/src/io/parquet/read/deserialize/nested_utils.rs index 882a1724f85..73679d34224 100644 --- a/src/io/parquet/read/deserialize/nested_utils.rs +++ b/src/io/parquet/read/deserialize/nested_utils.rs @@ -247,7 +247,7 @@ impl Nested for NestedStruct { pub(super) trait NestedDecoder<'a> { type State: PageState<'a>; type Dictionary; - type DecodedState: DecodedState<'a>; + type DecodedState: DecodedState; fn build_state( &self, diff --git a/src/io/parquet/read/deserialize/primitive/basic.rs b/src/io/parquet/read/deserialize/primitive/basic.rs index bb4648aeb07..648e78782a0 100644 --- a/src/io/parquet/read/deserialize/primitive/basic.rs +++ b/src/io/parquet/read/deserialize/primitive/basic.rs @@ -145,7 +145,7 @@ where } } -impl<'a, T: std::fmt::Debug> utils::DecodedState<'a> for (Vec, MutableBitmap) { +impl utils::DecodedState for (Vec, MutableBitmap) { fn len(&self) -> usize { self.0.len() } diff --git a/src/io/parquet/read/deserialize/utils.rs b/src/io/parquet/read/deserialize/utils.rs index 762ce62d1b2..b847709e7b2 100644 --- 
a/src/io/parquet/read/deserialize/utils.rs +++ b/src/io/parquet/read/deserialize/utils.rs @@ -337,28 +337,32 @@ pub(super) trait PageState<'a>: std::fmt::Debug { } /// The state of a partially deserialized page -pub(super) trait DecodedState<'a>: std::fmt::Debug { - // the number of values that this decoder already consumed +pub(super) trait DecodedState: std::fmt::Debug { + // the number of values that the state already has fn len(&self) -> usize; } /// A decoder that knows how to map `State` -> Array pub(super) trait Decoder<'a> { + /// The state that this decoder derives from a [`DataPage`]. This is bound to the page. type State: PageState<'a>; + /// The dictionary representation that the decoder uses type Dict; - type DecodedState: DecodedState<'a>; + /// The target state that this Decoder decodes into. + type DecodedState: DecodedState; + /// Creates a new `Self::State` fn build_state( &self, page: &'a DataPage, dict: Option<&'a Self::Dict>, ) -> Result; - /// Initializes a new state + /// Initializes a new [`Self::DecodedState`]. fn with_capacity(&self, capacity: usize) -> Self::DecodedState; - /// extends (values, validity) by deserializing items in `State`. - /// It guarantees that the length of `values` is at most `values.len() + remaining`. + /// extends [`Self::DecodedState`] by deserializing items in [`Self::State`]. + /// It guarantees that the length of `decoded` is at most `decoded.len() + remaining`. fn extend_from_state( &self, page: &mut Self::State, @@ -366,7 +370,7 @@ pub(super) trait Decoder<'a> { additional: usize, ); - /// Deserializes a [`DictPage`] into a representation suited for decoding using it. + /// Deserializes a [`DictPage`] into [`Self::Dict`]. fn deserialize_dict(&self, page: &DictPage) -> Self::Dict; } @@ -404,10 +408,14 @@ pub(super) fn extend_from_new_page<'a, T: Decoder<'a>>( } } +/// Represents what happened when a new page was consumed #[derive(Debug)] pub enum MaybeNext

<P> { + /// Whether the page was sufficient to fill `chunk_size` Some(P), + /// whether there are no more pages or intermediary decoded states None, + /// Whether the page was insufficient to fill `chunk_size` and a new page is required More, }