From ac318931d5f68be208ce5e67f72256cdd9bc292b Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Thu, 24 Feb 2022 15:36:22 +0000 Subject: [PATCH] Cleanup --- .../parquet/read/deserialize/binary/nested.rs | 7 ++----- src/io/parquet/read/file.rs | 7 +++---- src/io/parquet/read/mod.rs | 17 +++++++++-------- src/io/parquet/read/row_group.rs | 2 +- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/io/parquet/read/deserialize/binary/nested.rs b/src/io/parquet/read/deserialize/binary/nested.rs index e49c4f49158..17de2d93a97 100644 --- a/src/io/parquet/read/deserialize/binary/nested.rs +++ b/src/io/parquet/read/deserialize/binary/nested.rs @@ -3,15 +3,12 @@ use std::collections::VecDeque; use parquet2::{encoding::Encoding, page::DataPage, schema::Repetition}; use crate::{ - array::Offset, - bitmap::MutableBitmap, - datatypes::DataType, - error::Result, + array::Offset, bitmap::MutableBitmap, datatypes::DataType, error::Result, io::parquet::read::DataPages, }; -use super::super::utils::MaybeNext; use super::super::nested_utils::*; +use super::super::utils::MaybeNext; use super::utils::Binary; use super::{ super::utils, diff --git a/src/io/parquet/read/file.rs b/src/io/parquet/read/file.rs index 126253e6b2e..8251392c6c0 100644 --- a/src/io/parquet/read/file.rs +++ b/src/io/parquet/read/file.rs @@ -14,14 +14,13 @@ use super::{infer_schema, read_metadata, FileMetaData, RowGroupDeserializer, Row type GroupFilter = Arc bool>; -/// An iterator of [`Chunk`] coming from row groups of a paquet file. +/// An iterator of [`Chunk`]s coming from row groups of a parquet file. /// -/// This can be thought of flatten chain of [`Iterator`] - each row group is sequentially +/// This can be thought of a flatten chain of [`Iterator`] - each row group is sequentially /// mapped to an [`Iterator`] and each iterator is iterated upon until either the limit /// or the last iterator ends. -/// /// # Implementation -/// Note that because +/// This iterator mixes IO-bounded and CPU-bounded operations. pub struct FileReader { row_groups: RowGroupReader, metadata: FileMetaData, diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs index 38ab7ac4654..14bcbef3c02 100644 --- a/src/io/parquet/read/mod.rs +++ b/src/io/parquet/read/mod.rs @@ -7,12 +7,9 @@ mod row_group; pub mod schema; pub mod statistics; -use std::{ - io::{Read, Seek}, - sync::Arc, -}; - use futures::{AsyncRead, AsyncSeek}; + +// re-exports of parquet2's relevant APIs pub use parquet2::{ error::ParquetError, fallible_streaming_iterator, @@ -32,21 +29,25 @@ pub use parquet2::{ FallibleStreamingIterator, }; -use crate::{array::Array, error::Result}; - pub use deserialize::{column_iter_to_arrays, get_page_iterator}; pub use file::{FileReader, RowGroupReader}; pub use row_group::*; pub(crate) use schema::is_type_nullable; pub use schema::{infer_schema, FileMetaData}; -//use simple::nested_utils::{InitNested, NestedArrayIter, NestedState}; +use std::{ + io::{Read, Seek}, + sync::Arc, +}; + +use crate::{array::Array, error::Result}; /// Trait describing a [`FallibleStreamingIterator`] of [`DataPage`] pub trait DataPages: FallibleStreamingIterator + Send + Sync { } + impl + Send + Sync> DataPages for I { diff --git a/src/io/parquet/read/row_group.rs b/src/io/parquet/read/row_group.rs index 587544b6889..f83a65eadbe 100644 --- a/src/io/parquet/read/row_group.rs +++ b/src/io/parquet/read/row_group.rs @@ -87,7 +87,7 @@ impl Iterator for RowGroupDeserializer { } } -/// Returns all the parquet columns associated to `field_name`. +/// Returns all [`ColumnChunkMetaData`] associated to `field_name`. /// For non-nested parquet types, this returns a single column pub(super) fn get_field_columns<'a>( columns: &'a [ColumnChunkMetaData],