This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
Commit f35e02a: Read parquet row groups in chunks (#789)
1 parent: b46a636
Showing 39 changed files with 4,163 additions and 2,140 deletions.
@@ -1,46 +1,23 @@
 use std::fs::File;
-use std::io::BufReader;
+use std::time::SystemTime;
 
+use arrow2::error::Result;
 use arrow2::io::parquet::read;
-use arrow2::{array::Array, error::Result};
-
-fn read_field(path: &str, row_group: usize, field: usize) -> Result<Box<dyn Array>> {
-    // Open the file
-    let mut file = BufReader::new(File::open(path)?);
-
-    // Read the file's metadata. This has a small IO cost because it requires seeking to the end
-    // of the file to read its footer.
-    let metadata = read::read_metadata(&mut file)?;
-
-    // Convert the file's metadata into an arrow schema. This is CPU-only and amounts to
-    // parsing thrift if the arrow schema is available in a key, or inferring the arrow schema
-    // from the parquet's physical, converted and logical types.
-    let arrow_schema = read::get_schema(&metadata)?;
-
-    // Create an iterator of column chunks. Each iteration
-    // yields an iterator of compressed pages. There is almost no CPU work in iterating.
-    let columns = read::get_column_iterator(&mut file, &metadata, row_group, field, None, vec![]);
-
-    // Get the column's field.
-    let field = &arrow_schema.fields[field];
-
-    // This is the actual work. In this case, pages are read,
-    // decompressed, decoded and deserialized to arrow.
-    // Because `columns` is an iterator, it uses a combination of IO and CPU.
-    let (array, _, _) = read::column_iter_to_array(columns, field, vec![])?;
-
-    Ok(array)
-}
-
 fn main() -> Result<()> {
     use std::env;
     let args: Vec<String> = env::args().collect();
 
     let file_path = &args[1];
-    let field = args[2].parse::<usize>().unwrap();
-    let row_group = args[3].parse::<usize>().unwrap();
 
-    let array = read_field(file_path, row_group, field)?;
-    println!("{:?}", array);
+    let reader = File::open(file_path)?;
+    let reader = read::FileReader::try_new(reader, None, None, None, None)?;
+
+    let start = SystemTime::now();
+    for maybe_chunk in reader {
+        let columns = maybe_chunk?;
+        assert!(!columns.is_empty());
+    }
+    println!("took: {} ms", start.elapsed().unwrap().as_millis());
     Ok(())
 }
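
To make the new API concrete, here is a minimal sketch (not part of the commit) of the same loop as a reusable function. It uses only calls that appear in this diff; the five `None` arguments of `FileReader::try_new` are passed through unchanged because the diff does not show what each position configures, and `Chunk::len()` returning the number of rows is taken from the async example below.

use std::fs::File;

use arrow2::error::Result;
use arrow2::io::parquet::read;

/// Counts the rows of a parquet file by draining the chunked `FileReader`
/// introduced in this commit.
fn count_rows(path: &str) -> Result<usize> {
    let reader = File::open(path)?;
    // The five `None`s keep the reader's defaults; the diff does not spell out
    // what each position selects, so none of them are set here.
    let reader = read::FileReader::try_new(reader, None, None, None, None)?;

    let mut rows = 0;
    for maybe_chunk in reader {
        // Each iteration yields one chunk of deserialized arrow arrays;
        // `len()` is its number of rows.
        rows += maybe_chunk?.len();
    }
    Ok(rows)
}

Because `FileReader` is a plain iterator, a scan like this holds only one chunk of arrays in memory at a time.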
@@ -0,0 +1,57 @@
+use std::sync::Arc;
+use std::time::SystemTime;
+
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use tokio;
+use tokio::fs::File;
+use tokio::io::BufReader;
+use tokio_util::compat::*;
+
+use arrow2::error::Result;
+use arrow2::io::parquet::read::{self, RowGroupDeserializer};
+
+#[tokio::main(flavor = "current_thread")]
+async fn main() -> Result<()> {
+    let start = SystemTime::now();
+
+    use std::env;
+    let args: Vec<String> = env::args().collect();
+    let file_path = Arc::new(args[1].clone());
+
+    // # Read metadata
+    let mut reader = BufReader::new(File::open(file_path.as_ref()).await?).compat();
+
+    // This operation is usually done before reading the data, during planning.
+    // It is a mix of IO- and CPU-bound tasks, but both are O(1).
+    let metadata = read::read_metadata_async(&mut reader).await?;
+    let schema = read::get_schema(&metadata)?;
+
+    // This factory yields one file descriptor per column and is used to read
+    // columns concurrently. The descriptors do not need to be buffered, since
+    // we execute exactly 1 seek and 1 read on each of them.
+    let factory = || {
+        Box::pin(async { Ok(File::open(file_path.clone().as_ref()).await?.compat()) })
+            as BoxFuture<_>
+    };
+
+    // This is the row group loop. Groups can be skipped based on the statistics they carry.
+    for row_group in &metadata.row_groups {
+        // A row group is consumed in two steps: the first step is to read the
+        // (compressed) columns into memory, which is IO-bound.
+        let column_chunks =
+            read::read_columns_async(factory, row_group, schema.fields.clone(), None).await?;
+
+        // The second step is to iterate over the columns in chunks.
+        // This operation is CPU-bound and should be sent to a separate thread
+        // pool (e.g. `tokio_rayon`) so it does not block the runtime.
+        // Furthermore, it is trivially parallelizable, e.g. via rayon, as each
+        // iterator can be advanced in parallel (parallel decompression and deserialization).
+        let chunks = RowGroupDeserializer::new(column_chunks, row_group.num_rows() as usize, None);
+        for maybe_chunk in chunks {
+            let chunk = maybe_chunk?;
+            println!("{}", chunk.len());
+        }
+    }
+    println!("took: {} ms", start.elapsed().unwrap().as_millis());
+    Ok(())
+}
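
The "separate thread pool" suggestion above can be made concrete. The sketch below (not part of the commit) moves the CPU-bound second step onto tokio's blocking pool with `tokio::task::spawn_blocking`, a stock tokio API; `tokio_rayon` would be the same idea with a dedicated rayon pool. It is written generically over the chunk iterator so it does not depend on the exact column-chunk types, which this diff does not spell out, and it assumes the deserializer is `Send + 'static`.

use arrow2::error::Result;

/// Drains a CPU-bound, iterator-based deserializer to completion on tokio's
/// blocking thread pool, so the async runtime is not starved.
async fn drain_blocking<C, I>(chunks: I) -> Result<Vec<C>>
where
    C: Send + 'static,
    I: Iterator<Item = Result<C>> + Send + 'static,
{
    tokio::task::spawn_blocking(move || chunks.collect::<Result<Vec<C>>>())
        .await
        .expect("the deserialization task panicked")
}

In the row group loop above, the inner `for` would then become a single `drain_blocking(chunks).await?`, at the cost of buffering one row group's deserialized arrays instead of printing them chunk by chunk.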