This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added support to read Avro files' metadata asynchronously (#614)
- Loading branch information
1 parent
9d4107c
commit 3f12bd6
Showing
12 changed files
with
419 additions
and
243 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
//! APIs to read from Avro format to arrow. | ||
use std::io::Read; | ||
|
||
use fallible_streaming_iterator::FallibleStreamingIterator; | ||
|
||
use crate::error::{ArrowError, Result}; | ||
|
||
use super::util; | ||
|
||
fn read_size<R: Read>(reader: &mut R) -> Result<(usize, usize)> { | ||
let rows = match util::zigzag_i64(reader) { | ||
Ok(a) => a, | ||
Err(ArrowError::Io(io_err)) => { | ||
if let std::io::ErrorKind::UnexpectedEof = io_err.kind() { | ||
// end | ||
return Ok((0, 0)); | ||
} else { | ||
return Err(ArrowError::Io(io_err)); | ||
} | ||
} | ||
Err(other) => return Err(other), | ||
}; | ||
let bytes = util::zigzag_i64(reader)?; | ||
Ok((rows as usize, bytes as usize)) | ||
} | ||
|
||
/// Reads a block from the file into `buf`. | ||
/// # Panic | ||
/// Panics iff the block marker does not equal to the file's marker | ||
fn read_block<R: Read>(reader: &mut R, buf: &mut Vec<u8>, file_marker: [u8; 16]) -> Result<usize> { | ||
let (rows, bytes) = read_size(reader)?; | ||
if rows == 0 { | ||
return Ok(0); | ||
}; | ||
|
||
buf.resize(bytes, 0); | ||
reader.read_exact(buf)?; | ||
|
||
let mut marker = [0u8; 16]; | ||
reader.read_exact(&mut marker)?; | ||
|
||
assert!(!(marker != file_marker)); | ||
Ok(rows) | ||
} | ||
|
||
/// A [`FallibleStreamingIterator`] over the (still compressed) blocks of an Avro file.
pub struct BlockStreamIterator<R: Read> {
    /// Source the blocks are read from.
    reader: R,
    /// Marker of the file, handed to `read_block` on every advance.
    file_marker: [u8; 16],
    /// Bytes of the current block and its number of rows.
    buf: (Vec<u8>, usize),
}
|
||
impl<R: Read> BlockStreamIterator<R> { | ||
/// Creates a new [`BlockStreamIterator`]. | ||
pub fn new(reader: R, file_marker: [u8; 16]) -> Self { | ||
Self { | ||
reader, | ||
file_marker, | ||
buf: (vec![], 0), | ||
} | ||
} | ||
|
||
/// The buffer of [`BlockStreamIterator`]. | ||
pub fn buffer(&mut self) -> &mut Vec<u8> { | ||
&mut self.buf.0 | ||
} | ||
|
||
/// Deconstructs itself | ||
pub fn into_inner(self) -> (R, Vec<u8>) { | ||
(self.reader, self.buf.0) | ||
} | ||
} | ||
|
||
impl<R: Read> FallibleStreamingIterator for BlockStreamIterator<R> { | ||
type Error = ArrowError; | ||
type Item = (Vec<u8>, usize); | ||
|
||
fn advance(&mut self) -> Result<()> { | ||
let (buf, rows) = &mut self.buf; | ||
*rows = read_block(&mut self.reader, buf, self.file_marker)?; | ||
Ok(()) | ||
} | ||
|
||
fn get(&self) -> Option<&Self::Item> { | ||
if self.buf.1 > 0 { | ||
Some(&self.buf) | ||
} else { | ||
None | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
//! APIs to read from Avro format to arrow. | ||
use std::io::Read; | ||
|
||
use fallible_streaming_iterator::FallibleStreamingIterator; | ||
|
||
use crate::error::{ArrowError, Result}; | ||
|
||
use super::BlockStreamIterator; | ||
use super::Compression; | ||
|
||
/// Decompresses an avro block. | ||
/// Returns whether the buffers where swapped. | ||
fn decompress_block( | ||
block: &mut Vec<u8>, | ||
decompress: &mut Vec<u8>, | ||
compression: Option<Compression>, | ||
) -> Result<bool> { | ||
match compression { | ||
None => { | ||
std::mem::swap(block, decompress); | ||
Ok(true) | ||
} | ||
#[cfg(feature = "io_avro_compression")] | ||
Some(Compression::Deflate) => { | ||
decompress.clear(); | ||
let mut decoder = libflate::deflate::Decoder::new(&block[..]); | ||
decoder.read_to_end(decompress)?; | ||
Ok(false) | ||
} | ||
#[cfg(feature = "io_avro_compression")] | ||
Some(Compression::Snappy) => { | ||
let len = snap::raw::decompress_len(&block[..block.len() - 4]) | ||
.map_err(|_| ArrowError::Other("Failed to decompress snap".to_string()))?; | ||
decompress.clear(); | ||
decompress.resize(len, 0); | ||
snap::raw::Decoder::new() | ||
.decompress(&block[..block.len() - 4], decompress) | ||
.map_err(|_| ArrowError::Other("Failed to decompress snap".to_string()))?; | ||
Ok(false) | ||
} | ||
#[cfg(not(feature = "io_avro_compression"))] | ||
Some(Compression::Deflate) => Err(ArrowError::Other( | ||
"The avro file is deflate-encoded but feature 'io_avro_compression' is not active." | ||
.to_string(), | ||
)), | ||
#[cfg(not(feature = "io_avro_compression"))] | ||
Some(Compression::Snappy) => Err(ArrowError::Other( | ||
"The avro file is snappy-encoded but feature 'io_avro_compression' is not active." | ||
.to_string(), | ||
)), | ||
} | ||
} | ||
|
||
/// [`FallibleStreamingIterator`] of decompressed Avro blocks | ||
pub struct Decompressor<R: Read> { | ||
blocks: BlockStreamIterator<R>, | ||
codec: Option<Compression>, | ||
buf: (Vec<u8>, usize), | ||
was_swapped: bool, | ||
} | ||
|
||
impl<R: Read> Decompressor<R> { | ||
/// Creates a new [`Decompressor`]. | ||
pub fn new(blocks: BlockStreamIterator<R>, codec: Option<Compression>) -> Self { | ||
Self { | ||
blocks, | ||
codec, | ||
buf: (vec![], 0), | ||
was_swapped: false, | ||
} | ||
} | ||
|
||
/// Deconstructs itself into its internal reader | ||
pub fn into_inner(self) -> R { | ||
self.blocks.into_inner().0 | ||
} | ||
} | ||
|
||
impl<'a, R: Read> FallibleStreamingIterator for Decompressor<R> { | ||
type Error = ArrowError; | ||
type Item = (Vec<u8>, usize); | ||
|
||
fn advance(&mut self) -> Result<()> { | ||
if self.was_swapped { | ||
std::mem::swap(self.blocks.buffer(), &mut self.buf.0); | ||
} | ||
self.blocks.advance()?; | ||
self.was_swapped = decompress_block(self.blocks.buffer(), &mut self.buf.0, self.codec)?; | ||
self.buf.1 = self.blocks.get().map(|(_, rows)| *rows).unwrap_or_default(); | ||
Ok(()) | ||
} | ||
|
||
fn get(&self) -> Option<&Self::Item> { | ||
if self.buf.1 > 0 { | ||
Some(&self.buf) | ||
} else { | ||
None | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
use std::collections::HashMap; | ||
|
||
use avro_rs::{Error, Schema}; | ||
use serde_json; | ||
|
||
use crate::error::Result; | ||
|
||
use super::Compression; | ||
|
||
/// Deserializes the Avro header into an Avro [`Schema`] and optional [`Compression`]. | ||
pub(crate) fn deserialize_header( | ||
header: HashMap<String, Vec<u8>>, | ||
) -> Result<(Schema, Option<Compression>)> { | ||
let json = header | ||
.get("avro.schema") | ||
.and_then(|bytes| serde_json::from_slice(bytes.as_ref()).ok()) | ||
.ok_or(Error::GetAvroSchemaFromMap)?; | ||
let schema = Schema::parse(&json)?; | ||
|
||
let compression = header.get("avro.codec").and_then(|bytes| { | ||
let bytes: &[u8] = bytes.as_ref(); | ||
match bytes { | ||
b"snappy" => Some(Compression::Snappy), | ||
b"deflate" => Some(Compression::Deflate), | ||
_ => None, | ||
} | ||
}); | ||
Ok((schema, compression)) | ||
} |
Oops, something went wrong.