This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added support to write to Avro (#690)
- Loading branch information
1 parent
1ec0827
commit 45e72e9
Showing
23 changed files
with
697 additions
and
64 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
use std::fs::File; | ||
|
||
use arrow2::{ | ||
array::{Array, Int32Array}, | ||
datatypes::{Field, Schema}, | ||
error::Result, | ||
io::avro::write, | ||
}; | ||
|
||
fn write_avro<W: std::io::Write>( | ||
file: &mut W, | ||
arrays: &[&dyn Array], | ||
schema: &Schema, | ||
compression: Option<write::Compression>, | ||
) -> Result<()> { | ||
let avro_fields = write::to_avro_schema(schema)?; | ||
|
||
let mut serializers = arrays | ||
.iter() | ||
.zip(avro_fields.iter()) | ||
.map(|(array, field)| write::new_serializer(*array, &field.schema)) | ||
.collect::<Vec<_>>(); | ||
let mut block = write::Block::new(arrays[0].len(), vec![]); | ||
|
||
write::serialize(&mut serializers, &mut block)?; | ||
|
||
let mut compressed_block = write::CompressedBlock::default(); | ||
|
||
if let Some(compression) = compression { | ||
write::compress(&block, &mut compressed_block, compression)?; | ||
} else { | ||
compressed_block.number_of_rows = block.number_of_rows; | ||
std::mem::swap(&mut compressed_block.data, &mut block.data); | ||
} | ||
|
||
write::write_metadata(file, avro_fields.clone(), compression)?; | ||
|
||
write::write_block(file, &compressed_block)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
fn main() -> Result<()> { | ||
use std::env; | ||
let args: Vec<String> = env::args().collect(); | ||
|
||
let path = &args[1]; | ||
|
||
let array = Int32Array::from(&[ | ||
Some(0), | ||
Some(1), | ||
Some(2), | ||
Some(3), | ||
Some(4), | ||
Some(5), | ||
Some(6), | ||
]); | ||
let field = Field::new("c1", array.data_type().clone(), true); | ||
let schema = Schema::new(vec![field]); | ||
|
||
let mut file = File::create(path)?; | ||
write_avro(&mut file, &[(&array) as &dyn Array], &schema, None)?; | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Avro write | ||
|
||
You can use this crate to write to Apache Avro. | ||
Below is an example, which you can run when this crate is compiled with feature `io_avro`. | ||
|
||
```rust | ||
{{#include ../../../examples/avro_write.rs}} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
use std::io::Write; | ||
|
||
use crate::{error::Result, io::avro::Compression}; | ||
|
||
use super::{util::zigzag_encode, SYNC_NUMBER}; | ||
|
||
/// A compressed Avro block. | ||
#[derive(Debug, Clone, Default, PartialEq)] | ||
pub struct CompressedBlock { | ||
/// The number of rows | ||
pub number_of_rows: usize, | ||
/// The compressed data | ||
pub data: Vec<u8>, | ||
} | ||
|
||
impl CompressedBlock { | ||
/// Creates a new CompressedBlock | ||
pub fn new(number_of_rows: usize, data: Vec<u8>) -> Self { | ||
Self { | ||
number_of_rows, | ||
data, | ||
} | ||
} | ||
} | ||
|
||
/// An uncompressed Avro block. | ||
#[derive(Debug, Clone, Default, PartialEq)] | ||
pub struct Block { | ||
/// The number of rows | ||
pub number_of_rows: usize, | ||
/// The uncompressed data | ||
pub data: Vec<u8>, | ||
} | ||
|
||
impl Block { | ||
/// Creates a new Block | ||
pub fn new(number_of_rows: usize, data: Vec<u8>) -> Self { | ||
Self { | ||
number_of_rows, | ||
data, | ||
} | ||
} | ||
} | ||
|
||
/// Writes a [`CompressedBlock`] to `writer` | ||
pub fn write_block<W: Write>(writer: &mut W, compressed_block: &CompressedBlock) -> Result<()> { | ||
// write size and rows | ||
zigzag_encode(compressed_block.number_of_rows as i64, writer)?; | ||
zigzag_encode(compressed_block.data.len() as i64, writer)?; | ||
|
||
writer.write_all(&compressed_block.data)?; | ||
|
||
writer.write_all(&SYNC_NUMBER)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
/// Compresses an [`Block`] to a [`CompressedBlock`]. | ||
pub fn compress( | ||
block: &Block, | ||
compressed_block: &mut CompressedBlock, | ||
compression: Compression, | ||
) -> Result<()> { | ||
compressed_block.number_of_rows = block.number_of_rows; | ||
match compression { | ||
Compression::Deflate => todo!(), | ||
Compression::Snappy => todo!(), | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
use std::collections::HashMap; | ||
|
||
use avro_schema::Schema; | ||
use serde_json; | ||
|
||
use crate::error::{ArrowError, Result}; | ||
|
||
use super::Compression; | ||
|
||
/// Serializes an [`Schema`] and optional [`Compression`] into an avro header. | ||
pub(crate) fn serialize_header( | ||
schema: &Schema, | ||
compression: Option<Compression>, | ||
) -> Result<HashMap<String, Vec<u8>>> { | ||
let schema = | ||
serde_json::to_string(schema).map_err(|e| ArrowError::ExternalFormat(e.to_string()))?; | ||
|
||
let mut header = HashMap::<String, Vec<u8>>::default(); | ||
|
||
header.insert("avro.schema".to_string(), schema.into_bytes()); | ||
if let Some(compression) = compression { | ||
let value = match compression { | ||
Compression::Snappy => b"snappy".to_vec(), | ||
Compression::Deflate => b"deflate".to_vec(), | ||
}; | ||
header.insert("avro.codec".to_string(), value); | ||
}; | ||
|
||
Ok(header) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
//! APIs to write to Avro format. | ||
use std::io::Write; | ||
|
||
use avro_schema::{Field as AvroField, Record, Schema as AvroSchema}; | ||
|
||
use crate::error::Result; | ||
|
||
pub use super::Compression; | ||
|
||
mod header; | ||
use header::serialize_header; | ||
mod schema; | ||
pub use schema::to_avro_schema; | ||
mod serialize; | ||
pub use serialize::{can_serialize, new_serializer, BoxSerializer}; | ||
mod block; | ||
pub use block::*; | ||
mod util; | ||
|
||
const SYNC_NUMBER: [u8; 16] = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]; | ||
|
||
/// Writes Avro's metadata to `writer`. | ||
pub fn write_metadata<W: std::io::Write>( | ||
writer: &mut W, | ||
fields: Vec<AvroField>, | ||
compression: Option<Compression>, | ||
) -> Result<()> { | ||
// * Four bytes, ASCII 'O', 'b', 'j', followed by 1. | ||
let avro_magic = [b'O', b'b', b'j', 1u8]; | ||
writer.write_all(&avro_magic)?; | ||
|
||
// * file metadata, including the schema. | ||
let schema = AvroSchema::Record(Record::new("", fields)); | ||
let header = serialize_header(&schema, compression)?; | ||
|
||
util::zigzag_encode(header.len() as i64, writer)?; | ||
for (name, item) in header { | ||
util::write_binary(name.as_bytes(), writer)?; | ||
util::write_binary(&item, writer)?; | ||
} | ||
writer.write_all(&[0])?; | ||
|
||
// The 16-byte, randomly-generated sync marker for this file. | ||
writer.write_all(&SYNC_NUMBER)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
/// consumes a set of [`BoxSerializer`] into an [`Block`]. | ||
/// # Panics | ||
/// Panics iff the number of items in any of the serializers is not equal to the number of rows | ||
/// declared in the `block`. | ||
pub fn serialize<'a>(serializers: &mut [BoxSerializer<'a>], block: &mut Block) -> Result<()> { | ||
let Block { | ||
data, | ||
number_of_rows, | ||
} = block; | ||
|
||
data.clear(); // restart it | ||
|
||
// _the_ transpose (columns -> rows) | ||
for _ in 0..*number_of_rows { | ||
for serializer in &mut *serializers { | ||
let item_data = serializer.next().unwrap(); | ||
data.write_all(item_data)?; | ||
} | ||
} | ||
Ok(()) | ||
} |
Oops, something went wrong.