This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactored JSON writing (5-10x) (#709)
- Loading branch information
1 parent
f33a41f
commit f07cc2c
Showing
15 changed files
with
526 additions
and
692 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
use std::sync::Arc; | ||
|
||
use criterion::{criterion_group, criterion_main, Criterion}; | ||
|
||
use arrow2::array::*; | ||
use arrow2::error::Result; | ||
use arrow2::io::json::write; | ||
use arrow2::record_batch::RecordBatch; | ||
use arrow2::util::bench_util::*; | ||
|
||
fn write_batch(batch: &RecordBatch) -> Result<()> { | ||
let mut writer = vec![]; | ||
let format = write::JsonArray::default(); | ||
|
||
let batches = vec![Ok(batch.clone())].into_iter(); | ||
|
||
// Advancing this iterator serializes the next batch to its internal buffer (i.e. CPU-bounded) | ||
let blocks = write::Serializer::new(batches, vec![], format); | ||
|
||
// the operation of writing is IO-bounded. | ||
write::write(&mut writer, format, blocks)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
fn make_batch(array: impl Array + 'static) -> RecordBatch { | ||
RecordBatch::try_from_iter([("a", Arc::new(array) as Arc<dyn Array>)]).unwrap() | ||
} | ||
|
||
fn add_benchmark(c: &mut Criterion) { | ||
(10..=18).step_by(2).for_each(|log2_size| { | ||
let size = 2usize.pow(log2_size); | ||
|
||
let array = create_primitive_array::<i32>(size, 0.1); | ||
let batch = make_batch(array); | ||
|
||
c.bench_function(&format!("json write i32 2^{}", log2_size), |b| { | ||
b.iter(|| write_batch(&batch)) | ||
}); | ||
|
||
let array = create_string_array::<i32>(size, 100, 0.1, 42); | ||
let batch = make_batch(array); | ||
|
||
c.bench_function(&format!("json write utf8 2^{}", log2_size), |b| { | ||
b.iter(|| write_batch(&batch)) | ||
}); | ||
|
||
let array = create_primitive_array::<f64>(size, 0.1); | ||
let batch = make_batch(array); | ||
|
||
c.bench_function(&format!("json write f64 2^{}", log2_size), |b| { | ||
b.iter(|| write_batch(&batch)) | ||
}); | ||
}); | ||
} | ||
|
||
criterion_group!(benches, add_benchmark); | ||
criterion_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
use std::fs::File; | ||
use std::sync::Arc; | ||
|
||
use arrow2::{ | ||
array::Int32Array, | ||
datatypes::{Field, Schema}, | ||
error::Result, | ||
io::json::write, | ||
record_batch::RecordBatch, | ||
}; | ||
|
||
fn write_batches(path: &str, batches: &[RecordBatch]) -> Result<()> { | ||
let mut writer = File::create(path)?; | ||
let format = write::JsonArray::default(); | ||
|
||
let batches = batches.iter().cloned().map(Ok); | ||
|
||
// Advancing this iterator serializes the next batch to its internal buffer (i.e. CPU-bounded) | ||
let blocks = write::Serializer::new(batches, vec![], format); | ||
|
||
// the operation of writing is IO-bounded. | ||
write::write(&mut writer, format, blocks)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
fn main() -> Result<()> { | ||
let array = Int32Array::from(&[Some(0), None, Some(2), Some(3), Some(4), Some(5), Some(6)]); | ||
let field = Field::new("c1", array.data_type().clone(), true); | ||
let schema = Schema::new(vec![field]); | ||
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?; | ||
|
||
write_batches("example.json", &[batch.clone(), batch]) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Write JSON | ||
|
||
When compiled with feature `io_json`, you can use this crate to write JSON files. | ||
The following example writes two record batches to a JSON file: | ||
|
||
```rust | ||
{{#include ../../../examples/json_write.rs}} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
use std::{fmt::Debug, io::Write}; | ||
|
||
use crate::error::Result; | ||
|
||
/// Trait defining how to format a sequence of JSON objects to a byte stream. | ||
pub trait JsonFormat: Debug + Default + Copy { | ||
#[inline] | ||
/// write any bytes needed at the start of the file to the writer | ||
fn start_stream<W: Write>(&self, _writer: &mut W) -> Result<()> { | ||
Ok(()) | ||
} | ||
|
||
#[inline] | ||
/// write any bytes needed for the start of each row | ||
fn start_row<W: Write>(&self, _writer: &mut W, _is_first_row: bool) -> Result<()> { | ||
Ok(()) | ||
} | ||
|
||
#[inline] | ||
/// write any bytes needed for the end of each row | ||
fn end_row<W: Write>(&self, _writer: &mut W) -> Result<()> { | ||
Ok(()) | ||
} | ||
|
||
/// write any bytes needed for the start of each row | ||
fn end_stream<W: Write>(&self, _writer: &mut W) -> Result<()> { | ||
Ok(()) | ||
} | ||
} | ||
|
||
/// Produces JSON output with one record per line. For example | ||
/// | ||
/// ```json | ||
/// {"foo":1} | ||
/// {"bar":1} | ||
/// | ||
/// ``` | ||
#[derive(Debug, Default, Clone, Copy)] | ||
pub struct LineDelimited {} | ||
|
||
impl JsonFormat for LineDelimited { | ||
#[inline] | ||
fn end_row<W: Write>(&self, writer: &mut W) -> Result<()> { | ||
writer.write_all(b"\n")?; | ||
Ok(()) | ||
} | ||
} | ||
|
||
/// Produces JSON output as a single JSON array. For example | ||
/// | ||
/// ```json | ||
/// [{"foo":1},{"bar":1}] | ||
/// ``` | ||
#[derive(Debug, Default, Clone, Copy)] | ||
pub struct JsonArray {} | ||
|
||
impl JsonFormat for JsonArray { | ||
#[inline] | ||
fn start_stream<W: Write>(&self, writer: &mut W) -> Result<()> { | ||
writer.write_all(b"[")?; | ||
Ok(()) | ||
} | ||
|
||
#[inline] | ||
fn start_row<W: Write>(&self, writer: &mut W, is_first_row: bool) -> Result<()> { | ||
if !is_first_row { | ||
writer.write_all(b",")?; | ||
} | ||
Ok(()) | ||
} | ||
|
||
#[inline] | ||
fn end_stream<W: Write>(&self, writer: &mut W) -> Result<()> { | ||
writer.write_all(b"]")?; | ||
Ok(()) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,84 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! APIs to write to JSON | ||
mod format; | ||
mod serialize; | ||
mod writer; | ||
pub use serialize::write_record_batches; | ||
pub use writer::*; | ||
pub use fallible_streaming_iterator::*; | ||
pub use format::*; | ||
pub use serialize::serialize; | ||
|
||
use crate::{ | ||
error::{ArrowError, Result}, | ||
record_batch::RecordBatch, | ||
}; | ||
|
||
/// Writes blocks of JSON-encoded data into `writer`, ensuring that the written | ||
/// JSON has the expected `format` | ||
pub fn write<W, F, I>(writer: &mut W, format: F, mut blocks: I) -> Result<()> | ||
where | ||
W: std::io::Write, | ||
F: JsonFormat, | ||
I: FallibleStreamingIterator<Item = [u8], Error = ArrowError>, | ||
{ | ||
format.start_stream(writer)?; | ||
let mut is_first_row = true; | ||
while let Some(block) = blocks.next()? { | ||
format.start_row(writer, is_first_row)?; | ||
is_first_row = false; | ||
writer.write_all(block)?; | ||
} | ||
format.end_stream(writer)?; | ||
Ok(()) | ||
} | ||
|
||
/// [`FallibleStreamingIterator`] that serializes a [`RecordBatch`] to bytes. | ||
/// Advancing it is CPU-bounded | ||
pub struct Serializer<F: JsonFormat, I: Iterator<Item = Result<RecordBatch>>> { | ||
iter: I, | ||
buffer: Vec<u8>, | ||
format: F, | ||
} | ||
|
||
impl<F: JsonFormat, I: Iterator<Item = Result<RecordBatch>>> Serializer<F, I> { | ||
/// Creates a new [`Serializer`]. | ||
pub fn new(iter: I, buffer: Vec<u8>, format: F) -> Self { | ||
Self { | ||
iter, | ||
buffer, | ||
format, | ||
} | ||
} | ||
} | ||
|
||
impl<F: JsonFormat, I: Iterator<Item = Result<RecordBatch>>> FallibleStreamingIterator | ||
for Serializer<F, I> | ||
{ | ||
type Item = [u8]; | ||
|
||
type Error = ArrowError; | ||
|
||
fn advance(&mut self) -> Result<()> { | ||
self.buffer.clear(); | ||
self.iter | ||
.next() | ||
.map(|maybe_batch| { | ||
maybe_batch.map(|batch| { | ||
let names = batch | ||
.schema() | ||
.fields() | ||
.iter() | ||
.map(|f| f.name().as_str()) | ||
.collect::<Vec<_>>(); | ||
serialize(&names, batch.columns(), self.format, &mut self.buffer) | ||
}) | ||
}) | ||
.transpose()?; | ||
Ok(()) | ||
} | ||
|
||
fn get(&self) -> Option<&Self::Item> { | ||
if !self.buffer.is_empty() { | ||
Some(&self.buffer) | ||
} else { | ||
None | ||
} | ||
} | ||
} |
Oops, something went wrong.