Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fixed error in serializing json
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Feb 5, 2022
1 parent 362ebf9 commit 9496bd3
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 16 deletions.
1 change: 1 addition & 0 deletions src/io/json/write/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! APIs to write to JSON
mod format;
mod serialize;
mod utf8;
pub use fallible_streaming_iterator::*;
pub use format::*;
pub use serialize::serialize;
Expand Down
16 changes: 1 addition & 15 deletions src/io/json/write/serialize.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use lexical_core::ToLexical;
use serde_json::Value;
use streaming_iterator::StreamingIterator;

use crate::bitmap::utils::zip_validity;
Expand All @@ -8,6 +7,7 @@ use crate::io::iterator::BufStreamingIterator;
use crate::util::lexical_to_bytes_mut;
use crate::{array::*, datatypes::DataType, types::NativeType};

use super::utf8::utf8_serialize;
use super::{JsonArray, JsonFormat};

fn boolean_serializer<'a>(
Expand Down Expand Up @@ -137,20 +137,6 @@ fn list_serializer<'a, O: Offset>(
))
}

/// Appends `value` to `buf` as a double-quoted JSON string.
///
/// `"` and `\` are written as `\"` and `\\`; the control characters
/// `0x08`, `0x09`, `0x0A`, `0x0C` and `0x0D` use their short escapes
/// (`\b`, `\t`, `\n`, `\f`, `\r`); every other byte below `0x20` is written
/// as `\u00XX` with lowercase hex digits. All remaining bytes, including
/// multi-byte UTF-8 sequences, are copied through unchanged.
///
/// Every byte that needs escaping is ASCII, so an "is ASCII" check cannot be
/// used to skip escaping: this function always scans the whole string.
#[inline]
fn utf8_serialize(value: &str, buf: &mut Vec<u8>) {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let bytes = value.as_bytes();
    // Lower bound only: each escaped byte grows the output further.
    buf.reserve(bytes.len() + 2);
    buf.push(b'"');

    // Index of the first byte that has not been copied to `buf` yet.
    let mut start = 0;
    for (i, &byte) in bytes.iter().enumerate() {
        // `Some(c)` => two-byte escape `\c`; `None` => six-byte `\u00XX` escape.
        let short_escape = match byte {
            b'"' => Some(b'"'),
            b'\\' => Some(b'\\'),
            0x08 => Some(b'b'),
            0x09 => Some(b't'),
            0x0A => Some(b'n'),
            0x0C => Some(b'f'),
            0x0D => Some(b'r'),
            0x00..=0x1F => None,
            _ => continue,
        };

        // Flush the run of unescaped bytes that precedes this one.
        buf.extend_from_slice(&bytes[start..i]);
        match short_escape {
            Some(letter) => buf.extend_from_slice(&[b'\\', letter]),
            None => buf.extend_from_slice(&[
                b'\\',
                b'u',
                b'0',
                b'0',
                HEX_DIGITS[usize::from(byte >> 4)],
                HEX_DIGITS[usize::from(byte & 0xF)],
            ]),
        }
        start = i + 1;
    }

    buf.extend_from_slice(&bytes[start..]);
    buf.push(b'"');
}

fn new_serializer<'a>(
array: &'a dyn Array,
) -> Box<dyn StreamingIterator<Item = [u8]> + 'a + Send + Sync> {
Expand Down
107 changes: 107 additions & 0 deletions src/io/json/write/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
//! Contains a partial copy of code from serde_json to serialize UTF-8 strings as JSON.
use serde_json::ser::CharEscape;

fn write_escape(buf: &mut Vec<u8>, char_escape: serde_json::ser::CharEscape) {
use serde_json::ser::CharEscape::*;

let s = match char_escape {
Quote => b"\\\"",
ReverseSolidus => b"\\\\",
Solidus => b"\\/",
Backspace => b"\\b",
FormFeed => b"\\f",
LineFeed => b"\\n",
CarriageReturn => b"\\r",
Tab => b"\\t",
AsciiControl(byte) => {
static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
let bytes = &[
b'\\',
b'u',
b'0',
b'0',
HEX_DIGITS[(byte >> 4) as usize],
HEX_DIGITS[(byte & 0xF) as usize],
];
return buf.extend_from_slice(bytes);
}
};
buf.extend_from_slice(s)
}

#[inline]
fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
match escape {
self::BB => CharEscape::Backspace,
self::TT => CharEscape::Tab,
self::NN => CharEscape::LineFeed,
self::FF => CharEscape::FormFeed,
self::RR => CharEscape::CarriageReturn,
self::QU => CharEscape::Quote,
self::BS => CharEscape::ReverseSolidus,
self::UU => CharEscape::AsciiControl(byte),
_ => unreachable!(),
}
}

// Markers stored in the `ESCAPE` table. Each non-zero marker is the character
// that follows the backslash in the escape sequence; the trailing comment on
// each line gives the input byte(s) that map to it.
const BB: u8 = b'b'; // \x08
const TT: u8 = b't'; // \x09
const NN: u8 = b'n'; // \x0A
const FF: u8 = b'f'; // \x0C
const RR: u8 = b'r'; // \x0D
const QU: u8 = b'"'; // \x22
const BS: u8 = b'\\'; // \x5C
const UU: u8 = b'u'; // \x00...\x1F except the ones above
// Marker for bytes that are written as-is (no escaping).
const __: u8 = 0;

// Lookup table of escape sequences, indexed by byte value. A value of b'x' at
// index i means that byte i is escaped as "\x" in JSON. A value of 0 means
// that byte i is not escaped.
//
// Each row covers 16 consecutive byte values: the trailing comment on a row is
// the high nibble of the byte and the column header below is the low nibble
// (the first column, 0, is left unlabelled).
static ESCAPE: [u8; 256] = [
//  1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
__, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
];

/// Appends `value` to `buf` as a double-quoted JSON string.
///
/// Bytes whose `ESCAPE` entry is non-zero are replaced by their escape
/// sequence; all other bytes are copied through unchanged, in runs.
#[inline]
pub fn utf8_serialize(value: &str, buf: &mut Vec<u8>) {
    let raw = value.as_bytes();
    buf.reserve(value.len() + 2);
    buf.push(b'"');

    // Positions (with their byte and table marker) that require escaping.
    let to_escape = raw
        .iter()
        .enumerate()
        .filter_map(|(pos, &b)| match ESCAPE[b as usize] {
            0 => None,
            marker => Some((pos, b, marker)),
        });

    // Index of the first byte not yet copied into `buf`.
    let mut pending = 0;
    for (pos, b, marker) in to_escape {
        // Flush the unescaped run before this byte (possibly empty).
        buf.extend_from_slice(&raw[pending..pos]);
        write_escape(buf, from_escape_table(marker, b));
        pending = pos + 1;
    }

    // Trailing unescaped run (possibly empty), then the closing quote.
    buf.extend_from_slice(&raw[pending..]);
    buf.push(b'"');
}
21 changes: 20 additions & 1 deletion tests/it/io/json/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,26 @@ fn write_escaped_utf8() -> Result<()> {

assert_eq!(
String::from_utf8(buf).unwrap().as_bytes(),
b"{\"c1\":\"a\na\"}\n{\"c1\":null}\n"
b"{\"c1\":\"a\\na\"}\n{\"c1\":null}\n"
);
Ok(())
}

/// A quotation mark inside a UTF-8 value must be written as `\"` so that the
/// line-delimited output stays valid JSON; nulls are written as `null`.
#[test]
fn write_quotation_marks_in_utf8() -> Result<()> {
    let values = vec![Some("a\"a"), None];
    let column = Utf8Array::<i32>::from(&values);

    let columns = vec![&column as &dyn Array];
    let chunk = Chunk::try_new(columns).unwrap();

    let names = vec!["c1".to_string()];
    let written = write_batch(chunk, names, json_write::LineDelimited::default())?;

    let text = String::from_utf8(written).unwrap();
    assert_eq!(
        text.as_bytes(),
        b"{\"c1\":\"a\\\"a\"}\n{\"c1\":null}\n"
    );
    Ok(())
}

0 comments on commit 9496bd3

Please sign in to comment.