Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Fixed error in serializing json utf8 with special characters #813

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/io/json/write/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! APIs to write to JSON
mod format;
mod serialize;
mod utf8;
pub use fallible_streaming_iterator::*;
pub use format::*;
pub use serialize::serialize;
Expand Down
16 changes: 1 addition & 15 deletions src/io/json/write/serialize.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use lexical_core::ToLexical;
use serde_json::Value;
use streaming_iterator::StreamingIterator;

use crate::bitmap::utils::zip_validity;
Expand All @@ -8,6 +7,7 @@ use crate::io::iterator::BufStreamingIterator;
use crate::util::lexical_to_bytes_mut;
use crate::{array::*, datatypes::DataType, types::NativeType};

use super::utf8::utf8_serialize;
use super::{JsonArray, JsonFormat};

fn boolean_serializer<'a>(
Expand Down Expand Up @@ -137,20 +137,6 @@ fn list_serializer<'a, O: Offset>(
))
}

/// Appends `value` to `buf` as a JSON string literal, including the
/// surrounding double quotes.
///
/// JSON requires `"`, `\` and the control characters below `0x20` to be
/// escaped. Those bytes are all ASCII, so "the string is ASCII" is not a
/// sufficient condition for copying it verbatim: the fast path is taken only
/// when none of those bytes occur. Every other byte (including multi-byte
/// UTF-8 sequences) is valid inside a JSON string as-is.
#[inline]
fn utf8_serialize(value: &str, buf: &mut Vec<u8>) {
    let needs_escape = value
        .bytes()
        .any(|byte| byte < 0x20 || byte == b'"' || byte == b'\\');

    if needs_escape {
        // The string contains reserved characters: let serde_json escape them.
        // todo: avoid this roundtrip over serde_json
        serde_json::to_writer(buf, &Value::String(value.to_string()))
            .expect("serializing a string into a Vec<u8> cannot fail");
    } else {
        // Nothing to escape: copy the bytes between two quotes.
        buf.reserve(value.len() + 2);
        buf.push(b'"');
        buf.extend_from_slice(value.as_bytes());
        buf.push(b'"');
    }
}

fn new_serializer<'a>(
array: &'a dyn Array,
) -> Box<dyn StreamingIterator<Item = [u8]> + 'a + Send + Sync> {
Expand Down
107 changes: 107 additions & 0 deletions src/io/json/write/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
//! Contains a partial copy of code from `serde_json`, used to serialize UTF-8 strings as JSON.
use serde_json::ser::CharEscape;

fn write_escape(buf: &mut Vec<u8>, char_escape: serde_json::ser::CharEscape) {
use serde_json::ser::CharEscape::*;

let s = match char_escape {
Quote => b"\\\"",
ReverseSolidus => b"\\\\",
Solidus => b"\\/",
Backspace => b"\\b",
FormFeed => b"\\f",
LineFeed => b"\\n",
CarriageReturn => b"\\r",
Tab => b"\\t",
AsciiControl(byte) => {
static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
let bytes = &[
b'\\',
b'u',
b'0',
b'0',
HEX_DIGITS[(byte >> 4) as usize],
HEX_DIGITS[(byte & 0xF) as usize],
];
return buf.extend_from_slice(bytes);
}
};
buf.extend_from_slice(s)
}

#[inline]
fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
match escape {
self::BB => CharEscape::Backspace,
self::TT => CharEscape::Tab,
self::NN => CharEscape::LineFeed,
self::FF => CharEscape::FormFeed,
self::RR => CharEscape::CarriageReturn,
self::QU => CharEscape::Quote,
self::BS => CharEscape::ReverseSolidus,
self::UU => CharEscape::AsciiControl(byte),
_ => unreachable!(),
}
}

// Marker values stored in the `ESCAPE` table below. Each marker is the
// character that follows the backslash in the escape sequence; the trailing
// comment gives the input byte(s) the marker is used for.
const BB: u8 = b'b'; // \x08
const TT: u8 = b't'; // \x09
const NN: u8 = b'n'; // \x0A
const FF: u8 = b'f'; // \x0C
const RR: u8 = b'r'; // \x0D
const QU: u8 = b'"'; // \x22
const BS: u8 = b'\\'; // \x5C
const UU: u8 = b'u'; // \x00...\x1F except the ones above
const __: u8 = 0; // byte is not escaped

// Lookup table of escape sequences. A value of b'x' at index i means that byte
// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
//
// The table is laid out as 16 rows of 16 entries: the comment at the end of
// each row is the high nibble of the byte, and the header comment below gives
// the low nibble (column 0 sits under the `//` itself).
static ESCAPE: [u8; 256] = [
//  1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
__, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
];

/// Appends `value` to `buf` as a JSON string literal, including the
/// surrounding double quotes.
///
/// Bytes whose `ESCAPE` entry is non-zero are replaced by their escape
/// sequence; all other bytes (including every byte `>= 0x80`) are copied
/// unchanged, in runs.
#[inline]
pub fn utf8_serialize(value: &str, buf: &mut Vec<u8>) {
    let bytes = value.as_bytes();
    buf.reserve(bytes.len() + 2);
    buf.push(b'"');

    // `pending` is the part of the input that has not been written yet.
    let mut pending = bytes;
    while let Some(index) = pending
        .iter()
        .position(|&candidate| ESCAPE[usize::from(candidate)] != 0)
    {
        // Copy the run of plain bytes that precedes the byte to escape.
        let (plain, tail) = pending.split_at(index);
        buf.extend_from_slice(plain);

        let byte = tail[0];
        let marker = ESCAPE[usize::from(byte)];
        write_escape(buf, from_escape_table(marker, byte));

        pending = &tail[1..];
    }

    // Whatever is left contains nothing that needs escaping.
    buf.extend_from_slice(pending);
    buf.push(b'"');
}
23 changes: 22 additions & 1 deletion tests/it/io/json/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,28 @@ fn write_escaped_utf8() -> Result<()> {

assert_eq!(
String::from_utf8(buf).unwrap().as_bytes(),
b"{\"c1\":\"a\na\"}\n{\"c1\":null}\n"
b"{\"c1\":\"a\\na\"}\n{\"c1\":null}\n"
);
serde_json::from_slice::<serde_json::Value>(b"{\"c1\":\"a\\na\"}").unwrap();
Ok(())
}

/// A double quote inside a UTF-8 value must be written as `\"`, and the
/// resulting line must be parseable by `serde_json`.
#[test]
fn write_quotation_marks_in_utf8() -> Result<()> {
    let values = vec![Some("a\"a"), None];
    let array = Utf8Array::<i32>::from(&values);

    let columns = vec![&array as &dyn Array];
    let chunk = Chunk::try_new(columns).unwrap();
    let names = vec!["c1".to_string()];

    let written = write_batch(chunk, names, json_write::LineDelimited::default())?;
    let written = String::from_utf8(written).unwrap();

    assert_eq!(
        written.as_bytes(),
        b"{\"c1\":\"a\\\"a\"}\n{\"c1\":null}\n"
    );

    // The first line on its own has to be valid JSON.
    serde_json::from_slice::<serde_json::Value>(b"{\"c1\":\"a\\\"a\"}").unwrap();
    Ok(())
}