diff --git a/src/io/json/write/mod.rs b/src/io/json/write/mod.rs index fcf1c435148..dc2c69944ad 100644 --- a/src/io/json/write/mod.rs +++ b/src/io/json/write/mod.rs @@ -1,6 +1,7 @@ //! APIs to write to JSON mod format; mod serialize; +mod utf8; pub use fallible_streaming_iterator::*; pub use format::*; pub use serialize::serialize; diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index 28bf0bd7563..d67ef2b36de 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -1,5 +1,4 @@ use lexical_core::ToLexical; -use serde_json::Value; use streaming_iterator::StreamingIterator; use crate::bitmap::utils::zip_validity; @@ -8,6 +7,7 @@ use crate::io::iterator::BufStreamingIterator; use crate::util::lexical_to_bytes_mut; use crate::{array::*, datatypes::DataType, types::NativeType}; +use super::utf8::utf8_serialize; use super::{JsonArray, JsonFormat}; fn boolean_serializer<'a>( @@ -137,20 +137,6 @@ fn list_serializer<'a, O: Offset>( )) } -#[inline] -fn utf8_serialize(value: &str, buf: &mut Vec) { - if value.as_bytes().is_ascii() { - buf.reserve(value.len() + 2); - buf.push(b'"'); - buf.extend_from_slice(value.as_bytes()); - buf.push(b'"'); - } else { - // it may contain reserved keywords: perform roundtrip for - // todo: avoid this roundtrip over serde_json - serde_json::to_writer(buf, &Value::String(value.to_string())).unwrap(); - } -} - fn new_serializer<'a>( array: &'a dyn Array, ) -> Box + 'a + Send + Sync> { diff --git a/src/io/json/write/utf8.rs b/src/io/json/write/utf8.rs new file mode 100644 index 00000000000..1984869b6d4 --- /dev/null +++ b/src/io/json/write/utf8.rs @@ -0,0 +1,107 @@ +/// Contains a partial copy of code from serde_json to serialize utf8. +use serde_json::ser::CharEscape; + +fn write_escape(buf: &mut Vec, char_escape: serde_json::ser::CharEscape) { + use serde_json::ser::CharEscape::*; + + let s = match char_escape { + Quote => b"\\\"", + ReverseSolidus => b"\\\\", + Solidus => b"\\/", + Backspace => b"\\b", + FormFeed => b"\\f", + LineFeed => b"\\n", + CarriageReturn => b"\\r", + Tab => b"\\t", + AsciiControl(byte) => { + static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef"; + let bytes = &[ + b'\\', + b'u', + b'0', + b'0', + HEX_DIGITS[(byte >> 4) as usize], + HEX_DIGITS[(byte & 0xF) as usize], + ]; + return buf.extend_from_slice(bytes); + } + }; + buf.extend_from_slice(s) +} + +#[inline] +fn from_escape_table(escape: u8, byte: u8) -> CharEscape { + match escape { + self::BB => CharEscape::Backspace, + self::TT => CharEscape::Tab, + self::NN => CharEscape::LineFeed, + self::FF => CharEscape::FormFeed, + self::RR => CharEscape::CarriageReturn, + self::QU => CharEscape::Quote, + self::BS => CharEscape::ReverseSolidus, + self::UU => CharEscape::AsciiControl(byte), + _ => unreachable!(), + } +} + +const BB: u8 = b'b'; // \x08 +const TT: u8 = b't'; // \x09 +const NN: u8 = b'n'; // \x0A +const FF: u8 = b'f'; // \x0C +const RR: u8 = b'r'; // \x0D +const QU: u8 = b'"'; // \x22 +const BS: u8 = b'\\'; // \x5C +const UU: u8 = b'u'; // \x00...\x1F except the ones above +const __: u8 = 0; + +// Lookup table of escape sequences. A value of b'x' at index i means that byte +// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped. +static ESCAPE: [u8; 256] = [ + // 1 2 3 4 5 6 7 8 9 A B C D E F + UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 + UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 + __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 + __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F +]; + +#[inline] +pub fn utf8_serialize(value: &str, buf: &mut Vec) { + buf.reserve(value.len() + 2); + buf.push(b'"'); + let bytes = value.as_bytes(); + + let mut start = 0; + + for (i, &byte) in bytes.iter().enumerate() { + let escape = ESCAPE[byte as usize]; + if escape == 0 { + continue; + } + + if start < i { + buf.extend_from_slice(&bytes[start..i]); + } + + let char_escape = from_escape_table(escape, byte); + write_escape(buf, char_escape); + + start = i + 1; + } + + if start != bytes.len() { + buf.extend_from_slice(&bytes[start..]); + } + buf.push(b'"'); +} diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 51997c1d367..dc10fbff724 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -301,7 +301,26 @@ fn write_escaped_utf8() -> Result<()> { assert_eq!( String::from_utf8(buf).unwrap().as_bytes(), - b"{\"c1\":\"a\na\"}\n{\"c1\":null}\n" + b"{\"c1\":\"a\\na\"}\n{\"c1\":null}\n" + ); + Ok(()) +} + +#[test] +fn write_quotation_marks_in_utf8() -> Result<()> { + let a = Utf8Array::::from(&vec![Some("a\"a"), None]); + + let batch = Chunk::try_new(vec![&a as &dyn Array]).unwrap(); + + let buf = write_batch( + batch, + vec!["c1".to_string()], + json_write::LineDelimited::default(), + )?; + + assert_eq!( + String::from_utf8(buf).unwrap().as_bytes(), + b"{\"c1\":\"a\\\"a\"}\n{\"c1\":null}\n" ); Ok(()) }