From 814ae7f14a3a7121a0c602009bd8aa5ff45e8b79 Mon Sep 17 00:00:00 2001 From: Jethro Beekman Date: Thu, 23 Apr 2020 14:10:32 +0200 Subject: [PATCH] Add support for alternate encodings of bytes types --- Cargo.toml | 8 ++- src/base64.rs | 138 +++++++++++++++++++++++++++++++++++++++++++++++ src/de.rs | 38 +++++++++++-- src/lib.rs | 131 ++++++++++++++++++++++++++++++++++++++++++++ src/ser.rs | 66 ++++++++++++++++++++--- src/value/ser.rs | 83 +++++++++++++++++++++++----- 6 files changed, 439 insertions(+), 25 deletions(-) create mode 100644 src/base64.rs diff --git a/Cargo.toml b/Cargo.toml index 0eccbe006..aebc31d6f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ serde = { version = "1.0.100", default-features = false } indexmap = { version = "1.2", optional = true } itoa = { version = "0.4.3", default-features = false } ryu = "1.0" -b64-ct = { version = "0.1", default-features = false } +b64-ct = { version = "0.1", default-features = false, optional = true } [dev-dependencies] automod = "0.1" @@ -77,3 +77,9 @@ raw_value = [] # overflow the stack after deserialization has completed, including, but not # limited to, Display and Debug and Drop impls. unbounded_depth = [] + +# Support alternate encoding modes for bytes. Available on Rust 1.40+ +bytes_mode = [] + +# Support the Base64-encoding alternate bytes encoding mode +base64 = ["bytes_mode", "b64-ct"] diff --git a/src/base64.rs b/src/base64.rs new file mode 100644 index 000000000..4d88e545e --- /dev/null +++ b/src/base64.rs @@ -0,0 +1,138 @@ +//! Convenience functions for the base64 alternate byte encoding mode. 
+ +use crate::de::Deserializer; +use crate::error::Result; +use crate::io; +use crate::read::{self, Read}; +use crate::ser::{CompactFormatter, PrettyFormatter, SerializerBuilder}; +use crate::value; +use crate::BytesMode; +use serde::de; +use serde::ser::Serialize; + +fn from_trait<'de, R, T>(read: R) -> Result +where + R: Read<'de>, + T: de::Deserialize<'de>, +{ + let mut de = Deserializer::with_bytes_mode(read, BytesMode::Base64); + let value = tri!(de::Deserialize::deserialize(&mut de)); + + // Make sure the whole stream has been consumed. + tri!(de.end()); + Ok(value) +} + +/// Like `from_reader`, except it uses BytesMode::Base64. +#[cfg(feature = "std")] +pub fn from_reader(rdr: R) -> Result +where + R: crate::io::Read, + T: de::DeserializeOwned, +{ + from_trait(read::IoRead::new(rdr)) +} + +/// Like `from_slice`, except it uses BytesMode::Base64. +pub fn from_slice<'a, T>(v: &'a [u8]) -> Result +where + T: de::Deserialize<'a>, +{ + from_trait(read::SliceRead::new(v)) +} + +/// Like `from_str`, except it uses BytesMode::Base64. +pub fn from_str<'a, T>(s: &'a str) -> Result +where + T: de::Deserialize<'a>, +{ + from_trait(read::StrRead::new(s)) +} + +/// Like `to_writer`, except it uses BytesMode::Base64. +#[cfg(feature = "std")] +#[inline] +pub fn to_writer(writer: W, value: &T) -> Result<()> +where + W: io::Write, + T: ?Sized + Serialize, +{ + let mut ser = SerializerBuilder::with_formatter(writer, CompactFormatter) + .bytes_mode(BytesMode::Base64) + .build(); + tri!(value.serialize(&mut ser)); + Ok(()) +} + +/// Like `to_writer_pretty`, except it uses BytesMode::Base64. +#[cfg(feature = "std")] +#[inline] +pub fn to_writer_pretty(writer: W, value: &T) -> Result<()> +where + W: io::Write, + T: ?Sized + Serialize, +{ + let mut ser = SerializerBuilder::with_formatter(writer, PrettyFormatter::new()) + .bytes_mode(BytesMode::Base64) + .build(); + tri!(value.serialize(&mut ser)); + Ok(()) +} + +/// Like `to_vec`, except it uses BytesMode::Base64. 
+#[inline] +pub fn to_vec(value: &T) -> Result> +where + T: ?Sized + Serialize, +{ + let mut writer = Vec::with_capacity(128); + tri!(to_writer(&mut writer, value)); + Ok(writer) +} + +/// Like `to_vec_pretty`, except it uses BytesMode::Base64. +#[inline] +pub fn to_vec_pretty(value: &T) -> Result> +where + T: ?Sized + Serialize, +{ + let mut writer = Vec::with_capacity(128); + tri!(to_writer_pretty(&mut writer, value)); + Ok(writer) +} + +/// Like `to_string`, except it uses BytesMode::Base64. +#[inline] +pub fn to_string(value: &T) -> Result +where + T: ?Sized + Serialize, +{ + let vec = tri!(to_vec(value)); + let string = unsafe { + // We do not emit invalid UTF-8. + String::from_utf8_unchecked(vec) + }; + Ok(string) +} + +/// Like `to_string_pretty`, except it uses BytesMode::Base64. +#[inline] +pub fn to_string_pretty(value: &T) -> Result +where + T: ?Sized + Serialize, +{ + let vec = tri!(to_vec_pretty(value)); + let string = unsafe { + // We do not emit invalid UTF-8. + String::from_utf8_unchecked(vec) + }; + Ok(string) +} + +/// Like `to_value`, except it uses BytesMode::Base64. 
+pub fn to_value(value: T) -> Result +where + T: Serialize, +{ + value.serialize(value::Serializer::with_bytes_mode(BytesMode::Base64)) +} diff --git a/src/de.rs b/src/de.rs index 5167dfeed..181e51bc1 100644 --- a/src/de.rs +++ b/src/de.rs @@ -5,8 +5,11 @@ use crate::lib::str::FromStr; use crate::lib::*; use crate::number::Number; use crate::read::{self, Fused, Reference}; +use crate::BytesMode; use serde::de::{self, Expected, Unexpected}; use serde::{forward_to_deserialize_any, serde_if_integer128}; + +#[cfg(feature = "base64")] use b64_ct::FromBase64; #[cfg(feature = "arbitrary_precision")] @@ -26,6 +29,7 @@ pub struct Deserializer { remaining_depth: u8, #[cfg(feature = "unbounded_depth")] disable_recursion_limit: bool, + bytes_mode: BytesMode, } impl<'de, R> Deserializer @@ -47,6 +51,7 @@ where read: read, scratch: Vec::new(), remaining_depth: 128, + bytes_mode: BytesMode::default(), } } @@ -57,9 +62,19 @@ where scratch: Vec::new(), remaining_depth: 128, disable_recursion_limit: false, + bytes_mode: BytesMode::default(), } } } + + /// Create a JSON deserializer with a specified encoding mode for bytes. + #[cfg(feature = "bytes_mode")] + pub fn with_bytes_mode(read: R, bytes_mode: BytesMode) -> Self { + Deserializer { + bytes_mode, + ..Self::new(read) + } + } } #[cfg(feature = "std")] @@ -1333,7 +1348,7 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { self.deserialize_str(visitor) } - /// Deserialize a base64-encoded string. + /// Deserialize bytes according to the deserializer's byte mode. fn deserialize_bytes(self, visitor: V) -> Result where V: de::Visitor<'de>, @@ -1349,9 +1364,24 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { b'"' => { self.eat_char(); self.scratch.clear(); - let string = self.read.parse_str(&mut self.scratch)?; - visitor.visit_bytes(&string.from_base64() - .map_err(|_| de::Error::invalid_value(de::Unexpected::Str(&string), &"base64 encoded string"))?) 
+ match self.bytes_mode { + BytesMode::IntegerArray => { + match tri!(self.read.parse_str_raw(&mut self.scratch)) { + Reference::Borrowed(b) => visitor.visit_borrowed_bytes(b), + Reference::Copied(b) => visitor.visit_bytes(b), + } + } + #[cfg(feature = "base64")] + BytesMode::Base64 => { + let string = self.read.parse_str(&mut self.scratch)?; + visitor.visit_bytes(&string.from_base64().map_err(|_| { + de::Error::invalid_value( + de::Unexpected::Str(&string), + &"base64 encoded string", + ) + })?) + } + } } b'[' => self.deserialize_seq(visitor), _ => Err(self.peek_invalid_type(&visitor)), diff --git a/src/lib.rs b/src/lib.rs index 367685a6a..2bf2e8312 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -428,6 +428,8 @@ macro_rules! tri { #[macro_use] mod macros; +#[cfg(feature = "base64")] +pub mod base64; pub mod de; pub mod error; pub mod map; @@ -447,3 +449,132 @@ mod read; #[cfg(feature = "raw_value")] mod raw; + +/// Specifies how bytes should be (de)serialized. +/// +/// JSON does not natively support binary data. Protocols can specify their own +/// mechanisms to handle binary data in JSON. Serde JSON supports different +/// modes; see the details for each variant. +/// +/// Note that the byte deserialization mode is only checked for types that are +/// deserialized as bytes (when a type directly calls `deserialize_bytes` or +/// `deserialize_byte_buf`). Types that are not self-describing (when a type +/// calls `deserialize_any`) can't be deserialized as bytes. +/// +/// The default mode is `IntegerArray`, which is the only format Serde JSON +/// used to support. +/// +/// The `Base64` mode is enabled with the `base64` crate feature. +#[cfg(feature = "bytes_mode")] +#[derive(Clone, Eq, PartialEq, Debug, Hash)] +#[non_exhaustive] +pub enum BytesMode { + /// Use integer arrays to represent bytes + /// + /// # Serialization + /// Bytes are serialized as a JSON array of integers, each element + /// representing one byte. 
+ /// + /// # Deserialization + /// JSON arrays are deserialized as an array of integers, each element + /// representing one byte. + /// + /// JSON strings are parsed as raw bytes. It's not checked whether the + /// bytes represent a valid UTF-8 string. + /// + /// The relevant part of the JSON specification is Section 8.2 of [RFC 7159]: + /// + /// > When all the strings represented in a JSON text are composed entirely + /// > of Unicode characters (however escaped), then that JSON text is + /// > interoperable in the sense that all software implementations that + /// > parse it will agree on the contents of names and of string values in + /// > objects and arrays. + /// > + /// > However, the ABNF in this specification allows member names and string + /// > values to contain bit sequences that cannot encode Unicode characters; + /// > for example, "\uDEAD" (a single unpaired UTF-16 surrogate). Instances + /// > of this have been observed, for example, when a library truncates a + /// > UTF-16 string without checking whether the truncation split a + /// > surrogate pair. The behavior of software that receives JSON texts + /// > containing such values is unpredictable; for example, implementations + /// > might return different values for the length of a string value or even + /// > suffer fatal runtime exceptions. + /// + /// [RFC 7159]: https://tools.ietf.org/html/rfc7159 + /// + /// The behavior of serde_json is specified to fail on non-UTF-8 strings + /// when deserializing into Rust UTF-8 string types such as String, and + /// succeed with non-UTF-8 bytes when deserializing as bytes. + /// + /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is + /// still checked if the hex number represents a valid Unicode code point. + /// + /// # Examples + /// + /// You can use this to parse JSON strings containing invalid UTF-8 bytes. 
+ /// + /// ``` + /// use serde_bytes::ByteBuf; + /// + /// fn look_at_bytes() -> Result<(), serde_json::Error> { + /// let json_data = b"\"some bytes: \xe5\x00\xe5\""; + /// let bytes: ByteBuf = serde_json::from_slice(json_data)?; + /// + /// assert_eq!(b'\xe5', bytes[12]); + /// assert_eq!(b'\0', bytes[13]); + /// assert_eq!(b'\xe5', bytes[14]); + /// + /// Ok(()) + /// } + /// # + /// # look_at_bytes().unwrap(); + /// ``` + /// + /// Backslash escape sequences like `\n` are still interpreted and required + /// to be valid, and `\u` escape sequences are required to represent valid + /// Unicode code points. + /// + /// ``` + /// use serde_bytes::ByteBuf; + /// + /// fn look_at_bytes() { + /// let json_data = b"\"invalid unicode surrogate: \\uD801\""; + /// let parsed: Result = serde_json::from_slice(json_data); + /// + /// assert!(parsed.is_err()); + /// + /// let expected_msg = "unexpected end of hex escape at line 1 column 35"; + /// assert_eq!(expected_msg, parsed.unwrap_err().to_string()); + /// } + /// # + /// # look_at_bytes(); + /// ``` + IntegerArray, + /// Use base64-encoded strings to represent bytes + /// + /// Requires the `base64` crate feature. + /// + /// # Serialization + /// Bytes are serialized as a base64-encoded string. + /// + /// # Deserialization + /// JSON strings are deserialized as base64-encoded binary data. + /// + /// JSON arrays are deserialized as an array of integers, each element + /// representing one byte. + #[cfg(feature = "base64")] + Base64, +} + +#[cfg(not(feature = "bytes_mode"))] +#[derive(Clone, Eq, PartialEq, Debug, Hash)] +enum BytesMode { + IntegerArray, +} + +impl Default for BytesMode { + /// Returns `BytesMode::IntegerArray`. 
+ fn default() -> BytesMode { + BytesMode::IntegerArray + } +} diff --git a/src/ser.rs b/src/ser.rs index 72d5dbe43..455a1daee 100644 --- a/src/ser.rs +++ b/src/ser.rs @@ -4,21 +4,26 @@ use crate::error::{Error, ErrorCode, Result}; use crate::io; use crate::lib::num::FpCategory; use crate::lib::*; +use crate::BytesMode; use serde::ser::{self, Impossible, Serialize}; use serde::serde_if_integer128; + +#[cfg(feature = "base64")] use b64_ct::{ToBase64, STANDARD}; /// A structure for serializing Rust values into JSON. pub struct Serializer { writer: W, formatter: F, + bytes_mode: BytesMode, } impl Serializer where W: io::Write, { - /// Creates a new JSON serializer. + /// Creates a new JSON serializer whose output will be written to the writer + /// specified. #[inline] pub fn new(writer: W) -> Self { Serializer::with_formatter(writer, CompactFormatter) @@ -41,13 +46,14 @@ where W: io::Write, F: Formatter, { - /// Creates a new JSON visitor whose output will be written to the writer - /// specified. + /// Creates a new JSON serializer with the specified formatter whose output + /// will be written to the writer specified. #[inline] pub fn with_formatter(writer: W, formatter: F) -> Self { Serializer { writer: writer, formatter: formatter, + bytes_mode: BytesMode::default(), } } @@ -58,6 +64,37 @@ where } } +/// Builder type to customize `Serializer` creation. +pub struct SerializerBuilder { + serializer: Serializer, +} + +impl SerializerBuilder +where + W: io::Write, + F: Formatter, +{ + /// Creates a builder for a JSON serializer that uses the specified formatter + /// and writes its output to the writer specified. + pub fn with_formatter(writer: W, formatter: F) -> Self { + SerializerBuilder { + serializer: Serializer::with_formatter(writer, formatter), + } + } + + /// Specify the encoding mode for bytes the serializer will use. 
+ #[cfg(feature = "bytes_mode")] + pub fn bytes_mode(mut self, bytes_mode: BytesMode) -> Self { + self.serializer.bytes_mode = bytes_mode; + self + } + + /// Complete the `Serializer` construction. + pub fn build(self) -> Serializer { + self.serializer + } +} + impl<'a, W, F> ser::Serializer for &'a mut Serializer where W: io::Write, @@ -222,10 +259,21 @@ where Ok(()) } - /// Serialize a base64-encoded string. #[inline] + /// Serialize bytes according to the serializer's byte mode. fn serialize_bytes(self, value: &[u8]) -> Result<()> { - self.serialize_str(&value.to_base64(STANDARD)) + match self.bytes_mode { + BytesMode::IntegerArray => { + use serde::ser::SerializeSeq; + let mut seq = tri!(self.serialize_seq(Some(value.len()))); + for byte in value { + tri!(seq.serialize_element(byte)); + } + seq.end() + } + #[cfg(feature = "base64")] + BytesMode::Base64 => self.serialize_str(&value.to_base64(STANDARD)), + } } #[inline] @@ -869,8 +917,12 @@ where self.ser.serialize_str(value) } - fn serialize_bytes(self, value: &[u8]) -> Result<()> { - self.ser.serialize_bytes(value) + fn serialize_bytes(self, _value: &[u8]) -> Result<()> { + match self.ser.bytes_mode { + BytesMode::IntegerArray => Err(key_must_be_a_string()), + #[cfg(feature = "base64")] + BytesMode::Base64 => self.ser.serialize_bytes(_value), + } } #[inline] diff --git a/src/value/ser.rs b/src/value/ser.rs index 438d895ec..fa32002d9 100644 --- a/src/value/ser.rs +++ b/src/value/ser.rs @@ -2,9 +2,11 @@ use crate::error::{Error, ErrorCode, Result}; use crate::lib::*; use crate::map::Map; use crate::number::Number; -use crate::value::{to_value, Value}; -use serde::ser::{Impossible, Serialize}; +use crate::value::Value; +use crate::BytesMode; +#[cfg(feature = "base64")] use b64_ct::{ToBase64, STANDARD}; +use serde::ser::{Impossible, Serialize}; #[cfg(feature = "arbitrary_precision")] use serde::serde_if_integer128; @@ -54,7 +56,33 @@ impl Serialize for Value { /// input.serialize(serde_json::value::Serializer) /// 
} /// ``` -pub struct Serializer; +pub struct Serializer { + bytes_mode: BytesMode, +} + +/// Default `Serializer`. +// This type used to be a unit struct, so code may be initializing this by +// using the type's name. This constant makes that possible in a backwards +// compatible manner. +#[allow(non_upper_case_globals)] +pub const Serializer: Serializer = Serializer { + bytes_mode: BytesMode::IntegerArray, +}; + +impl Serializer { + /// Create a serializer whose output is a `Value` with a specific encoding + /// mode for bytes. + #[cfg(feature = "bytes_mode")] + pub fn with_bytes_mode(bytes_mode: BytesMode) -> Self { + Serializer { bytes_mode } + } + + fn clone(&self) -> Self { + Serializer { + bytes_mode: self.bytes_mode.clone(), + } + } +} impl serde::Serializer for Serializer { type Ok = Value; @@ -149,7 +177,14 @@ impl serde::Serializer for Serializer { } fn serialize_bytes(self, value: &[u8]) -> Result { - Ok(Value::String(value.to_base64(STANDARD))) + match self.bytes_mode { + BytesMode::IntegerArray => { + let vec = value.iter().map(|&b| Value::Number(b.into())).collect(); + Ok(Value::Array(vec)) + } + #[cfg(feature = "base64")] + BytesMode::Base64 => Ok(Value::String(value.to_base64(STANDARD))), + } } #[inline] @@ -191,7 +226,7 @@ impl serde::Serializer for Serializer { T: ?Sized + Serialize, { let mut values = Map::new(); - values.insert(String::from(variant), tri!(to_value(&value))); + values.insert(String::from(variant), tri!(value.serialize(self.clone()))); Ok(Value::Object(values)) } @@ -211,6 +246,7 @@ impl serde::Serializer for Serializer { fn serialize_seq(self, len: Option) -> Result { Ok(SerializeVec { vec: Vec::with_capacity(len.unwrap_or(0)), + ser: self.clone(), }) } @@ -236,6 +272,7 @@ impl serde::Serializer for Serializer { Ok(SerializeTupleVariant { name: String::from(variant), vec: Vec::with_capacity(len), + ser: self.clone(), }) } @@ -243,15 +280,22 @@ impl serde::Serializer for Serializer { Ok(SerializeMap::Map { map: Map::new(), 
next_key: None, + ser: self.clone(), }) } fn serialize_struct(self, name: &'static str, len: usize) -> Result { match name { #[cfg(feature = "arbitrary_precision")] - crate::number::TOKEN => Ok(SerializeMap::Number { out_value: None }), + crate::number::TOKEN => Ok(SerializeMap::Number { + out_value: None, + ser: self.clone(), + }), #[cfg(feature = "raw_value")] - crate::raw::TOKEN => Ok(SerializeMap::RawValue { out_value: None }), + crate::raw::TOKEN => Ok(SerializeMap::RawValue { + out_value: None, + ser: self.clone(), + }), _ => self.serialize_map(Some(len)), } } @@ -266,6 +310,7 @@ impl serde::Serializer for Serializer { Ok(SerializeStructVariant { name: String::from(variant), map: Map::new(), + ser: self.clone(), }) } @@ -279,27 +324,37 @@ impl serde::Serializer for Serializer { pub struct SerializeVec { vec: Vec, + ser: Serializer, } pub struct SerializeTupleVariant { name: String, vec: Vec, + ser: Serializer, } pub enum SerializeMap { Map { map: Map, next_key: Option, + ser: Serializer, }, #[cfg(feature = "arbitrary_precision")] - Number { out_value: Option }, + Number { + out_value: Option, + ser: Serializer, + }, #[cfg(feature = "raw_value")] - RawValue { out_value: Option }, + RawValue { + out_value: Option, + ser: Serializer, + }, } pub struct SerializeStructVariant { name: String, map: Map, + ser: Serializer, } impl serde::ser::SerializeSeq for SerializeVec { @@ -310,7 +365,7 @@ impl serde::ser::SerializeSeq for SerializeVec { where T: ?Sized + Serialize, { - self.vec.push(tri!(to_value(&value))); + self.vec.push(tri!(value.serialize(self.ser.clone()))); Ok(()) } @@ -359,7 +414,7 @@ impl serde::ser::SerializeTupleVariant for SerializeTupleVariant { where T: ?Sized + Serialize, { - self.vec.push(tri!(to_value(&value))); + self.vec.push(tri!(value.serialize(self.ser.clone()))); Ok(()) } @@ -402,12 +457,13 @@ impl serde::ser::SerializeMap for SerializeMap { SerializeMap::Map { ref mut map, ref mut next_key, + ref ser, } => { let key = next_key.take(); // 
Panic because this indicates a bug in the program rather than an // expected failure. let key = key.expect("serialize_value called before serialize_key"); - map.insert(key, tri!(to_value(&value))); + map.insert(key, tri!(value.serialize(ser.clone()))); Ok(()) } #[cfg(feature = "arbitrary_precision")] @@ -664,7 +720,8 @@ impl serde::ser::SerializeStructVariant for SerializeStructVariant { where T: ?Sized + Serialize, { - self.map.insert(String::from(key), tri!(to_value(&value))); + self.map + .insert(String::from(key), tri!(value.serialize(self.ser.clone()))); Ok(()) }