Skip to content

Commit

Permalink
Add support for alternate encodings of bytes types
Browse files Browse the repository at this point in the history
  • Loading branch information
Jethro Beekman committed Apr 23, 2020
1 parent a69b9fc commit c675f24
Show file tree
Hide file tree
Showing 7 changed files with 443 additions and 27 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ matrix:
- cargo test --features arbitrary_precision
- cargo test --features raw_value
- cargo test --features unbounded_depth
- cargo test --features bytes_mode
- cargo test --features base64

- rust: stable
- rust: beta
Expand Down
8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ serde = { version = "1.0.100", default-features = false }
indexmap = { version = "1.2", optional = true }
itoa = { version = "0.4.3", default-features = false }
ryu = "1.0"
b64-ct = { version = "0.1", default-features = false }
b64-ct = { version = "0.1", default-features = false, optional = true }

[dev-dependencies]
automod = "0.1"
Expand Down Expand Up @@ -77,3 +77,9 @@ raw_value = []
# overflow the stack after deserialization has completed, including, but not
# limited to, Display and Debug and Drop impls.
unbounded_depth = []

# Support alternate encoding modes for bytes. Available on Rust 1.40+
bytes_mode = []

# Support Base64 as an alternate encoding mode for bytes
base64 = ["bytes_mode", "b64-ct"]
138 changes: 138 additions & 0 deletions src/base64.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
//! Convenience functions for the base64 alternate byte encoding mode.
use crate::de::Deserializer;
use crate::error::Result;
use crate::io;
use crate::read::{self, Read};
use crate::ser::{CompactFormatter, PrettyFormatter, SerializerBuilder};
use crate::value;
use crate::BytesMode;
use serde::de;
use serde::ser::Serialize;

/// Shared driver behind the public `from_*` helpers in this module.
///
/// Builds a `Deserializer` over `read` in `BytesMode::Base64`, deserializes
/// one `T` from it, and then checks the end of the input before returning.
fn from_trait<'de, R, T>(read: R) -> Result<T>
where
    R: Read<'de>,
    T: de::Deserialize<'de>,
{
    let mut deserializer = Deserializer::with_bytes_mode(read, BytesMode::Base64);
    let parsed = tri!(T::deserialize(&mut deserializer));

    // The whole stream must have been consumed by the value above.
    tri!(deserializer.end());
    Ok(parsed)
}

/// Base64 counterpart of `from_reader`: deserialize a `T` from an I/O stream
/// of JSON using `BytesMode::Base64`.
///
/// Only available with the `std` feature.
#[cfg(feature = "std")]
pub fn from_reader<R, T>(rdr: R) -> Result<T>
where
    R: crate::io::Read,
    T: de::DeserializeOwned,
{
    let source = read::IoRead::new(rdr);
    from_trait(source)
}

/// Base64 counterpart of `from_slice`: deserialize a `T` from a byte slice of
/// JSON using `BytesMode::Base64`.
pub fn from_slice<'a, T>(v: &'a [u8]) -> Result<T>
where
    T: de::Deserialize<'a>,
{
    let source = read::SliceRead::new(v);
    from_trait(source)
}

/// Base64 counterpart of `from_str`: deserialize a `T` from a string of JSON
/// using `BytesMode::Base64`.
pub fn from_str<'a, T>(s: &'a str) -> Result<T>
where
    T: de::Deserialize<'a>,
{
    let source = read::StrRead::new(s);
    from_trait(source)
}

/// Base64 counterpart of `to_writer`: serialize `value` as compact JSON into
/// `writer` using `BytesMode::Base64`.
///
/// Only available with the `std` feature.
#[cfg(feature = "std")]
#[inline]
pub fn to_writer<W, T>(writer: W, value: &T) -> Result<()>
where
    W: io::Write,
    T: ?Sized + Serialize,
{
    // Compact formatting, with the bytes mode switched to base64.
    let mut serializer = SerializerBuilder::with_formatter(writer, CompactFormatter)
        .bytes_mode(BytesMode::Base64)
        .build();

    tri!(value.serialize(&mut serializer));
    Ok(())
}

/// Base64 counterpart of `to_writer_pretty`: serialize `value` as
/// pretty-printed JSON into `writer` using `BytesMode::Base64`.
///
/// Only available with the `std` feature.
#[cfg(feature = "std")]
#[inline]
pub fn to_writer_pretty<W, T>(writer: W, value: &T) -> Result<()>
where
    W: io::Write,
    T: ?Sized + Serialize,
{
    // Pretty formatting, with the bytes mode switched to base64.
    let mut serializer = SerializerBuilder::with_formatter(writer, PrettyFormatter::new())
        .bytes_mode(BytesMode::Base64)
        .build();

    tri!(value.serialize(&mut serializer));
    Ok(())
}

/// Like `to_vec`, except it uses BytesMode::Base64.
#[inline]
pub fn to_vec<T>(value: &T) -> Result<Vec<u8>>
where
T: ?Sized + Serialize,
{
let mut writer = Vec::with_capacity(128);
tri!(to_writer(&mut writer, value));
Ok(writer)
}

/// Like `to_vec_pretty`, except it uses BytesMode::Base64.
#[inline]
pub fn to_vec_pretty<T>(value: &T) -> Result<Vec<u8>>
where
T: ?Sized + Serialize,
{
let mut writer = Vec::with_capacity(128);
tri!(to_writer_pretty(&mut writer, value));
Ok(writer)
}

/// Base64 counterpart of `to_string`: serialize `value` as a compact JSON
/// `String` using `BytesMode::Base64`.
#[inline]
pub fn to_string<T>(value: &T) -> Result<String>
where
    T: ?Sized + Serialize,
{
    let bytes = tri!(to_vec(value));
    // SAFETY: relies on the serializer never emitting invalid UTF-8.
    Ok(unsafe { String::from_utf8_unchecked(bytes) })
}

/// Base64 counterpart of `to_string_pretty`: serialize `value` as a
/// pretty-printed JSON `String` using `BytesMode::Base64`.
#[inline]
pub fn to_string_pretty<T>(value: &T) -> Result<String>
where
    T: ?Sized + Serialize,
{
    let bytes = tri!(to_vec_pretty(value));
    // SAFETY: relies on the serializer never emitting invalid UTF-8.
    Ok(unsafe { String::from_utf8_unchecked(bytes) })
}

/// Base64 counterpart of `to_value`: convert `value` into a `value::Value`
/// using a value serializer configured with `BytesMode::Base64`.
pub fn to_value<T>(value: T) -> Result<value::Value>
where
    T: Serialize,
{
    let serializer = value::Serializer::with_bytes_mode(BytesMode::Base64);
    value.serialize(serializer)
}
38 changes: 34 additions & 4 deletions src/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ use crate::lib::str::FromStr;
use crate::lib::*;
use crate::number::Number;
use crate::read::{self, Fused, Reference};
use crate::BytesMode;
use serde::de::{self, Expected, Unexpected};
use serde::{forward_to_deserialize_any, serde_if_integer128};

#[cfg(feature = "base64")]
use b64_ct::FromBase64;

#[cfg(feature = "arbitrary_precision")]
Expand All @@ -26,6 +29,7 @@ pub struct Deserializer<R> {
remaining_depth: u8,
#[cfg(feature = "unbounded_depth")]
disable_recursion_limit: bool,
bytes_mode: BytesMode,
}

impl<'de, R> Deserializer<R>
Expand All @@ -47,6 +51,7 @@ where
read: read,
scratch: Vec::new(),
remaining_depth: 128,
bytes_mode: BytesMode::default(),
}
}

Expand All @@ -57,9 +62,19 @@ where
scratch: Vec::new(),
remaining_depth: 128,
disable_recursion_limit: false,
bytes_mode: BytesMode::default(),
}
}
}

/// Create a JSON deserializer that handles bytes according to `bytes_mode`.
///
/// Every other field is initialised exactly as `Deserializer::new` does.
#[cfg(feature = "bytes_mode")]
pub fn with_bytes_mode(read: R, bytes_mode: BytesMode) -> Self {
    // Start from the regular constructor, then override only the bytes mode.
    let mut deserializer = Self::new(read);
    deserializer.bytes_mode = bytes_mode;
    deserializer
}
}

#[cfg(feature = "std")]
Expand Down Expand Up @@ -1333,7 +1348,7 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
self.deserialize_str(visitor)
}

/// Deserialize a base64-encoded string.
/// Deserialize bytes according to the deserializer's byte mode.
fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value>
where
V: de::Visitor<'de>,
Expand All @@ -1349,9 +1364,24 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
b'"' => {
self.eat_char();
self.scratch.clear();
let string = self.read.parse_str(&mut self.scratch)?;
visitor.visit_bytes(&string.from_base64()
.map_err(|_| de::Error::invalid_value(de::Unexpected::Str(&string), &"base64 encoded string"))?)
match self.bytes_mode {
BytesMode::IntegerArray => {
match tri!(self.read.parse_str_raw(&mut self.scratch)) {
Reference::Borrowed(b) => visitor.visit_borrowed_bytes(b),
Reference::Copied(b) => visitor.visit_bytes(b),
}
}
#[cfg(feature = "base64")]
BytesMode::Base64 => {
let string = self.read.parse_str(&mut self.scratch)?;
visitor.visit_bytes(&string.from_base64().map_err(|_| {
de::Error::invalid_value(
de::Unexpected::Str(&string),
&"base64 encoded string",
)
})?)
}
}
}
b'[' => self.deserialize_seq(visitor),
_ => Err(self.peek_invalid_type(&visitor)),
Expand Down
131 changes: 131 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,8 @@ macro_rules! tri {
#[macro_use]
mod macros;

#[cfg(feature = "base64")]
pub mod base64;
pub mod de;
pub mod error;
pub mod map;
Expand All @@ -447,3 +449,132 @@ mod read;

#[cfg(feature = "raw_value")]
mod raw;

/// Specifies how bytes should be (de)serialized.
///
/// JSON does not natively support binary data. Protocols can specify their own
/// mechanisms to handle binary data in JSON. Serde JSON supports different
/// modes, see the details for each variant.
///
/// Note that the byte deserialization mode is only checked for types that are
/// deserialized as bytes (when a type directly calls `deserialize_bytes` or
/// `deserialize_byte_buf`). Types that are not self-describing (when a type
/// calls `deserialize_any`) can't be deserialized as bytes.
///
/// The default mode is `IntegerArray`, which is the only format Serde JSON
/// used to support.
///
/// The `Base64` mode is enabled with the `base64` crate feature.
///
/// The mode is a plain fieldless enum, so it is `Copy` and can be passed
/// around by value.
#[cfg(feature = "bytes_mode")]
#[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)]
#[non_exhaustive]
pub enum BytesMode {
    /// Use integer arrays to represent bytes
    ///
    /// # Serialization
    /// Bytes are serialized as a JSON array of integers, each element
    /// representing one byte.
    ///
    /// # Deserialization
    /// JSON arrays are deserialized as an array of integers, each element
    /// representing one byte.
    ///
    /// JSON strings are parsed as raw bytes. It's not checked whether the
    /// bytes represent a valid UTF-8 string.
    ///
    /// The relevant part of the JSON specification is Section 8.2 of [RFC 7159]:
    ///
    /// > When all the strings represented in a JSON text are composed entirely
    /// > of Unicode characters (however escaped), then that JSON text is
    /// > interoperable in the sense that all software implementations that
    /// > parse it will agree on the contents of names and of string values in
    /// > objects and arrays.
    /// >
    /// > However, the ABNF in this specification allows member names and string
    /// > values to contain bit sequences that cannot encode Unicode characters;
    /// > for example, "\uDEAD" (a single unpaired UTF-16 surrogate). Instances
    /// > of this have been observed, for example, when a library truncates a
    /// > UTF-16 string without checking whether the truncation split a
    /// > surrogate pair. The behavior of software that receives JSON texts
    /// > containing such values is unpredictable; for example, implementations
    /// > might return different values for the length of a string value or even
    /// > suffer fatal runtime exceptions.
    ///
    /// [RFC 7159]: https://tools.ietf.org/html/rfc7159
    ///
    /// The behavior of serde_json is specified to fail on non-UTF-8 strings
    /// when deserializing into Rust UTF-8 string types such as String, and
    /// succeed with non-UTF-8 bytes when deserializing as bytes.
    ///
    /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
    /// still checked if the hex number represents a valid Unicode code point.
    ///
    /// # Examples
    ///
    /// You can use this to parse JSON strings containing invalid UTF-8 bytes.
    ///
    /// ```
    /// use serde_bytes::ByteBuf;
    ///
    /// fn look_at_bytes() -> Result<(), serde_json::Error> {
    ///     let json_data = b"\"some bytes: \xe5\x00\xe5\"";
    ///     let bytes: ByteBuf = serde_json::from_slice(json_data)?;
    ///
    ///     assert_eq!(b'\xe5', bytes[12]);
    ///     assert_eq!(b'\0', bytes[13]);
    ///     assert_eq!(b'\xe5', bytes[14]);
    ///
    ///     Ok(())
    /// }
    /// #
    /// # look_at_bytes().unwrap();
    /// ```
    ///
    /// Backslash escape sequences like `\n` are still interpreted and required
    /// to be valid, and `\u` escape sequences are required to represent valid
    /// Unicode code points.
    ///
    /// ```
    /// use serde_bytes::ByteBuf;
    ///
    /// fn look_at_bytes() {
    ///     let json_data = b"\"invalid unicode surrogate: \\uD801\"";
    ///     let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
    ///
    ///     assert!(parsed.is_err());
    ///
    ///     let expected_msg = "unexpected end of hex escape at line 1 column 35";
    ///     assert_eq!(expected_msg, parsed.unwrap_err().to_string());
    /// }
    /// #
    /// # look_at_bytes();
    /// ```
    IntegerArray,
    /// Use base64-encoded strings to represent bytes
    ///
    /// Requires the `base64` crate feature.
    ///
    /// # Serialization
    /// Bytes are serialized as a base64-encoded string.
    ///
    /// # Deserialization
    /// JSON strings are deserialized as base64-encoded binary data.
    ///
    /// JSON arrays are deserialized as an array of integers, each element
    /// representing one byte.
    #[cfg(feature = "base64")]
    Base64,
}

/// Crate-private stand-in for `BytesMode` used when the `bytes_mode` feature
/// is disabled. Only the integer-array encoding exists in that configuration.
///
/// Derives the same traits as the public enum so that code using the type
/// behaves the same with or without the feature.
#[cfg(not(feature = "bytes_mode"))]
#[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)]
enum BytesMode {
    IntegerArray,
}

impl Default for BytesMode {
    /// Returns `BytesMode::IntegerArray`.
    fn default() -> BytesMode {
        BytesMode::IntegerArray
    }
}
Loading

0 comments on commit c675f24

Please sign in to comment.