Add support for alternate encodings of bytes types

serde-rs · Apr 23, 2020 · c675f24 · c675f24
1 parent a69b9fc
commit c675f24
Show file tree

Hide file tree

Showing 7 changed files with 443 additions and 27 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -9,6 +9,8 @@ matrix:
         - cargo test --features arbitrary_precision
         - cargo test --features raw_value
         - cargo test --features unbounded_depth
+        - cargo test --features bytes_mode
+        - cargo test --features base64
 
     - rust: stable
     - rust: beta

diff --git a/Cargo.toml b/Cargo.toml
@@ -21,7 +21,7 @@ serde = { version = "1.0.100", default-features = false }
 indexmap = { version = "1.2", optional = true }
 itoa = { version = "0.4.3", default-features = false }
 ryu = "1.0"
-b64-ct = { version = "0.1", default-features = false }
+b64-ct = { version = "0.1", default-features = false, optional = true }
 
 [dev-dependencies]
 automod = "0.1"
@@ -77,3 +77,9 @@ raw_value = []
 # overflow the stack after deserialization has completed, including, but not
 # limited to, Display and Debug and Drop impls.
 unbounded_depth = []
+
+# Support alternate encoding modes for bytes. Available on Rust 1.40+
+bytes_mode = []
+
+# Support the Base64 alternate encoding mode for bytes
+base64 = ["bytes_mode", "b64-ct"]
diff --git a/src/base64.rs b/src/base64.rs
@@ -0,0 +1,138 @@
+//! Convenience functions for the base64 alternate byte encoding mode.
+
+use crate::de::Deserializer;
+use crate::error::Result;
+use crate::io;
+use crate::lib::*;
+use crate::read::{self, Read};
+use crate::ser::{CompactFormatter, PrettyFormatter, SerializerBuilder};
+use crate::value;
+use crate::BytesMode;
+use serde::de;
+use serde::ser::Serialize;
+
+/// Shared driver for the `from_*` functions in this module: deserializes one
+/// `T` from `read` with the deserializer set to `BytesMode::Base64`.
+fn from_trait<'de, R, T>(read: R) -> Result<T>
+where
+    R: Read<'de>,
+    T: de::Deserialize<'de>,
+{
+    let mut deserializer = Deserializer::with_bytes_mode(read, BytesMode::Base64);
+    let parsed = tri!(T::deserialize(&mut deserializer));
+
+    // Only hand the value back once the whole stream has been consumed.
+    tri!(deserializer.end());
+    Ok(parsed)
+}
+
+/// Deserializes a `T` from an I/O stream, like `from_reader`, but with the
+/// deserializer set to BytesMode::Base64.
+#[cfg(feature = "std")]
+pub fn from_reader<R, T>(rdr: R) -> Result<T>
+where
+    R: io::Read,
+    T: de::DeserializeOwned,
+{
+    let source = read::IoRead::new(rdr);
+    from_trait(source)
+}
+
+/// Deserializes a `T` from a byte slice, like `from_slice`, but with the
+/// deserializer set to BytesMode::Base64.
+pub fn from_slice<'a, T>(v: &'a [u8]) -> Result<T>
+where
+    T: de::Deserialize<'a>,
+{
+    let source = read::SliceRead::new(v);
+    from_trait(source)
+}
+
+/// Deserializes a `T` from a string slice, like `from_str`, but with the
+/// deserializer set to BytesMode::Base64.
+pub fn from_str<'a, T>(s: &'a str) -> Result<T>
+where
+    T: de::Deserialize<'a>,
+{
+    let source = read::StrRead::new(s);
+    from_trait(source)
+}
+
+/// Serializes `value` as compact JSON into `writer`, like `to_writer`, but
+/// with the serializer set to BytesMode::Base64.
+#[cfg(feature = "std")]
+#[inline]
+pub fn to_writer<W, T>(writer: W, value: &T) -> Result<()>
+where
+    W: io::Write,
+    T: ?Sized + Serialize,
+{
+    let builder = SerializerBuilder::with_formatter(writer, CompactFormatter);
+    let mut serializer = builder.bytes_mode(BytesMode::Base64).build();
+    tri!(value.serialize(&mut serializer));
+    Ok(())
+}
+
+/// Serializes `value` as pretty-printed JSON into `writer`, like
+/// `to_writer_pretty`, but with the serializer set to BytesMode::Base64.
+#[cfg(feature = "std")]
+#[inline]
+pub fn to_writer_pretty<W, T>(writer: W, value: &T) -> Result<()>
+where
+    W: io::Write,
+    T: ?Sized + Serialize,
+{
+    let builder = SerializerBuilder::with_formatter(writer, PrettyFormatter::new());
+    let mut serializer = builder.bytes_mode(BytesMode::Base64).build();
+    tri!(value.serialize(&mut serializer));
+    Ok(())
+}
+
+/// Like `to_vec`, except it uses BytesMode::Base64.
+///
+/// Serializes `value` as compact JSON into a freshly allocated byte vector.
+///
+/// The serializer is built here rather than by calling this module's
+/// `to_writer`, because `to_writer` only exists when the `std` feature is
+/// enabled while this function is compiled unconditionally.
+#[inline]
+pub fn to_vec<T>(value: &T) -> Result<Vec<u8>>
+where
+    T: ?Sized + Serialize,
+{
+    let mut writer = Vec::with_capacity(128);
+    {
+        // Scope the serializer so its borrow of `writer` ends before the
+        // buffer is returned.
+        let mut ser = SerializerBuilder::with_formatter(&mut writer, CompactFormatter)
+            .bytes_mode(BytesMode::Base64)
+            .build();
+        tri!(value.serialize(&mut ser));
+    }
+    Ok(writer)
+}
+
+/// Like `to_vec_pretty`, except it uses BytesMode::Base64.
+///
+/// Serializes `value` as pretty-printed JSON into a freshly allocated byte
+/// vector.
+///
+/// The serializer is built here rather than by calling this module's
+/// `to_writer_pretty`, because `to_writer_pretty` only exists when the `std`
+/// feature is enabled while this function is compiled unconditionally.
+#[inline]
+pub fn to_vec_pretty<T>(value: &T) -> Result<Vec<u8>>
+where
+    T: ?Sized + Serialize,
+{
+    let mut writer = Vec::with_capacity(128);
+    {
+        // Scope the serializer so its borrow of `writer` ends before the
+        // buffer is returned.
+        let mut ser = SerializerBuilder::with_formatter(&mut writer, PrettyFormatter::new())
+            .bytes_mode(BytesMode::Base64)
+            .build();
+        tri!(value.serialize(&mut ser));
+    }
+    Ok(writer)
+}
+
+/// Serializes `value` to a compact JSON `String`, like `to_string`, but with
+/// the serializer set to BytesMode::Base64.
+#[inline]
+pub fn to_string<T>(value: &T) -> Result<String>
+where
+    T: ?Sized + Serialize,
+{
+    let bytes = tri!(to_vec(value));
+    // SAFETY: We do not emit invalid UTF-8.
+    Ok(unsafe { String::from_utf8_unchecked(bytes) })
+}
+
+/// Serializes `value` to a pretty-printed JSON `String`, like
+/// `to_string_pretty`, but with the serializer set to BytesMode::Base64.
+#[inline]
+pub fn to_string_pretty<T>(value: &T) -> Result<String>
+where
+    T: ?Sized + Serialize,
+{
+    let bytes = tri!(to_vec_pretty(value));
+    // SAFETY: We do not emit invalid UTF-8.
+    Ok(unsafe { String::from_utf8_unchecked(bytes) })
+}
+
+/// Converts `value` into a `Value`, like `to_value`, but with the value
+/// serializer set to BytesMode::Base64.
+pub fn to_value<T>(value: T) -> Result<value::Value>
+where
+    T: Serialize,
+{
+    let serializer = value::Serializer::with_bytes_mode(BytesMode::Base64);
+    value.serialize(serializer)
+}
diff --git a/src/de.rs b/src/de.rs
@@ -5,8 +5,11 @@ use crate::lib::str::FromStr;
 use crate::lib::*;
 use crate::number::Number;
 use crate::read::{self, Fused, Reference};
+use crate::BytesMode;
 use serde::de::{self, Expected, Unexpected};
 use serde::{forward_to_deserialize_any, serde_if_integer128};
+
+#[cfg(feature = "base64")]
 use b64_ct::FromBase64;
 
 #[cfg(feature = "arbitrary_precision")]
@@ -26,6 +29,7 @@ pub struct Deserializer<R> {
     remaining_depth: u8,
     #[cfg(feature = "unbounded_depth")]
     disable_recursion_limit: bool,
+    bytes_mode: BytesMode,
 }
 
 impl<'de, R> Deserializer<R>
@@ -47,6 +51,7 @@ where
                 read: read,
                 scratch: Vec::new(),
                 remaining_depth: 128,
+                bytes_mode: BytesMode::default(),
             }
         }
 
@@ -57,9 +62,19 @@ where
                 scratch: Vec::new(),
                 remaining_depth: 128,
                 disable_recursion_limit: false,
+                bytes_mode: BytesMode::default(),
             }
         }
     }
+
+    /// Create a JSON deserializer that reads from `read` and uses the given
+    /// encoding mode for bytes. All other settings are the ones `new` picks.
+    #[cfg(feature = "bytes_mode")]
+    pub fn with_bytes_mode(read: R, bytes_mode: BytesMode) -> Self {
+        let mut deserializer = Self::new(read);
+        deserializer.bytes_mode = bytes_mode;
+        deserializer
+    }
 }
 
 #[cfg(feature = "std")]
@@ -1333,7 +1348,7 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
         self.deserialize_str(visitor)
     }
 
-    /// Deserialize a base64-encoded string.
+    /// Deserialize bytes according to the deserializer's byte mode.
     fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value>
     where
         V: de::Visitor<'de>,
@@ -1349,9 +1364,24 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
             b'"' => {
                 self.eat_char();
                 self.scratch.clear();
-                let string = self.read.parse_str(&mut self.scratch)?;
-                visitor.visit_bytes(&string.from_base64()
-                    .map_err(|_| de::Error::invalid_value(de::Unexpected::Str(&string), &"base64 encoded string"))?)
+                match self.bytes_mode {
+                    BytesMode::IntegerArray => {
+                        match tri!(self.read.parse_str_raw(&mut self.scratch)) {
+                            Reference::Borrowed(b) => visitor.visit_borrowed_bytes(b),
+                            Reference::Copied(b) => visitor.visit_bytes(b),
+                        }
+                    }
+                    #[cfg(feature = "base64")]
+                    BytesMode::Base64 => {
+                        let string = self.read.parse_str(&mut self.scratch)?;
+                        visitor.visit_bytes(&string.from_base64().map_err(|_| {
+                            de::Error::invalid_value(
+                                de::Unexpected::Str(&string),
+                                &"base64 encoded string",
+                            )
+                        })?)
+                    }
+                }
             }
             b'[' => self.deserialize_seq(visitor),
             _ => Err(self.peek_invalid_type(&visitor)),

diff --git a/src/lib.rs b/src/lib.rs
@@ -428,6 +428,8 @@ macro_rules! tri {
 #[macro_use]
 mod macros;
 
+#[cfg(feature = "base64")]
+pub mod base64;
 pub mod de;
 pub mod error;
 pub mod map;
@@ -447,3 +449,132 @@ mod read;
 
 #[cfg(feature = "raw_value")]
 mod raw;
+
+/// Specifies how bytes should be (de)serialized.
+///
+/// JSON does not natively support binary data. Protocols can specify their own
+/// mechanisms to handle binary data in JSON. Serde JSON supports different
+/// modes, see the details for each variant.
+///
+/// Note that the byte deserialization mode is only checked for types that are
+/// deserialized as bytes (when a type directly calls `deserialize_bytes` or
+/// `deserialize_byte_buf`). Types that rely on the input being self-describing
+/// (when a type calls `deserialize_any`) can't be deserialized as bytes.
+///
+/// The default mode is `IntegerArray`, which is the only format Serde JSON
+/// used to support.
+///
+/// The `Base64` mode is enabled with the `base64` crate feature.
+#[cfg(feature = "bytes_mode")]
+#[derive(Clone, Eq, PartialEq, Debug, Hash)]
+#[non_exhaustive]
+pub enum BytesMode {
+    /// Use integer arrays to represent bytes
+    ///
+    /// # Serialization
+    /// Bytes are serialized as a JSON array of integers, each element
+    /// representing one byte.
+    ///
+    /// # Deserialization
+    /// JSON arrays are deserialized as an array of integers, each element
+    /// representing one byte.
+    ///
+    /// JSON strings are parsed as raw bytes. It's not checked whether the
+    /// bytes represent a valid UTF-8 string.
+    ///
+    /// The relevant part of the JSON specification is Section 8.2 of [RFC 7159]:
+    ///
+    /// > When all the strings represented in a JSON text are composed entirely
+    /// > of Unicode characters (however escaped), then that JSON text is
+    /// > interoperable in the sense that all software implementations that
+    /// > parse it will agree on the contents of names and of string values in
+    /// > objects and arrays.
+    /// >
+    /// > However, the ABNF in this specification allows member names and string
+    /// > values to contain bit sequences that cannot encode Unicode characters;
+    /// > for example, "\uDEAD" (a single unpaired UTF-16 surrogate). Instances
+    /// > of this have been observed, for example, when a library truncates a
+    /// > UTF-16 string without checking whether the truncation split a
+    /// > surrogate pair.  The behavior of software that receives JSON texts
+    /// > containing such values is unpredictable; for example, implementations
+    /// > might return different values for the length of a string value or even
+    /// > suffer fatal runtime exceptions.
+    ///
+    /// [RFC 7159]: https://tools.ietf.org/html/rfc7159
+    ///
+    /// The behavior of serde_json is specified to fail on non-UTF-8 strings
+    /// when deserializing into Rust UTF-8 string types such as String, and
+    /// succeed with non-UTF-8 bytes when deserializing as bytes.
+    ///
+    /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
+    /// still checked if the hex number represents a valid Unicode code point.
+    ///
+    /// # Examples
+    ///
+    /// You can use this to parse JSON strings containing invalid UTF-8 bytes.
+    ///
+    /// ```
+    /// use serde_bytes::ByteBuf;
+    ///
+    /// fn look_at_bytes() -> Result<(), serde_json::Error> {
+    ///     let json_data = b"\"some bytes: \xe5\x00\xe5\"";
+    ///     let bytes: ByteBuf = serde_json::from_slice(json_data)?;
+    ///
+    ///     assert_eq!(b'\xe5', bytes[12]);
+    ///     assert_eq!(b'\0', bytes[13]);
+    ///     assert_eq!(b'\xe5', bytes[14]);
+    ///
+    ///     Ok(())
+    /// }
+    /// #
+    /// # look_at_bytes().unwrap();
+    /// ```
+    ///
+    /// Backslash escape sequences like `\n` are still interpreted and required
+    /// to be valid, and `\u` escape sequences are required to represent valid
+    /// Unicode code points.
+    ///
+    /// ```
+    /// use serde_bytes::ByteBuf;
+    ///
+    /// fn look_at_bytes() {
+    ///     let json_data = b"\"invalid unicode surrogate: \\uD801\"";
+    ///     let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
+    ///
+    ///     assert!(parsed.is_err());
+    ///
+    ///     let expected_msg = "unexpected end of hex escape at line 1 column 35";
+    ///     assert_eq!(expected_msg, parsed.unwrap_err().to_string());
+    /// }
+    /// #
+    /// # look_at_bytes();
+    /// ```
+    IntegerArray,
+    /// Use base64-encoded strings to represent bytes
+    ///
+    /// Requires the `base64` crate feature.
+    ///
+    /// # Serialization
+    /// Bytes are serialized as a base64-encoded string.
+    ///
+    /// # Deserialization
+    /// JSON strings are deserialized as base64-encoded binary data.
+    ///
+    /// JSON arrays are deserialized as an array of integers, each element
+    /// representing one byte.
+    #[cfg(feature = "base64")]
+    Base64,
+}
+
+// Crate-private stand-in compiled when the `bytes_mode` feature is disabled.
+// It lets internal code name `BytesMode` unconditionally; only the
+// integer-array mode exists in this configuration.
+#[cfg(not(feature = "bytes_mode"))]
+#[derive(Clone, Eq, PartialEq, Debug, Hash)]
+enum BytesMode {
+    IntegerArray,
+}
+
+impl Default for BytesMode {
+    /// The default mode is `BytesMode::IntegerArray`.
+    fn default() -> Self {
+        BytesMode::IntegerArray
+    }
+}