Split EncodingError from the Error type

This mostly allows decode functions to return a smaller, more accurate error type
tafia · Oct 12, 2024 · a975a82 · a975a82
1 parent 6dbd39a
commit a975a82
Show file tree

Hide file tree

Showing 6 changed files with 94 additions and 61 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -22,7 +22,7 @@
 - [#227]: Split `SeError` from `DeError` in the `serialize` feature.
   Serialize functions and methods now return `SeError`.
 - [#810]: Return `std::io::Error` from `Writer` methods.
-- [#811]: Split `NamespaceError` from `Error`.
+- [#811]: Split `NamespaceError` and `EncodingError` from `Error`.
 
 [#227]: https://github.com/tafia/quick-xml/issues/227
 [#810]: https://github.com/tafia/quick-xml/pull/810

diff --git a/examples/read_nodes.rs b/examples/read_nodes.rs
@@ -90,7 +90,10 @@ impl Translation {
                 })
             } else {
                 dbg!("Expected Event::Start for Text, got: {:?}", &event);
-                let name_string = reader.decoder().decode(name.as_ref())?;
+                let name_string = reader
+                    .decoder()
+                    .decode(name.as_ref())
+                    .map_err(quick_xml::Error::Encoding)?;
                 Err(AppError::NoText(name_string.into()))
             }
         } else {

diff --git a/src/encoding.rs b/src/encoding.rs
@@ -1,14 +1,11 @@
 //! A module for wrappers that encode / decode data.
 
 use std::borrow::Cow;
+use std::str::Utf8Error;
 
 #[cfg(feature = "encoding")]
 use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};
 
-#[cfg(feature = "encoding")]
-use crate::Error;
-use crate::Result;
-
 /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
 /// See <https://unicode.org/faq/utf_bom.html#bom1>
 pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
@@ -21,6 +18,48 @@ pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
 #[cfg(feature = "encoding")]
 pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
 
+/// An error when decoding or encoding
+///
+/// If feature [`encoding`] is disabled, the [`EncodingError`] is always [`EncodingError::Utf8`]
+///
+/// [`encoding`]: ../index.html#encoding
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum EncodingError {
+    /// Input was not valid UTF-8
+    Utf8(Utf8Error),
+    /// Input did not adhere to the given encoding
+    #[cfg(feature = "encoding")]
+    Other(&'static Encoding),
+}
+
+impl From<Utf8Error> for EncodingError {
+    #[inline]
+    fn from(e: Utf8Error) -> Self {
+        Self::Utf8(e)
+    }
+}
+
+impl std::error::Error for EncodingError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            Self::Utf8(e) => Some(e),
+            #[cfg(feature = "encoding")]
+            Self::Other(_) => None,
+        }
+    }
+}
+
+impl std::fmt::Display for EncodingError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Utf8(e) => write!(f, "cannot decode input using UTF-8: {}", e),
+            #[cfg(feature = "encoding")]
+            Self::Other(encoding) => write!(f, "cannot decode input using {}", encoding.name()),
+        }
+    }
+}
+
 /// Decoder of byte slices into strings.
 ///
 /// If feature [`encoding`] is enabled, this encoding taken from the `"encoding"`
@@ -79,7 +118,7 @@ impl Decoder {
     ///
     /// ----
     /// Returns an error in case of malformed sequences in the `bytes`.
-    pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
+    pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>, EncodingError> {
         #[cfg(not(feature = "encoding"))]
         let decoded = Ok(Cow::Borrowed(std::str::from_utf8(bytes)?));
 
@@ -90,7 +129,7 @@ impl Decoder {
     }
 
     /// Like [`decode`][Self::decode] but using a pre-allocated buffer.
-    pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<()> {
+    pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<(), EncodingError> {
         #[cfg(not(feature = "encoding"))]
         buf.push_str(std::str::from_utf8(bytes)?);
 
@@ -101,7 +140,10 @@ impl Decoder {
     }
 
     /// Decodes the `Cow` buffer, preserves the lifetime
-    pub(crate) fn decode_cow<'b>(&self, bytes: &Cow<'b, [u8]>) -> Result<Cow<'b, str>> {
+    pub(crate) fn decode_cow<'b>(
+        &self,
+        bytes: &Cow<'b, [u8]>,
+    ) -> Result<Cow<'b, str>, EncodingError> {
         match bytes {
             Cow::Borrowed(bytes) => self.decode(bytes),
             // Convert to owned, because otherwise Cow will be bound with wrong lifetime
@@ -114,15 +156,22 @@ impl Decoder {
 ///
 /// Returns an error in case of malformed or non-representable sequences in the `bytes`.
 #[cfg(feature = "encoding")]
-pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
+pub fn decode<'b>(
+    bytes: &'b [u8],
+    encoding: &'static Encoding,
+) -> Result<Cow<'b, str>, EncodingError> {
     encoding
         .decode_without_bom_handling_and_without_replacement(bytes)
-        .ok_or(Error::NonDecodable(None))
+        .ok_or(EncodingError::Other(encoding))
 }
 
 /// Like [`decode`] but using a pre-allocated buffer.
 #[cfg(feature = "encoding")]
-pub fn decode_into(bytes: &[u8], encoding: &'static Encoding, buf: &mut String) -> Result<()> {
+pub fn decode_into(
+    bytes: &[u8],
+    encoding: &'static Encoding,
+    buf: &mut String,
+) -> Result<(), EncodingError> {
     if encoding == UTF_8 {
         buf.push_str(std::str::from_utf8(bytes)?);
         return Ok(());
@@ -142,7 +191,7 @@ pub fn decode_into(bytes: &[u8], encoding: &'static Encoding, buf: &mut String)
             debug_assert_eq!(read, bytes.len());
             Ok(())
         }
-        DecoderResult::Malformed(_, _) => Err(Error::NonDecodable(None)),
+        DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)),
         // SAFETY: We allocate enough space above
         DecoderResult::OutputFull => unreachable!(),
     }

diff --git a/src/errors.rs b/src/errors.rs
@@ -1,13 +1,11 @@
 //! Error management module
 
-use crate::encoding::Decoder;
+use crate::encoding::{Decoder, EncodingError};
 use crate::escape::EscapeError;
 use crate::events::attributes::AttrError;
 use crate::name::{NamespaceError, QName};
 use std::fmt;
 use std::io::Error as IoError;
-use std::str::Utf8Error;
-use std::string::FromUtf8Error;
 use std::sync::Arc;
 
 /// An error returned if parsed document does not correspond to the XML grammar,
@@ -165,13 +163,10 @@ pub enum Error {
     Syntax(SyntaxError),
     /// The document is not [well-formed](https://www.w3.org/TR/xml11/#dt-wellformed).
     IllFormed(IllFormedError),
-    /// Input decoding error. If [`encoding`] feature is disabled, contains `None`,
-    /// otherwise contains the UTF-8 decoding error
-    ///
-    /// [`encoding`]: index.html#encoding
-    NonDecodable(Option<Utf8Error>),
     /// Attribute parsing error
     InvalidAttr(AttrError),
+    /// Encoding error
+    Encoding(EncodingError),
     /// Escape error
     EscapeError(EscapeError),
     /// Parsed XML has some namespace-related problems
@@ -211,19 +206,11 @@ impl From<IllFormedError> for Error {
     }
 }
 
-impl From<Utf8Error> for Error {
-    /// Creates a new `Error::NonDecodable` from the given error
-    #[inline]
-    fn from(error: Utf8Error) -> Error {
-        Self::NonDecodable(Some(error))
-    }
-}
-
-impl From<FromUtf8Error> for Error {
-    /// Creates a new `Error::Utf8` from the given error
+impl From<EncodingError> for Error {
+    /// Creates a new `Error::Encoding` from the given error
     #[inline]
-    fn from(error: FromUtf8Error) -> Error {
-        error.utf8_error().into()
+    fn from(error: EncodingError) -> Error {
+        Self::Encoding(error)
     }
 }
 
@@ -258,9 +245,8 @@ impl fmt::Display for Error {
             Self::Io(e) => write!(f, "I/O error: {}", e),
             Self::Syntax(e) => write!(f, "syntax error: {}", e),
             Self::IllFormed(e) => write!(f, "ill-formed document: {}", e),
-            Self::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
-            Self::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
             Self::InvalidAttr(e) => write!(f, "error while parsing attribute: {}", e),
+            Self::Encoding(e) => e.fmt(f),
             Self::EscapeError(e) => e.fmt(f),
             Self::Namespace(e) => e.fmt(f),
         }
@@ -273,11 +259,10 @@ impl std::error::Error for Error {
             Self::Io(e) => Some(e),
             Self::Syntax(e) => Some(e),
             Self::IllFormed(e) => Some(e),
-            Self::NonDecodable(Some(e)) => Some(e),
             Self::InvalidAttr(e) => Some(e),
+            Self::Encoding(e) => Some(e),
             Self::EscapeError(e) => Some(e),
             Self::Namespace(e) => Some(e),
-            _ => None,
         }
     }
 }
@@ -292,6 +277,7 @@ pub mod serialize {
     #[cfg(feature = "overlapped-lists")]
     use std::num::NonZeroUsize;
     use std::num::{ParseFloatError, ParseIntError};
+    use std::str::Utf8Error;
 
     /// (De)serialization error
     #[derive(Clone, Debug)]
@@ -383,16 +369,9 @@ pub mod serialize {
         }
     }
 
-    impl From<Utf8Error> for DeError {
-        #[inline]
-        fn from(e: Utf8Error) -> Self {
-            Self::InvalidXml(e.into())
-        }
-    }
-
-    impl From<FromUtf8Error> for DeError {
+    impl From<EncodingError> for DeError {
         #[inline]
-        fn from(e: FromUtf8Error) -> Self {
+        fn from(e: EncodingError) -> Self {
             Self::InvalidXml(e.into())
         }
     }

diff --git a/src/events/mod.rs b/src/events/mod.rs
@@ -45,8 +45,8 @@ use std::mem::replace;
 use std::ops::Deref;
 use std::str::from_utf8;
 
-use crate::encoding::Decoder;
-use crate::errors::{Error, IllFormedError, Result};
+use crate::encoding::{Decoder, EncodingError};
+use crate::errors::{Error, IllFormedError};
 use crate::escape::{
     escape, minimal_escape, partial_escape, resolve_predefined_entity, unescape_with,
 };
@@ -297,7 +297,7 @@ impl<'a> BytesStart<'a> {
     pub fn try_get_attribute<N: AsRef<[u8]> + Sized>(
         &'a self,
         attr_name: N,
-    ) -> Result<Option<Attribute<'a>>> {
+    ) -> Result<Option<Attribute<'a>>, Error> {
         for a in self.attributes().with_checks(false) {
             let a = a?;
             if a.key.as_ref() == attr_name.as_ref() {
@@ -583,7 +583,7 @@ impl<'a> BytesText<'a> {
     ///
     /// This will allocate if the value contains any escape sequences or in
     /// non-UTF-8 encoding.
-    pub fn unescape(&self) -> Result<Cow<'a, str>> {
+    pub fn unescape(&self) -> Result<Cow<'a, str>, Error> {
         self.unescape_with(resolve_predefined_entity)
     }
 
@@ -594,7 +594,7 @@ impl<'a> BytesText<'a> {
     pub fn unescape_with<'entity>(
         &self,
         resolve_entity: impl FnMut(&str) -> Option<&'entity str>,
-    ) -> Result<Cow<'a, str>> {
+    ) -> Result<Cow<'a, str>, Error> {
         let decoded = self.decoder.decode_cow(&self.content)?;
 
         match unescape_with(&decoded, resolve_entity)? {
@@ -743,7 +743,7 @@ impl<'a> BytesCData<'a> {
     /// | `&`       | `&amp;`
     /// | `'`       | `&apos;`
     /// | `"`       | `&quot;`
-    pub fn escape(self) -> Result<BytesText<'a>> {
+    pub fn escape(self) -> Result<BytesText<'a>, EncodingError> {
         let decoded = self.decode()?;
         Ok(BytesText::wrap(
             match escape(&decoded) {
@@ -768,7 +768,7 @@ impl<'a> BytesCData<'a> {
     /// | `<`       | `&lt;`
     /// | `>`       | `&gt;`
     /// | `&`       | `&amp;`
-    pub fn partial_escape(self) -> Result<BytesText<'a>> {
+    pub fn partial_escape(self) -> Result<BytesText<'a>, EncodingError> {
         let decoded = self.decode()?;
         Ok(BytesText::wrap(
             match partial_escape(&decoded) {
@@ -792,7 +792,7 @@ impl<'a> BytesCData<'a> {
     /// | `&`       | `&amp;`
     ///
     /// [specification]: https://www.w3.org/TR/xml11/#syntax
-    pub fn minimal_escape(self) -> Result<BytesText<'a>> {
+    pub fn minimal_escape(self) -> Result<BytesText<'a>, EncodingError> {
         let decoded = self.decode()?;
         Ok(BytesText::wrap(
             match minimal_escape(&decoded) {
@@ -805,8 +805,8 @@ impl<'a> BytesCData<'a> {
     }
 
     /// Gets content of this text buffer in the specified encoding
-    pub(crate) fn decode(&self) -> Result<Cow<'a, str>> {
-        self.decoder.decode_cow(&self.content)
+    pub(crate) fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
+        Ok(self.decoder.decode_cow(&self.content)?)
     }
 }
 
@@ -1136,13 +1136,15 @@ impl<'a> BytesDecl<'a> {
     /// ```
     ///
     /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
-    pub fn version(&self) -> Result<Cow<[u8]>> {
+    pub fn version(&self) -> Result<Cow<[u8]>, Error> {
         // The version *must* be the first thing in the declaration.
         match self.content.attributes().with_checks(false).next() {
             Some(Ok(a)) if a.key.as_ref() == b"version" => Ok(a.value),
             // first attribute was not "version"
             Some(Ok(a)) => {
-                let found = from_utf8(a.key.as_ref())?.to_string();
+                let found = from_utf8(a.key.as_ref())
+                    .map_err(|_| IllFormedError::MissingDeclVersion(None))?
+                    .to_string();
                 Err(Error::IllFormed(IllFormedError::MissingDeclVersion(Some(
                     found,
                 ))))
@@ -1189,7 +1191,7 @@ impl<'a> BytesDecl<'a> {
     /// ```
     ///
     /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
-    pub fn encoding(&self) -> Option<Result<Cow<[u8]>>> {
+    pub fn encoding(&self) -> Option<Result<Cow<[u8]>, Error>> {
         self.content
             .try_get_attribute("encoding")
             .map(|a| a.map(|a| a.value))
@@ -1231,7 +1233,7 @@ impl<'a> BytesDecl<'a> {
     /// ```
     ///
     /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
-    pub fn standalone(&self) -> Option<Result<Cow<[u8]>>> {
+    pub fn standalone(&self) -> Option<Result<Cow<[u8]>, Error>> {
         self.content
             .try_get_attribute("standalone")
             .map(|a| a.map(|a| a.value))

diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs
@@ -233,7 +233,7 @@ impl<'a> Reader<&'a [u8]> {
         let len = span.end - span.start;
         // SAFETY: `span` can only contain indexes up to usize::MAX because it
         // was created from offsets from a single &[u8] slice
-        self.decoder().decode(&buffer[0..len as usize])
+        Ok(self.decoder().decode(&buffer[0..len as usize])?)
     }
 }