diff --git a/Changelog.md b/Changelog.md index d05a1ed8..b41035a1 100644 --- a/Changelog.md +++ b/Changelog.md @@ -22,7 +22,7 @@ - [#227]: Split `SeError` from `DeError` in the `serialize` feature. Serialize functions and methods now return `SeError`. - [#810]: Return `std::io::Error` from `Writer` methods. -- [#811]: Split `NamespaceError` from `Error`. +- [#811]: Split `NamespaceError` and `EncodingError` from `Error`. [#227]: https://github.com/tafia/quick-xml/issues/227 [#810]: https://github.com/tafia/quick-xml/pull/810 diff --git a/examples/read_nodes.rs b/examples/read_nodes.rs index 6c32f486..50a5f90d 100644 --- a/examples/read_nodes.rs +++ b/examples/read_nodes.rs @@ -90,7 +90,10 @@ impl Translation { }) } else { dbg!("Expected Event::Start for Text, got: {:?}", &event); - let name_string = reader.decoder().decode(name.as_ref())?; + let name_string = reader + .decoder() + .decode(name.as_ref()) + .map_err(quick_xml::Error::Encoding)?; Err(AppError::NoText(name_string.into())) } } else { diff --git a/src/encoding.rs b/src/encoding.rs index 120d793b..44f6c82d 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -1,14 +1,11 @@ //! A module for wrappers that encode / decode data. use std::borrow::Cow; +use std::str::Utf8Error; #[cfg(feature = "encoding")] use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8}; -#[cfg(feature = "encoding")] -use crate::Error; -use crate::Result; - /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8. 
/// See pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF]; @@ -21,6 +18,48 @@ pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE]; #[cfg(feature = "encoding")] pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF]; +/// An error when decoding or encoding +/// +/// If feature [`encoding`] is disabled, the [`EncodingError`] is always [`EncodingError::Utf8`] +/// +/// [`encoding`]: ../index.html#encoding +#[derive(Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodingError { + /// Input was not valid UTF-8 + Utf8(Utf8Error), + /// Input did not adhere to the given encoding + #[cfg(feature = "encoding")] + Other(&'static Encoding), +} + +impl From for EncodingError { + #[inline] + fn from(e: Utf8Error) -> Self { + Self::Utf8(e) + } +} + +impl std::error::Error for EncodingError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Utf8(e) => Some(e), + #[cfg(feature = "encoding")] + Self::Other(_) => None, + } + } +} + +impl std::fmt::Display for EncodingError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Utf8(e) => write!(f, "cannot decode input using UTF-8: {}", e), + #[cfg(feature = "encoding")] + Self::Other(encoding) => write!(f, "cannot decode input using {}", encoding.name()), + } + } +} + /// Decoder of byte slices into strings. /// /// If feature [`encoding`] is enabled, this encoding taken from the `"encoding"` @@ -79,7 +118,7 @@ impl Decoder { /// /// ---- /// Returns an error in case of malformed sequences in the `bytes`. - pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result, EncodingError> { #[cfg(not(feature = "encoding"))] let decoded = Ok(Cow::Borrowed(std::str::from_utf8(bytes)?)); @@ -90,7 +129,7 @@ impl Decoder { } /// Like [`decode`][Self::decode] but using a pre-allocated buffer. 
- pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<()> { + pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<(), EncodingError> { #[cfg(not(feature = "encoding"))] buf.push_str(std::str::from_utf8(bytes)?); @@ -101,7 +140,10 @@ impl Decoder { } /// Decodes the `Cow` buffer, preserves the lifetime - pub(crate) fn decode_cow<'b>(&self, bytes: &Cow<'b, [u8]>) -> Result> { + pub(crate) fn decode_cow<'b>( + &self, + bytes: &Cow<'b, [u8]>, + ) -> Result, EncodingError> { match bytes { Cow::Borrowed(bytes) => self.decode(bytes), // Convert to owned, because otherwise Cow will be bound with wrong lifetime @@ -114,15 +156,22 @@ impl Decoder { /// /// Returns an error in case of malformed or non-representable sequences in the `bytes`. #[cfg(feature = "encoding")] -pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result> { +pub fn decode<'b>( + bytes: &'b [u8], + encoding: &'static Encoding, +) -> Result, EncodingError> { encoding .decode_without_bom_handling_and_without_replacement(bytes) - .ok_or(Error::NonDecodable(None)) + .ok_or(EncodingError::Other(encoding)) } /// Like [`decode`] but using a pre-allocated buffer. 
#[cfg(feature = "encoding")] -pub fn decode_into(bytes: &[u8], encoding: &'static Encoding, buf: &mut String) -> Result<()> { +pub fn decode_into( + bytes: &[u8], + encoding: &'static Encoding, + buf: &mut String, +) -> Result<(), EncodingError> { if encoding == UTF_8 { buf.push_str(std::str::from_utf8(bytes)?); return Ok(()); @@ -142,7 +191,7 @@ pub fn decode_into(bytes: &[u8], encoding: &'static Encoding, buf: &mut String) debug_assert_eq!(read, bytes.len()); Ok(()) } - DecoderResult::Malformed(_, _) => Err(Error::NonDecodable(None)), + DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)), // SAFETY: We allocate enough space above DecoderResult::OutputFull => unreachable!(), } diff --git a/src/errors.rs b/src/errors.rs index 4765234d..a7805b07 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,13 +1,11 @@ //! Error management module -use crate::encoding::Decoder; +use crate::encoding::{Decoder, EncodingError}; use crate::escape::EscapeError; use crate::events::attributes::AttrError; use crate::name::{NamespaceError, QName}; use std::fmt; use std::io::Error as IoError; -use std::str::Utf8Error; -use std::string::FromUtf8Error; use std::sync::Arc; /// An error returned if parsed document does not correspond to the XML grammar, @@ -165,13 +163,10 @@ pub enum Error { Syntax(SyntaxError), /// The document is not [well-formed](https://www.w3.org/TR/xml11/#dt-wellformed). IllFormed(IllFormedError), - /// Input decoding error. 
If [`encoding`] feature is disabled, contains `None`, - /// otherwise contains the UTF-8 decoding error - /// - /// [`encoding`]: index.html#encoding - NonDecodable(Option), /// Attribute parsing error InvalidAttr(AttrError), + /// Encoding error + Encoding(EncodingError), /// Escape error EscapeError(EscapeError), /// Parsed XML has some namespace-related problems @@ -211,19 +206,11 @@ impl From for Error { } } -impl From for Error { - /// Creates a new `Error::NonDecodable` from the given error - #[inline] - fn from(error: Utf8Error) -> Error { - Self::NonDecodable(Some(error)) - } -} - -impl From for Error { - /// Creates a new `Error::Utf8` from the given error +impl From for Error { + /// Creates a new `Error::Encoding` from the given error #[inline] - fn from(error: FromUtf8Error) -> Error { - error.utf8_error().into() + fn from(error: EncodingError) -> Error { + Self::Encoding(error) } } @@ -258,9 +245,8 @@ impl fmt::Display for Error { Self::Io(e) => write!(f, "I/O error: {}", e), Self::Syntax(e) => write!(f, "syntax error: {}", e), Self::IllFormed(e) => write!(f, "ill-formed document: {}", e), - Self::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"), - Self::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e), Self::InvalidAttr(e) => write!(f, "error while parsing attribute: {}", e), + Self::Encoding(e) => e.fmt(f), Self::EscapeError(e) => e.fmt(f), Self::Namespace(e) => e.fmt(f), } @@ -273,11 +259,10 @@ impl std::error::Error for Error { Self::Io(e) => Some(e), Self::Syntax(e) => Some(e), Self::IllFormed(e) => Some(e), - Self::NonDecodable(Some(e)) => Some(e), Self::InvalidAttr(e) => Some(e), + Self::Encoding(e) => Some(e), Self::EscapeError(e) => Some(e), Self::Namespace(e) => Some(e), - _ => None, } } } @@ -292,6 +277,7 @@ pub mod serialize { #[cfg(feature = "overlapped-lists")] use std::num::NonZeroUsize; use std::num::{ParseFloatError, ParseIntError}; + use std::str::Utf8Error; /// (De)serialization error 
#[derive(Clone, Debug)] @@ -383,16 +369,9 @@ pub mod serialize { } } - impl From for DeError { - #[inline] - fn from(e: Utf8Error) -> Self { - Self::InvalidXml(e.into()) - } - } - - impl From for DeError { + impl From for DeError { #[inline] - fn from(e: FromUtf8Error) -> Self { + fn from(e: EncodingError) -> Self { Self::InvalidXml(e.into()) } } diff --git a/src/events/mod.rs b/src/events/mod.rs index c9acb5b7..07df95b4 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -45,8 +45,8 @@ use std::mem::replace; use std::ops::Deref; use std::str::from_utf8; -use crate::encoding::Decoder; -use crate::errors::{Error, IllFormedError, Result}; +use crate::encoding::{Decoder, EncodingError}; +use crate::errors::{Error, IllFormedError}; use crate::escape::{ escape, minimal_escape, partial_escape, resolve_predefined_entity, unescape_with, }; @@ -297,7 +297,7 @@ impl<'a> BytesStart<'a> { pub fn try_get_attribute + Sized>( &'a self, attr_name: N, - ) -> Result>> { + ) -> Result>, Error> { for a in self.attributes().with_checks(false) { let a = a?; if a.key.as_ref() == attr_name.as_ref() { @@ -583,7 +583,7 @@ impl<'a> BytesText<'a> { /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. - pub fn unescape(&self) -> Result> { + pub fn unescape(&self) -> Result, Error> { self.unescape_with(resolve_predefined_entity) } @@ -594,7 +594,7 @@ impl<'a> BytesText<'a> { pub fn unescape_with<'entity>( &self, resolve_entity: impl FnMut(&str) -> Option<&'entity str>, - ) -> Result> { + ) -> Result, Error> { let decoded = self.decoder.decode_cow(&self.content)?; match unescape_with(&decoded, resolve_entity)? 
{ @@ -743,7 +743,7 @@ impl<'a> BytesCData<'a> { /// | `&` | `&` /// | `'` | `'` /// | `"` | `"` - pub fn escape(self) -> Result> { + pub fn escape(self) -> Result, EncodingError> { let decoded = self.decode()?; Ok(BytesText::wrap( match escape(&decoded) { @@ -768,7 +768,7 @@ impl<'a> BytesCData<'a> { /// | `<` | `<` /// | `>` | `>` /// | `&` | `&` - pub fn partial_escape(self) -> Result> { + pub fn partial_escape(self) -> Result, EncodingError> { let decoded = self.decode()?; Ok(BytesText::wrap( match partial_escape(&decoded) { @@ -792,7 +792,7 @@ impl<'a> BytesCData<'a> { /// | `&` | `&` /// /// [specification]: https://www.w3.org/TR/xml11/#syntax - pub fn minimal_escape(self) -> Result> { + pub fn minimal_escape(self) -> Result, EncodingError> { let decoded = self.decode()?; Ok(BytesText::wrap( match minimal_escape(&decoded) { @@ -805,8 +805,8 @@ impl<'a> BytesCData<'a> { } /// Gets content of this text buffer in the specified encoding - pub(crate) fn decode(&self) -> Result> { - self.decoder.decode_cow(&self.content) + pub(crate) fn decode(&self) -> Result, EncodingError> { + Ok(self.decoder.decode_cow(&self.content)?) } } @@ -1136,13 +1136,15 @@ impl<'a> BytesDecl<'a> { /// ``` /// /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl - pub fn version(&self) -> Result> { + pub fn version(&self) -> Result, Error> { // The version *must* be the first thing in the declaration. match self.content.attributes().with_checks(false).next() { Some(Ok(a)) if a.key.as_ref() == b"version" => Ok(a.value), // first attribute was not "version" Some(Ok(a)) => { - let found = from_utf8(a.key.as_ref())?.to_string(); + let found = from_utf8(a.key.as_ref()) + .map_err(|_| IllFormedError::MissingDeclVersion(None))? 
+ .to_string(); Err(Error::IllFormed(IllFormedError::MissingDeclVersion(Some( found, )))) @@ -1189,7 +1191,7 @@ impl<'a> BytesDecl<'a> { /// ``` /// /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl - pub fn encoding(&self) -> Option>> { + pub fn encoding(&self) -> Option, Error>> { self.content .try_get_attribute("encoding") .map(|a| a.map(|a| a.value)) @@ -1231,7 +1233,7 @@ impl<'a> BytesDecl<'a> { /// ``` /// /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl - pub fn standalone(&self) -> Option>> { + pub fn standalone(&self) -> Option, Error>> { self.content .try_get_attribute("standalone") .map(|a| a.map(|a| a.value)) diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index e1082cac..4aadc7bf 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -233,7 +233,7 @@ impl<'a> Reader<&'a [u8]> { let len = span.end - span.start; // SAFETY: `span` can only contain indexes up to usize::MAX because it // was created from offsets from a single &[u8] slice - self.decoder().decode(&buffer[0..len as usize]) + Ok(self.decoder().decode(&buffer[0..len as usize])?) } }