Skip to content

Commit

Permalink
Split EncodingError from the Error type
Browse files Browse the repository at this point in the history
This mostly allows for decode functions to return a smaller more
accurate error
  • Loading branch information
RedPhoenixQ authored and Mingun committed Oct 12, 2024
1 parent 6dbd39a commit a975a82
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 61 deletions.
2 changes: 1 addition & 1 deletion Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
- [#227]: Split `SeError` from `DeError` in the `serialize` feature.
Serialize functions and methods now return `SeError`.
- [#810]: Return `std::io::Error` from `Writer` methods.
- [#811]: Split `NamespaceError` from `Error`.
- [#811]: Split `NamespaceError` and `EncodingError` from `Error`.

[#227]: https://github.com/tafia/quick-xml/issues/227
[#810]: https://github.com/tafia/quick-xml/pull/810
Expand Down
5 changes: 4 additions & 1 deletion examples/read_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,10 @@ impl Translation {
})
} else {
dbg!("Expected Event::Start for Text, got: {:?}", &event);
let name_string = reader.decoder().decode(name.as_ref())?;
let name_string = reader
.decoder()
.decode(name.as_ref())
.map_err(quick_xml::Error::Encoding)?;
Err(AppError::NoText(name_string.into()))
}
} else {
Expand Down
71 changes: 60 additions & 11 deletions src/encoding.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
//! A module for wrappers that encode / decode data.
use std::borrow::Cow;
use std::str::Utf8Error;

#[cfg(feature = "encoding")]
use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};

#[cfg(feature = "encoding")]
use crate::Error;
use crate::Result;

/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
Expand All @@ -21,6 +18,48 @@ pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
#[cfg(feature = "encoding")]
pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];

/// An error raised while decoding or encoding
///
/// If feature [`encoding`] is disabled, the only variant that can be produced
/// is [`EncodingError::Utf8`]
///
/// [`encoding`]: ../index.html#encoding
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum EncodingError {
    /// The input bytes are not valid UTF-8; carries the underlying [`Utf8Error`]
    Utf8(Utf8Error),
    /// The input bytes do not adhere to the wrapped encoding
    #[cfg(feature = "encoding")]
    Other(&'static Encoding),
}

impl From<Utf8Error> for EncodingError {
    /// Wraps the given UTF-8 error into [`EncodingError::Utf8`]
    #[inline]
    fn from(error: Utf8Error) -> Self {
        EncodingError::Utf8(error)
    }
}

impl std::error::Error for EncodingError {
    /// Exposes the wrapped [`Utf8Error`] as the cause of the `Utf8` variant;
    /// the `Other` variant has no underlying cause
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match *self {
            EncodingError::Utf8(ref cause) => Some(cause),
            #[cfg(feature = "encoding")]
            EncodingError::Other(_) => None,
        }
    }
}

impl std::fmt::Display for EncodingError {
    /// Writes a message naming the encoding that failed; for the `Utf8`
    /// variant the message of the wrapped error is appended
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            EncodingError::Utf8(ref cause) => {
                f.write_fmt(format_args!("cannot decode input using UTF-8: {}", cause))
            }
            #[cfg(feature = "encoding")]
            EncodingError::Other(charset) => {
                f.write_fmt(format_args!("cannot decode input using {}", charset.name()))
            }
        }
    }
}

/// Decoder of byte slices into strings.
///
/// If feature [`encoding`] is enabled, this encoding taken from the `"encoding"`
Expand Down Expand Up @@ -79,7 +118,7 @@ impl Decoder {
///
/// ----
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>, EncodingError> {
#[cfg(not(feature = "encoding"))]
let decoded = Ok(Cow::Borrowed(std::str::from_utf8(bytes)?));

Expand All @@ -90,7 +129,7 @@ impl Decoder {
}

/// Like [`decode`][Self::decode] but using a pre-allocated buffer.
pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<()> {
pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<(), EncodingError> {
#[cfg(not(feature = "encoding"))]
buf.push_str(std::str::from_utf8(bytes)?);

Expand All @@ -101,7 +140,10 @@ impl Decoder {
}

/// Decodes the `Cow` buffer, preserves the lifetime
pub(crate) fn decode_cow<'b>(&self, bytes: &Cow<'b, [u8]>) -> Result<Cow<'b, str>> {
pub(crate) fn decode_cow<'b>(
&self,
bytes: &Cow<'b, [u8]>,
) -> Result<Cow<'b, str>, EncodingError> {
match bytes {
Cow::Borrowed(bytes) => self.decode(bytes),
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Expand All @@ -114,15 +156,22 @@ impl Decoder {
///
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
#[cfg(feature = "encoding")]
pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
pub fn decode<'b>(
bytes: &'b [u8],
encoding: &'static Encoding,
) -> Result<Cow<'b, str>, EncodingError> {
encoding
.decode_without_bom_handling_and_without_replacement(bytes)
.ok_or(Error::NonDecodable(None))
.ok_or(EncodingError::Other(encoding))
}

/// Like [`decode`] but using a pre-allocated buffer.
#[cfg(feature = "encoding")]
pub fn decode_into(bytes: &[u8], encoding: &'static Encoding, buf: &mut String) -> Result<()> {
pub fn decode_into(
bytes: &[u8],
encoding: &'static Encoding,
buf: &mut String,
) -> Result<(), EncodingError> {
if encoding == UTF_8 {
buf.push_str(std::str::from_utf8(bytes)?);
return Ok(());
Expand All @@ -142,7 +191,7 @@ pub fn decode_into(bytes: &[u8], encoding: &'static Encoding, buf: &mut String)
debug_assert_eq!(read, bytes.len());
Ok(())
}
DecoderResult::Malformed(_, _) => Err(Error::NonDecodable(None)),
DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)),
// SAFETY: We allocate enough space above
DecoderResult::OutputFull => unreachable!(),
}
Expand Down
45 changes: 12 additions & 33 deletions src/errors.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
//! Error management module
use crate::encoding::Decoder;
use crate::encoding::{Decoder, EncodingError};
use crate::escape::EscapeError;
use crate::events::attributes::AttrError;
use crate::name::{NamespaceError, QName};
use std::fmt;
use std::io::Error as IoError;
use std::str::Utf8Error;
use std::string::FromUtf8Error;
use std::sync::Arc;

/// An error returned if parsed document does not correspond to the XML grammar,
Expand Down Expand Up @@ -165,13 +163,10 @@ pub enum Error {
Syntax(SyntaxError),
/// The document is not [well-formed](https://www.w3.org/TR/xml11/#dt-wellformed).
IllFormed(IllFormedError),
/// Input decoding error. If [`encoding`] feature is disabled, contains `None`,
/// otherwise contains the UTF-8 decoding error
///
/// [`encoding`]: index.html#encoding
NonDecodable(Option<Utf8Error>),
/// Attribute parsing error
InvalidAttr(AttrError),
/// Encoding error
Encoding(EncodingError),
/// Escape error
EscapeError(EscapeError),
/// Parsed XML has some namespace-related problems
Expand Down Expand Up @@ -211,19 +206,11 @@ impl From<IllFormedError> for Error {
}
}

impl From<Utf8Error> for Error {
/// Creates a new `Error::NonDecodable` from the given error
#[inline]
fn from(error: Utf8Error) -> Error {
Self::NonDecodable(Some(error))
}
}

impl From<FromUtf8Error> for Error {
/// Creates a new `Error::Utf8` from the given error
impl From<EncodingError> for Error {
/// Creates a new `Error::Encoding` from the given error
#[inline]
fn from(error: FromUtf8Error) -> Error {
error.utf8_error().into()
fn from(error: EncodingError) -> Error {
Self::Encoding(error)
}
}

Expand Down Expand Up @@ -258,9 +245,8 @@ impl fmt::Display for Error {
Self::Io(e) => write!(f, "I/O error: {}", e),
Self::Syntax(e) => write!(f, "syntax error: {}", e),
Self::IllFormed(e) => write!(f, "ill-formed document: {}", e),
Self::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
Self::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
Self::InvalidAttr(e) => write!(f, "error while parsing attribute: {}", e),
Self::Encoding(e) => e.fmt(f),
Self::EscapeError(e) => e.fmt(f),
Self::Namespace(e) => e.fmt(f),
}
Expand All @@ -273,11 +259,10 @@ impl std::error::Error for Error {
Self::Io(e) => Some(e),
Self::Syntax(e) => Some(e),
Self::IllFormed(e) => Some(e),
Self::NonDecodable(Some(e)) => Some(e),
Self::InvalidAttr(e) => Some(e),
Self::Encoding(e) => Some(e),
Self::EscapeError(e) => Some(e),
Self::Namespace(e) => Some(e),
_ => None,
}
}
}
Expand All @@ -292,6 +277,7 @@ pub mod serialize {
#[cfg(feature = "overlapped-lists")]
use std::num::NonZeroUsize;
use std::num::{ParseFloatError, ParseIntError};
use std::str::Utf8Error;

/// (De)serialization error
#[derive(Clone, Debug)]
Expand Down Expand Up @@ -383,16 +369,9 @@ pub mod serialize {
}
}

impl From<Utf8Error> for DeError {
#[inline]
fn from(e: Utf8Error) -> Self {
Self::InvalidXml(e.into())
}
}

impl From<FromUtf8Error> for DeError {
impl From<EncodingError> for DeError {
#[inline]
fn from(e: FromUtf8Error) -> Self {
fn from(e: EncodingError) -> Self {
Self::InvalidXml(e.into())
}
}
Expand Down
30 changes: 16 additions & 14 deletions src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ use std::mem::replace;
use std::ops::Deref;
use std::str::from_utf8;

use crate::encoding::Decoder;
use crate::errors::{Error, IllFormedError, Result};
use crate::encoding::{Decoder, EncodingError};
use crate::errors::{Error, IllFormedError};
use crate::escape::{
escape, minimal_escape, partial_escape, resolve_predefined_entity, unescape_with,
};
Expand Down Expand Up @@ -297,7 +297,7 @@ impl<'a> BytesStart<'a> {
pub fn try_get_attribute<N: AsRef<[u8]> + Sized>(
&'a self,
attr_name: N,
) -> Result<Option<Attribute<'a>>> {
) -> Result<Option<Attribute<'a>>, Error> {
for a in self.attributes().with_checks(false) {
let a = a?;
if a.key.as_ref() == attr_name.as_ref() {
Expand Down Expand Up @@ -583,7 +583,7 @@ impl<'a> BytesText<'a> {
///
/// This will allocate if the value contains any escape sequences or in
/// non-UTF-8 encoding.
pub fn unescape(&self) -> Result<Cow<'a, str>> {
pub fn unescape(&self) -> Result<Cow<'a, str>, Error> {
self.unescape_with(resolve_predefined_entity)
}

Expand All @@ -594,7 +594,7 @@ impl<'a> BytesText<'a> {
pub fn unescape_with<'entity>(
&self,
resolve_entity: impl FnMut(&str) -> Option<&'entity str>,
) -> Result<Cow<'a, str>> {
) -> Result<Cow<'a, str>, Error> {
let decoded = self.decoder.decode_cow(&self.content)?;

match unescape_with(&decoded, resolve_entity)? {
Expand Down Expand Up @@ -743,7 +743,7 @@ impl<'a> BytesCData<'a> {
/// | `&` | `&amp;`
/// | `'` | `&apos;`
/// | `"` | `&quot;`
pub fn escape(self) -> Result<BytesText<'a>> {
pub fn escape(self) -> Result<BytesText<'a>, EncodingError> {
let decoded = self.decode()?;
Ok(BytesText::wrap(
match escape(&decoded) {
Expand All @@ -768,7 +768,7 @@ impl<'a> BytesCData<'a> {
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
pub fn partial_escape(self) -> Result<BytesText<'a>> {
pub fn partial_escape(self) -> Result<BytesText<'a>, EncodingError> {
let decoded = self.decode()?;
Ok(BytesText::wrap(
match partial_escape(&decoded) {
Expand All @@ -792,7 +792,7 @@ impl<'a> BytesCData<'a> {
/// | `&` | `&amp;`
///
/// [specification]: https://www.w3.org/TR/xml11/#syntax
pub fn minimal_escape(self) -> Result<BytesText<'a>> {
pub fn minimal_escape(self) -> Result<BytesText<'a>, EncodingError> {
let decoded = self.decode()?;
Ok(BytesText::wrap(
match minimal_escape(&decoded) {
Expand All @@ -805,8 +805,8 @@ impl<'a> BytesCData<'a> {
}

/// Gets content of this text buffer in the specified encoding
pub(crate) fn decode(&self) -> Result<Cow<'a, str>> {
self.decoder.decode_cow(&self.content)
pub(crate) fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
Ok(self.decoder.decode_cow(&self.content)?)
}
}

Expand Down Expand Up @@ -1136,13 +1136,15 @@ impl<'a> BytesDecl<'a> {
/// ```
///
/// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
pub fn version(&self) -> Result<Cow<[u8]>> {
pub fn version(&self) -> Result<Cow<[u8]>, Error> {
// The version *must* be the first thing in the declaration.
match self.content.attributes().with_checks(false).next() {
Some(Ok(a)) if a.key.as_ref() == b"version" => Ok(a.value),
// first attribute was not "version"
Some(Ok(a)) => {
let found = from_utf8(a.key.as_ref())?.to_string();
let found = from_utf8(a.key.as_ref())
.map_err(|_| IllFormedError::MissingDeclVersion(None))?
.to_string();
Err(Error::IllFormed(IllFormedError::MissingDeclVersion(Some(
found,
))))
Expand Down Expand Up @@ -1189,7 +1191,7 @@ impl<'a> BytesDecl<'a> {
/// ```
///
/// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
pub fn encoding(&self) -> Option<Result<Cow<[u8]>>> {
pub fn encoding(&self) -> Option<Result<Cow<[u8]>, Error>> {
self.content
.try_get_attribute("encoding")
.map(|a| a.map(|a| a.value))
Expand Down Expand Up @@ -1231,7 +1233,7 @@ impl<'a> BytesDecl<'a> {
/// ```
///
/// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
pub fn standalone(&self) -> Option<Result<Cow<[u8]>>> {
pub fn standalone(&self) -> Option<Result<Cow<[u8]>, Error>> {
self.content
.try_get_attribute("standalone")
.map(|a| a.map(|a| a.value))
Expand Down
2 changes: 1 addition & 1 deletion src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ impl<'a> Reader<&'a [u8]> {
let len = span.end - span.start;
// SAFETY: `span` can only contain indexes up to usize::MAX because it
// was created from offsets from a single &[u8] slice
self.decoder().decode(&buffer[0..len as usize])
Ok(self.decoder().decode(&buffer[0..len as usize])?)
}
}

Expand Down

0 comments on commit a975a82

Please sign in to comment.