Skip to content

Commit

Permalink
Fix incorrect reading of CDATA and comments when end sequence crosses…
Browse files Browse the repository at this point in the history
… the boundary of chunks in buffered reader

The bug was introduced in f2b99f0
  • Loading branch information
Mingun committed Sep 6, 2022
1 parent e052a46 commit 75823d5
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 9 deletions.
2 changes: 1 addition & 1 deletion src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ macro_rules! impl_buffered_source {
// somewhere sane rather than at the EOF
Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
Ok(available) => {
if let Some((consumed, used)) = bang_type.parse(available, read) {
if let Some((consumed, used)) = bang_type.parse(buf, available) {
buf.extend_from_slice(consumed);

self $(.$reader)? .consume(used);
Expand Down
37 changes: 31 additions & 6 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -742,25 +742,50 @@ impl BangType {

/// If element is finished, returns its content up to `>` symbol and
/// an index of this symbol, otherwise returns `None`
///
/// # Parameters
/// - `buf`: buffer with data consumed on previous iterations
/// - `chunk`: data read on current iteration and not yet consumed from reader
#[inline(always)]
fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> {
fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
for i in memchr::memchr_iter(b'>', chunk) {
match self {
// Need to read at least 6 symbols (`!---->`) for properly finished comment
// <!----> - XML comment
// 012345 - i
Self::Comment => {
if offset + i > 4 && chunk[..i].ends_with(b"--") {
Self::Comment if buf.len() + i > 4 => {
if chunk[..i].ends_with(b"--") {
// We cannot strip last `--` from the buffer because we need it in case of
// check_comments enabled option. XML standard requires that comment
// will not end with `--->` sequence because this is a special case of
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `-|->` was splitted at |
// buf --/ \-- chunk
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `--|>` was splitted at |
// buf --/ \-- chunk
if i == 0 && buf.ends_with(b"--") {
return Some((&[], i + 1)); // +1 for `>`
}
}
Self::Comment => {}
Self::CData => {
if chunk[..i].ends_with(b"]]") {
return Some((&chunk[..i - 2], i + 1)); // +1 for `>`
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `]|]>` was splitted at |
// buf --/ \-- chunk
if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `]]|>` was splitted at |
// buf --/ \-- chunk
if i == 0 && buf.ends_with(b"]]") {
return Some((&[], i + 1)); // +1 for `>`
}
}
Self::DocType => {
Expand Down Expand Up @@ -1021,7 +1046,7 @@ mod test {
$(.$await)?
.unwrap()
.map(|(ty, data)| (ty, Bytes(data))),
Some((BangType::CData, Bytes(b"![CDATA[")))
Some((BangType::CData, Bytes(b"![CDATA[]]")))
);
assert_eq!(position, 11);
}
Expand All @@ -1042,7 +1067,7 @@ mod test {
$(.$await)?
.unwrap()
.map(|(ty, data)| (ty, Bytes(data))),
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content")))
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
);
assert_eq!(position, 28);
}
Expand Down
7 changes: 6 additions & 1 deletion src/reader/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ impl Parser {
let len = buf.len();
match bang_type {
BangType::Comment if buf.starts_with(b"!--") => {
debug_assert!(buf.ends_with(b"--"));
if self.check_comments {
// search if '--' not in comments
if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
Expand All @@ -105,7 +106,11 @@ impl Parser {
)))
}
BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
Ok(Event::CData(BytesCData::wrap(&buf[8..], self.decoder())))
debug_assert!(buf.ends_with(b"]]"));
Ok(Event::CData(BytesCData::wrap(
&buf[8..len - 2],
self.decoder(),
)))
}
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
let start = buf[8..]
Expand Down
2 changes: 1 addition & 1 deletion src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {

let bang_type = BangType::new(self[1..].first().copied())?;

if let Some((bytes, i)) = bang_type.parse(self, 0) {
if let Some((bytes, i)) = bang_type.parse(&[], self) {
*position += i;
*self = &self[i..];
return Ok(Some((bang_type, bytes)));
Expand Down

0 comments on commit 75823d5

Please sign in to comment.