Skip to content

Commit

Permalink
Rollup merge of #105076 - mina86:a, r=scottmcm
Browse files Browse the repository at this point in the history
Refactor core::char::EscapeDefault and co. structures

Change the core::char::{EscapeUnicode, EscapeDefault, EscapeDebug}
structures from using a state machine to computing the escaped sequence
upfront and, during iteration, simply stepping through its characters.

This is arguably simpler since it’s easier to think about having
a buffer and start..end range to iterate over rather than thinking
about a state machine.

This also harmonises the implementation of the aforementioned iterators
with the core::ascii::EscapeDefault struct.  This is done by introducing
a new helper EscapeIterInner struct which holds the buffer and offers
simple methods for iterating over its range.

As a side effect, this probably optimises the Display implementation for
those types since, rather than calling write_char repeatedly, write_str
is invoked once.  On 64-bit platforms, it also reduces the size of some
of the structs:

    | Struct                     | Before | After |
    |----------------------------+--------+-------|
    | core::char::EscapeUnicode  |     16 |    12 |
    | core::char::EscapeDefault  |     16 |    12 |
    | core::char::EscapeDebug    |     16 |    16 |

My ulterior motive, and the reason I started looking into this, is the
addition of an as_str method to the iterators.  With this change that
will become trivial.  It’s also going to be trivial to implement
DoubleEndedIterator if that’s ever desired.
  • Loading branch information
Dylan-DPC authored May 2, 2023
2 parents 1cb6357 + 76c9947 commit f916c44
Show file tree
Hide file tree
Showing 5 changed files with 273 additions and 206 deletions.
71 changes: 41 additions & 30 deletions library/core/src/ascii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
#![stable(feature = "core_ascii", since = "1.26.0")]

use crate::escape;
use crate::fmt;
use crate::iter::FusedIterator;
use crate::ops::Range;
use crate::str::from_utf8_unchecked;
use crate::num::NonZeroUsize;

/// An iterator over the escaped version of a byte.
///
Expand All @@ -21,10 +21,7 @@ use crate::str::from_utf8_unchecked;
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[stable(feature = "rust1", since = "1.0.0")]
#[derive(Clone)]
pub struct EscapeDefault {
range: Range<u8>,
data: [u8; 4],
}
pub struct EscapeDefault(escape::EscapeIterInner<4>);

/// Returns an iterator that produces an escaped version of a `u8`.
///
Expand Down Expand Up @@ -90,21 +87,9 @@ pub struct EscapeDefault {
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn escape_default(c: u8) -> EscapeDefault {
let (data, len) = match c {
b'\t' => ([b'\\', b't', 0, 0], 2),
b'\r' => ([b'\\', b'r', 0, 0], 2),
b'\n' => ([b'\\', b'n', 0, 0], 2),
b'\\' => ([b'\\', b'\\', 0, 0], 2),
b'\'' => ([b'\\', b'\'', 0, 0], 2),
b'"' => ([b'\\', b'"', 0, 0], 2),
b'\x20'..=b'\x7e' => ([c, 0, 0, 0], 1),
_ => {
let hex_digits: &[u8; 16] = b"0123456789abcdef";
([b'\\', b'x', hex_digits[(c >> 4) as usize], hex_digits[(c & 0xf) as usize]], 4)
}
};

return EscapeDefault { range: 0..len, data };
let mut data = [0; 4];
let range = escape::escape_ascii_into(&mut data, c);
EscapeDefault(escape::EscapeIterInner::new(data, range))
}

#[stable(feature = "rust1", since = "1.0.0")]
Expand All @@ -113,33 +98,59 @@ impl Iterator for EscapeDefault {

#[inline]
fn next(&mut self) -> Option<u8> {
self.range.next().map(|i| self.data[i as usize])
self.0.next()
}

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.range.size_hint()
let n = self.0.len();
(n, Some(n))
}

#[inline]
fn count(self) -> usize {
self.0.len()
}

#[inline]
fn last(mut self) -> Option<u8> {
self.next_back()
self.0.next_back()
}

#[inline]
fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> {
self.0.advance_by(n)
}
}

#[stable(feature = "rust1", since = "1.0.0")]
impl DoubleEndedIterator for EscapeDefault {
#[inline]
fn next_back(&mut self) -> Option<u8> {
self.range.next_back().map(|i| self.data[i as usize])
self.0.next_back()
}

#[inline]
fn advance_back_by(&mut self, n: usize) -> Result<(), NonZeroUsize> {
self.0.advance_back_by(n)
}
}

#[stable(feature = "rust1", since = "1.0.0")]
impl ExactSizeIterator for EscapeDefault {}
impl ExactSizeIterator for EscapeDefault {
#[inline]
fn len(&self) -> usize {
self.0.len()
}
}

#[stable(feature = "fused", since = "1.26.0")]
impl FusedIterator for EscapeDefault {}

#[stable(feature = "ascii_escape_display", since = "1.39.0")]
impl fmt::Display for EscapeDefault {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// SAFETY: ok because `escape_default` created only valid utf-8 data
f.write_str(unsafe {
from_utf8_unchecked(&self.data[(self.range.start as usize)..(self.range.end as usize)])
})
f.write_str(self.0.as_str())
}
}

Expand Down
57 changes: 21 additions & 36 deletions library/core/src/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -380,20 +380,7 @@ impl char {
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn escape_unicode(self) -> EscapeUnicode {
let c = self as u32;

// or-ing 1 ensures that for c==0 the code computes that one
// digit should be printed and (which is the same) avoids the
// (31 - 32) underflow
let msb = 31 - (c | 1).leading_zeros();

// the index of the most significant hex digit
let ms_hex_digit = msb / 4;
EscapeUnicode {
c: self,
state: EscapeUnicodeState::Backslash,
hex_digit_idx: ms_hex_digit as usize,
}
EscapeUnicode::new(self)
}

/// An extended version of `escape_debug` that optionally permits escaping
Expand All @@ -403,21 +390,20 @@ impl char {
/// characters, and double quotes in strings.
#[inline]
pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug {
let init_state = match self {
'\0' => EscapeDefaultState::Backslash('0'),
'\t' => EscapeDefaultState::Backslash('t'),
'\r' => EscapeDefaultState::Backslash('r'),
'\n' => EscapeDefaultState::Backslash('n'),
'\\' => EscapeDefaultState::Backslash(self),
'"' if args.escape_double_quote => EscapeDefaultState::Backslash(self),
'\'' if args.escape_single_quote => EscapeDefaultState::Backslash(self),
match self {
'\0' => EscapeDebug::backslash(b'0'),
'\t' => EscapeDebug::backslash(b't'),
'\r' => EscapeDebug::backslash(b'r'),
'\n' => EscapeDebug::backslash(b'n'),
'\\' => EscapeDebug::backslash(b'\\'),
'"' if args.escape_double_quote => EscapeDebug::backslash(b'"'),
'\'' if args.escape_single_quote => EscapeDebug::backslash(b'\''),
_ if args.escape_grapheme_extended && self.is_grapheme_extended() => {
EscapeDefaultState::Unicode(self.escape_unicode())
EscapeDebug::from_unicode(self.escape_unicode())
}
_ if is_printable(self) => EscapeDefaultState::Char(self),
_ => EscapeDefaultState::Unicode(self.escape_unicode()),
};
EscapeDebug(EscapeDefault { state: init_state })
_ if is_printable(self) => EscapeDebug::printable(self),
_ => EscapeDebug::from_unicode(self.escape_unicode()),
}
}

/// Returns an iterator that yields the literal escape code of a character
Expand Down Expand Up @@ -515,15 +501,14 @@ impl char {
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn escape_default(self) -> EscapeDefault {
let init_state = match self {
'\t' => EscapeDefaultState::Backslash('t'),
'\r' => EscapeDefaultState::Backslash('r'),
'\n' => EscapeDefaultState::Backslash('n'),
'\\' | '\'' | '"' => EscapeDefaultState::Backslash(self),
'\x20'..='\x7e' => EscapeDefaultState::Char(self),
_ => EscapeDefaultState::Unicode(self.escape_unicode()),
};
EscapeDefault { state: init_state }
match self {
'\t' => EscapeDefault::backslash(b't'),
'\r' => EscapeDefault::backslash(b'r'),
'\n' => EscapeDefault::backslash(b'n'),
'\\' | '\'' | '"' => EscapeDefault::backslash(self as u8),
'\x20'..='\x7e' => EscapeDefault::printable(self as u8),
_ => EscapeDefault::from_unicode(self.escape_unicode()),
}
}

/// Returns the number of bytes this `char` would need if encoded in UTF-8.
Expand Down
Loading

0 comments on commit f916c44

Please sign in to comment.