From c144da31e545f9764eb4f4430984b3767bcf41c1 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 24 Jan 2024 16:36:22 +1100 Subject: [PATCH] Implement RFC 3349, mixed utf8 literals. Specifically: - Allow unicode chars in b"" and br"" literals. This is done by changing `Mode::allow_unicode_chars` to succeed on `ByteStr` and `RawByteStr`. - Allow unicode escapes in b"" literals. This is done by changing `Mode::allow_unicode_escapes` to succeed on `ByteStr`. Byte string literals can already have high bytes (`\x80`..`\xff`). Because they now also support unicode chars, they can now be mixed utf8, so we use `unescape_mixed`/`cook_mixed` instead of `unescape_unicode`/`cook_unicode` to process them. A new type, `Rfc3349`, is used to implement the feature gating. Values of that type are threaded through the unescaping code to track whether rules from rfc3349 are required for unescaping to succeed. Test changes: - tests/ui/mixed-utf8-literals/basic.rs: new `check-pass` UI test with various literals exercising the new forms. - tests/ui/attributes/key-value-non-ascii.rs: changed from a byte string literal to a byte literal; we just need some kind of problem with a literal to preserve the test's intent. - tests/ui/parser/raw/raw-byte-string-literals.rs: moved the raw byte string literal with a non-ASCII char to `basic.rs`. - tests/ui/parser/byte-string-literals.rs: similar. - tests/ui/parser/issues/issue-23620-invalid-escapes.rs: moved one case fully to `basic.rs`, and one partially. - tests/ui/parser/unicode-control-codepoints.rs: left the code unchanged, but the errors are now about mixed-utf8-literals being feature gated. - tests/ui/suggestions/multibyte-escapes.rs: moved one case to `basic.rs`. 
- compiler/rustc_lexer/src/unescape/tests.rs: various adjustments - two cases that previously failed now succeed - added some more cases for the newly supported syntax I wasn't sure how to handle rust-analyzer in general, so I just allowed mixed utf8 literals everywhere without complaint. --- compiler/rustc_ast/src/util/literal.rs | 16 +- compiler/rustc_ast_passes/src/feature_gate.rs | 1 + compiler/rustc_feature/src/unstable.rs | 2 + compiler/rustc_lexer/src/unescape.rs | 89 ++++++--- compiler/rustc_lexer/src/unescape/tests.rs | 61 ++++--- compiler/rustc_parse/src/lexer/mod.rs | 14 +- .../src/lexer/unescape_error_reporting.rs | 3 +- compiler/rustc_parse_format/src/lib.rs | 15 +- compiler/rustc_span/src/symbol.rs | 1 + .../language-features/mixed-utf8-literals.md | 16 ++ .../crates/parser/src/lexed_str.rs | 12 +- .../crates/syntax/src/ast/token_ext.rs | 169 ++++++++---------- .../crates/syntax/src/validation.rs | 12 +- src/tools/tidy/src/ui_tests.rs | 2 +- tests/ui/attributes/key-value-non-ascii.rs | 2 +- .../ui/attributes/key-value-non-ascii.stderr | 14 +- .../feature-gate-mixed-utf8-literals.rs | 5 + .../feature-gate-mixed-utf8-literals.stderr | 33 ++++ tests/ui/mixed-utf8-literals/basic.rs | 19 ++ tests/ui/parser/byte-string-literals.rs | 2 - tests/ui/parser/byte-string-literals.stderr | 21 +-- .../issues/issue-23620-invalid-escapes.rs | 10 +- .../issues/issue-23620-invalid-escapes.stderr | 46 ++--- .../ui/parser/raw/raw-byte-string-literals.rs | 1 - .../raw/raw-byte-string-literals.stderr | 10 +- tests/ui/parser/unicode-control-codepoints.rs | 13 +- .../parser/unicode-control-codepoints.stderr | 105 +++-------- tests/ui/suggestions/multibyte-escapes.rs | 5 - tests/ui/suggestions/multibyte-escapes.stderr | 13 +- 29 files changed, 364 insertions(+), 348 deletions(-) create mode 100644 src/doc/unstable-book/src/language-features/mixed-utf8-literals.md create mode 100644 tests/ui/feature-gates/feature-gate-mixed-utf8-literals.rs create mode 100644 
tests/ui/feature-gates/feature-gate-mixed-utf8-literals.stderr create mode 100644 tests/ui/mixed-utf8-literals/basic.rs diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index aaeb1bb9bff82..6ea42da45438c 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -3,7 +3,7 @@ use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; use crate::token::{self, Token}; use rustc_lexer::unescape::{ - byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode, + unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode, }; use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::Span; @@ -49,7 +49,8 @@ impl LitKind { // For byte/char/string literals, chars and escapes have already been // checked in the lexer (in `cook_lexer_literal`). So we can assume all - // chars and escapes are valid here. + // chars and escapes are valid here, and ignore `Rfc3349` return + // values. Ok(match kind { token::Bool => { assert!(symbol.is_bool_lit()); @@ -84,7 +85,7 @@ impl LitKind { // Force-inlining here is aggressive but the closure is // called on every char in the string, so it can be hot in // programs with many long strings containing escapes. 
- unescape_unicode( + _ = unescape_unicode( s, Mode::Str, &mut #[inline(always)] @@ -108,8 +109,11 @@ impl LitKind { token::ByteStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c { - Ok(c) => buf.push(byte_from_char(c)), + _ = unescape_mixed(s, Mode::ByteStr, &mut |_, c| match c { + Ok(MixedUnit::Char(c)) => { + buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) + } + Ok(MixedUnit::HighByte(b)) => buf.push(b), Err(err) => { assert!(!err.is_fatal(), "failed to unescape string literal") } @@ -125,7 +129,7 @@ impl LitKind { token::CStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_mixed(s, Mode::CStr, &mut |_span, c| match c { + _ = unescape_mixed(s, Mode::CStr, &mut |_span, c| match c { Ok(MixedUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } diff --git a/compiler/rustc_ast_passes/src/feature_gate.rs b/compiler/rustc_ast_passes/src/feature_gate.rs index 82236d2e30678..5723b989d1b28 100644 --- a/compiler/rustc_ast_passes/src/feature_gate.rs +++ b/compiler/rustc_ast_passes/src/feature_gate.rs @@ -508,6 +508,7 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) { } }; } + gate_all!(mixed_utf8_literals, r#"mixed utf8 b"" and br"" literals are experimental"#); gate_all!( if_let_guard, "`if let` guards are experimental", diff --git a/compiler/rustc_feature/src/unstable.rs b/compiler/rustc_feature/src/unstable.rs index e66a66e23dc4f..bbb3c9af56a02 100644 --- a/compiler/rustc_feature/src/unstable.rs +++ b/compiler/rustc_feature/src/unstable.rs @@ -523,6 +523,8 @@ declare_features! ( /// standard library until the soundness issues with specialization /// are fixed. (unstable, min_specialization, "1.7.0", Some(31844)), + /// Allows mixed utf8 b"" and br"" literals. 
+ (unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)), /// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns. (unstable, more_qualified_paths, "1.54.0", Some(86935)), /// Allows the `#[must_not_suspend]` attribute. diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 03d178eb266a4..8464915df330a 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -85,7 +85,7 @@ impl EscapeError { /// /// Values are returned by invoking `callback`. For `Char` and `Byte` modes, /// the callback will be called exactly once. -pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) +pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) -> Rfc3349 where F: FnMut(Range, Result), { @@ -94,8 +94,9 @@ where let mut chars = src.chars(); let res = unescape_char_or_byte(&mut chars, mode); callback(0..(src.len() - chars.as_str().len()), res); + Rfc3349::Unused // rfc3349 not relevant for `Mode::{Char,Byte}` } - Str | ByteStr => unescape_non_raw_common(src, mode, callback), + Str => unescape_non_raw_common(src, mode, callback), RawStr | RawByteStr => check_raw_common(src, mode, callback), RawCStr => check_raw_common(src, mode, &mut |r, mut result| { if let Ok('\0') = result { @@ -103,7 +104,7 @@ where } callback(r, result) }), - CStr => unreachable!(), + ByteStr | CStr => unreachable!(), } } @@ -142,18 +143,19 @@ impl From for MixedUnit { /// a sequence of escaped characters or errors. /// /// Values are returned by invoking `callback`. 
-pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) +pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) -> Rfc3349 where F: FnMut(Range, Result), { match mode { + ByteStr => unescape_non_raw_common(src, mode, callback), CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| { if let Ok(MixedUnit::Char('\0')) = result { result = Err(EscapeError::NulInCStr); } callback(r, result) }), - Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), + Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable!(), } } @@ -169,6 +171,15 @@ pub fn unescape_byte(src: &str) -> Result { unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char) } +/// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the +/// literal to be valid. Once rfc3349 is stabilized this type can be removed. +#[derive(Debug, PartialEq)] +#[must_use] +pub enum Rfc3349 { + Used, + Unused, +} + /// What kind of literal do we parse. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Mode { @@ -205,17 +216,25 @@ impl Mode { /// Are unicode (non-ASCII) chars allowed? #[inline] - fn allow_unicode_chars(self) -> bool { + fn allow_unicode_chars(self, rfc3349: &mut Rfc3349) -> bool { match self { - Byte | ByteStr | RawByteStr => false, + Byte => false, + ByteStr | RawByteStr => { + *rfc3349 = Rfc3349::Used; + true + } Char | Str | RawStr | CStr | RawCStr => true, } } /// Are unicode escapes (`\u`) allowed? - fn allow_unicode_escapes(self) -> bool { + fn allow_unicode_escapes(self, rfc3349: &mut Rfc3349) -> bool { match self { - Byte | ByteStr => false, + Byte => false, + ByteStr => { + *rfc3349 = Rfc3349::Used; + true + } Char | Str | CStr => true, RawByteStr | RawStr | RawCStr => unreachable!(), } @@ -233,6 +252,7 @@ impl Mode { fn scan_escape + From>( chars: &mut Chars<'_>, mode: Mode, + rfc3349: &mut Rfc3349, ) -> Result { // Previous character was '\\', unescape what follows. 
let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { @@ -262,13 +282,17 @@ fn scan_escape + From>( Ok(T::from(value as u8)) }; } - 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), + 'u' => return scan_unicode(chars, mode, rfc3349).map(T::from), _ => return Err(EscapeError::InvalidEscape), }; Ok(T::from(res)) } -fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { +fn scan_unicode( + chars: &mut Chars<'_>, + mode: Mode, + rfc3349: &mut Rfc3349, +) -> Result { // We've parsed '\u', now we have to parse '{..}'. if chars.next() != Some('{') { @@ -296,7 +320,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result, allow_unicode_escapes: bool) -> Result Result { - if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) } +fn ascii_check(c: char, mode: Mode, rfc3349: &mut Rfc3349) -> Result { + // We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily. 
+ if c.is_ascii() || mode.allow_unicode_chars(rfc3349) { + Ok(c) + } else { + Err(EscapeError::NonAsciiCharInByte) + } } fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { let c = chars.next().ok_or(EscapeError::ZeroChars)?; + let mut rfc3349 = Rfc3349::Unused; let res = match c { - '\\' => scan_escape(chars, mode), + '\\' => scan_escape(chars, mode, &mut rfc3349), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode.allow_unicode_chars()), + _ => ascii_check(c, mode, &mut rfc3349), }?; + + assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::{Char,Byte}` + if chars.next().is_some() { return Err(EscapeError::MoreThanOneChar); } @@ -342,12 +375,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result + From>(src: &str, mode: Mode, callback: &mut F) +fn unescape_non_raw_common + From>( + src: &str, + mode: Mode, + callback: &mut F, +) -> Rfc3349 where F: FnMut(Range, Result), { let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop + let mut rfc3349 = Rfc3349::Unused; // The `start` and `end` computation here is complicated because // `skip_ascii_whitespace` makes us to skip over chars without counting @@ -367,16 +404,17 @@ where }); continue; } - _ => scan_escape::(&mut chars, mode), + _ => scan_escape::(&mut chars, mode, &mut rfc3349), } } '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, allow_unicode_chars).map(T::from), + _ => ascii_check(c, mode, &mut rfc3349).map(T::from), }; let end = src.len() - chars.as_str().len(); callback(start..end, res); } + rfc3349 } fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) @@ -409,12 +447,12 @@ where /// sequence of characters or errors. 
/// NOTE: Raw strings do not perform any explicit character escaping, here we /// only produce errors on bare CR. -fn check_raw_common(src: &str, mode: Mode, callback: &mut F) +fn check_raw_common(src: &str, mode: Mode, callback: &mut F) -> Rfc3349 where F: FnMut(Range, Result), { let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop + let mut rfc3349 = Rfc3349::Unused; // The `start` and `end` computation here matches the one in // `unescape_non_raw_common` for consistency, even though this function @@ -423,16 +461,17 @@ where let start = src.len() - chars.as_str().len() - c.len_utf8(); let res = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), - _ => ascii_check(c, allow_unicode_chars), + _ => ascii_check(c, mode, &mut rfc3349), }; let end = src.len() - chars.as_str().len(); callback(start..end, res); } + rfc3349 } #[inline] -pub fn byte_from_char(c: char) -> u8 { +pub(crate) fn byte_from_char(c: char) -> u8 { let res = c as u32; - debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); + debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte"); res as u8 } diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index 5b99495f47581..d1560fa5dc87f 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -100,7 +100,9 @@ fn test_unescape_char_good() { fn test_unescape_str_warn() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); + let rfc3349 = + unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); + assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str` assert_eq!(unescaped, expected); } @@ -124,7 +126,7 @@ fn test_unescape_str_warn() { fn test_unescape_str_good() { 
fn check(literal_text: &str, expected: &str) { let mut buf = Ok(String::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::Str, &mut |range, c| { + let rfc3349 = unescape_unicode(literal_text, Mode::Str, &mut |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c), @@ -132,6 +134,7 @@ fn test_unescape_str_good() { } } }); + assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str` assert_eq!(buf.as_deref(), Ok(expected)) } @@ -239,32 +242,43 @@ fn test_unescape_byte_good() { #[test] fn test_unescape_byte_str_good() { - fn check(literal_text: &str, expected: &[u8]) { - let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { - if let Ok(b) = &mut buf { + fn check(literal_text: &str, expected: &[u8], rfc3349_expected: Rfc3349) { + let mut buf_res = Ok(Vec::with_capacity(literal_text.len())); + let rfc3349_actual = unescape_mixed(literal_text, Mode::ByteStr, &mut |range, c| { + if let Ok(buf) = &mut buf_res { match c { - Ok(c) => b.push(byte_from_char(c)), - Err(e) => buf = Err((range, e)), + Ok(MixedUnit::Char(c)) => { + buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) + } + Ok(MixedUnit::HighByte(b)) => buf.push(b), + Err(e) => buf_res = Err((range, e)), } } }); - assert_eq!(buf.as_deref(), Ok(expected)) + assert_eq!(rfc3349_actual, rfc3349_expected); + assert_eq!(buf_res.as_deref(), Ok(expected)) } - check("foo", b"foo"); - check("", b""); - check(" \t\n", b" \t\n"); + check("foo", b"foo", Rfc3349::Unused); + check("", b"", Rfc3349::Unused); + check(" \t\n", b" \t\n", Rfc3349::Unused); + + check("hello \\\n world", b"hello world", Rfc3349::Unused); + check("thread's", b"thread's", Rfc3349::Unused); - check("hello \\\n world", b"hello world"); - check("thread's", b"thread's") + let a_pound_up_smiley = &[0x61, 0xc2, 0xa3, 0xe2, 0x86, 0x91, 0xf0, 0x9f, 0x98, 0x80]; + check("a£↑😀", a_pound_up_smiley, Rfc3349::Used); + 
check(r"\u{61}\u{a3}\u{2191}\u{1f600}", a_pound_up_smiley, Rfc3349::Used); + check(r"\x00\x7f\x80\xffa¥", &[0, 0x7f, 0x80, 0xff, 0x61, 0xc2, 0xa5], Rfc3349::Used); } #[test] fn test_unescape_raw_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); + let rfc3349 = + unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); + assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::RawStr` assert_eq!(unescaped, expected); } @@ -274,13 +288,20 @@ fn test_unescape_raw_str() { #[test] fn test_unescape_raw_byte_str() { - fn check(literal: &str, expected: &[(Range, Result)]) { + fn check( + literal: &str, + expected: &[(Range, Result)], + rfc3349_expected: Rfc3349, + ) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); + let rfc3349_actual = unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| { + unescaped.push((range, res)) + }); + assert_eq!(rfc3349_actual, rfc3349_expected); assert_eq!(unescaped, expected); } - check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); - check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]); - check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]); + check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))], Rfc3349::Unused); + check("🦀", &[(0..4, Ok('🦀'))], Rfc3349::Used); + check("¥a", &[(0..2, Ok('¥')), (2..3, Ok('a'))], Rfc3349::Used); } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 3155245267600..66325e3be6bb3 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -8,9 +8,8 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; use 
rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{codes::*, Applicability, DiagCtxt, DiagnosticBuilder, StashKey}; -use rustc_lexer::unescape::{self, EscapeError, Mode}; -use rustc_lexer::{Base, DocStyle, RawStrError}; -use rustc_lexer::{Cursor, LiteralKind}; +use rustc_lexer::unescape::{self, EscapeError, Mode, Rfc3349}; +use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError}; use rustc_session::lint::builtin::{ RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT, }; @@ -436,7 +435,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { .with_code(E0766) .emit() } - self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " + self.cook_mixed(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " } rustc_lexer::LiteralKind::CStr { terminated } => { if !terminated { @@ -697,13 +696,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> { end: BytePos, prefix_len: u32, postfix_len: u32, - unescape: fn(&str, Mode, &mut dyn FnMut(Range, Result<(), EscapeError>)), + unescape: fn(&str, Mode, &mut dyn FnMut(Range, Result<(), EscapeError>)) -> Rfc3349, ) -> (token::LitKind, Symbol) { let mut has_fatal_err = false; let content_start = start + BytePos(prefix_len); let content_end = end - BytePos(postfix_len); let lit_content = self.str_from_to(content_start, content_end); - unescape(lit_content, mode, &mut |range, result| { + let rfc3349 = unescape(lit_content, mode, &mut |range, result| { // Here we only check for errors. The actual unescaping is done later. if let Err(err) = result { let span_with_quotes = self.mk_sp(start, end); @@ -725,6 +724,9 @@ impl<'sess, 'src> StringReader<'sess, 'src> { ); } }); + if rfc3349 == Rfc3349::Used { + self.sess.gated_spans.gate(sym::mixed_utf8_literals, self.mk_sp(start, end)); + } // We normally exclude the quotes for the symbol, but for errors we // include it because it results in clearer error messages. 
diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index 3238f8e23bb0a..3867500678b1c 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -175,6 +175,7 @@ pub(crate) fn emit_unescape_error( EscapeError::NonAsciiCharInByte => { let (c, span) = last_char(); let desc = match mode { + // Note: once rfc3349 stabilizes, only `Mode::Byte` will be reachable here. Mode::Byte => "byte literal", Mode::ByteStr => "byte string literal", Mode::RawByteStr => "raw byte string literal", @@ -188,7 +189,7 @@ pub(crate) fn emit_unescape_error( }; err.span_label(span, format!("must be ASCII{postfix}")); // Note: the \\xHH suggestions are not given for raw byte string - // literals, because they are araw and so cannot use any escapes. + // literals, because they cannot use escapes. if (c as u32) <= 0xFF && mode != Mode::RawByteStr { err.span_suggestion( span, diff --git a/compiler/rustc_parse_format/src/lib.rs b/compiler/rustc_parse_format/src/lib.rs index d76ee161da6fd..2b9c9a32e8831 100644 --- a/compiler/rustc_parse_format/src/lib.rs +++ b/compiler/rustc_parse_format/src/lib.rs @@ -1056,13 +1056,14 @@ fn find_width_map_from_snippet( fn unescape_string(string: &str) -> Option { let mut buf = string::String::new(); let mut ok = true; - unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| { - match unescaped_char { - Ok(c) => buf.push(c), - Err(_) => ok = false, - } - }); - + let rfc3349 = + unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(_) => ok = false, + } + }); + assert_eq!(rfc3349, unescape::Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str` ok.then_some(buf) } diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index a54b928b908a7..fb4b656151220 100644 --- 
a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1058,6 +1058,7 @@ symbols! { mir_unwind_unreachable, mir_variant, miri, + mixed_utf8_literals, mmx_reg, modifiers, module, diff --git a/src/doc/unstable-book/src/language-features/mixed-utf8-literals.md b/src/doc/unstable-book/src/language-features/mixed-utf8-literals.md new file mode 100644 index 0000000000000..2ba31b75fd226 --- /dev/null +++ b/src/doc/unstable-book/src/language-features/mixed-utf8-literals.md @@ -0,0 +1,16 @@ +# `mixed_utf8_literals` + +The tracking issue for this feature is: [#116907] + +[#116907]: https://github.com/rust-lang/rust/issues/116907 + +------------------------ + +This feature extends the syntax of string literals in the following ways. +- Byte string literals can contain unicode chars (e.g. `b"🦀"`) and unicode + escapes (e.g. `b"\u{1f980}"`). +- Raw byte string literals can contain unicode chars (e.g. `br"🦀"`). + +This makes it easier to work with strings that are mostly UTF-8 encoded but +also contain some non-UTF-8 bytes, which are sometimes called "conventionally +UTF-8" strings. diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index bf1feb9a7eb07..2d9ea8ff38360 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -365,9 +365,11 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { EscapeError::NonAsciiCharInByte if mode == Mode::Byte => { "non-ASCII character in byte literal" } + // Note: once rfc3349 stabilizes, this arm will be unreachable. EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => { "non-ASCII character in byte string literal" } + // Note: once rfc3349 stabilizes, this arm will be unreachable. 
EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal", EscapeError::NulInCStr => "null character in C string literal", EscapeError::UnskippedWhitespaceWarning => "", @@ -378,15 +380,17 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str { let mut error_message = ""; match mode { - Mode::CStr => { - rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| { + Mode::ByteStr | Mode::CStr => { + // Can ignore the `Rfc3349` return value. + _ = rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| { if let Err(e) = res { error_message = error_to_diagnostic_message(e, mode); } }); } - Mode::ByteStr | Mode::Str => { - rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| { + Mode::Str => { + // Can ignore the `Rfc3349` return value. + _ = rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| { if let Err(e) = res { error_message = error_to_diagnostic_message(e, mode); } diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs index 7cd1f1550b988..c1e5c9d51ef8d 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs @@ -193,7 +193,8 @@ pub trait IsString: AstToken { let text = &self.text()[text_range_no_quotes - start]; let offset = text_range_no_quotes.start() - start; - unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| { + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. 
+ _ = unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| { let text_range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); cb(text_range + offset, unescaped_char); @@ -226,7 +227,8 @@ impl ast::String { let mut buf = String::new(); let mut prev_end = 0; let mut has_error = false; - unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. + _ = unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( unescaped_char, buf.capacity() == 0, ) { @@ -253,44 +255,18 @@ impl ast::String { impl IsString for ast::ByteString { const RAW_PREFIX: &'static str = "br"; const MODE: Mode = Mode::ByteStr; + + fn escaped_char_ranges( + &self, + cb: &mut dyn FnMut(TextRange, Result), + ) { + escaped_char_ranges_impl(self, cb); + } } impl ast::ByteString { pub fn value(&self) -> Option> { - if self.is_raw() { - let text = self.text(); - let text = - &text[self.text_range_between_quotes()? - self.syntax().text_range().start()]; - return Some(Cow::Borrowed(text.as_bytes())); - } - - let text = self.text(); - let text = &text[self.text_range_between_quotes()? 
- self.syntax().text_range().start()]; - - let mut buf: Vec = Vec::new(); - let mut prev_end = 0; - let mut has_error = false; - unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( - unescaped_char, - buf.capacity() == 0, - ) { - (Ok(c), false) => buf.push(c as u8), - (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { - prev_end = char_range.end - } - (Ok(c), true) => { - buf.reserve_exact(text.len()); - buf.extend_from_slice(text[..prev_end].as_bytes()); - buf.push(c as u8); - } - (Err(_), _) => has_error = true, - }); - - match (has_error, buf.capacity() == 0) { - (true, _) => None, - (false, true) => Some(Cow::Borrowed(text.as_bytes())), - (false, false) => Some(Cow::Owned(buf)), - } + value_impl(self) } } @@ -302,65 +278,13 @@ impl IsString for ast::CString { &self, cb: &mut dyn FnMut(TextRange, Result), ) { - let text_range_no_quotes = match self.text_range_between_quotes() { - Some(it) => it, - None => return, - }; - - let start = self.syntax().text_range().start(); - let text = &self.text()[text_range_no_quotes - start]; - let offset = text_range_no_quotes.start() - start; - - unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| { - let text_range = - TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); - // XXX: This method should only be used for highlighting ranges. The unescaped - // char/byte is not used. For simplicity, we return an arbitrary placeholder char. - cb(text_range + offset, unescaped_char.map(|_| ' ')); - }); + escaped_char_ranges_impl(self, cb); } } impl ast::CString { pub fn value(&self) -> Option> { - if self.is_raw() { - let text = self.text(); - let text = - &text[self.text_range_between_quotes()? - self.syntax().text_range().start()]; - return Some(Cow::Borrowed(text.as_bytes())); - } - - let text = self.text(); - let text = &text[self.text_range_between_quotes()? 
- self.syntax().text_range().start()]; - - let mut buf = Vec::new(); - let mut prev_end = 0; - let mut has_error = false; - let extend_unit = |buf: &mut Vec, unit: MixedUnit| match unit { - MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), - MixedUnit::HighByte(b) => buf.push(b), - }; - unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match ( - unescaped, - buf.capacity() == 0, - ) { - (Ok(u), false) => extend_unit(&mut buf, u), - (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { - prev_end = char_range.end - } - (Ok(u), true) => { - buf.reserve_exact(text.len()); - buf.extend(text[..prev_end].as_bytes()); - extend_unit(&mut buf, u); - } - (Err(_), _) => has_error = true, - }); - - match (has_error, buf.capacity() == 0) { - (true, _) => None, - (false, true) => Some(Cow::Borrowed(text.as_bytes())), - (false, false) => Some(Cow::Owned(buf)), - } + value_impl(self) } } @@ -457,6 +381,71 @@ impl ast::FloatNumber { } } +fn escaped_char_ranges_impl( + this: &I, + cb: &mut dyn FnMut(TextRange, Result), +) { + let text_range_no_quotes = match this.text_range_between_quotes() { + Some(it) => it, + None => return, + }; + + let start = this.syntax().text_range().start(); + let text = &this.text()[text_range_no_quotes - start]; + let offset = text_range_no_quotes.start() - start; + + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. + _ = unescape_mixed(text, I::MODE, &mut |range, unescaped_char| { + let text_range = + TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); + // XXX: This method should only be used for highlighting ranges. The unescaped + // char/byte is not used. For simplicity, we return an arbitrary placeholder char. + cb(text_range + offset, unescaped_char.map(|_| ' ')); + }); +} + +fn value_impl(this: &I) -> Option> { + if this.is_raw() { + let text = this.text(); + let text = + &text[this.text_range_between_quotes()? 
- this.syntax().text_range().start()]; + return Some(Cow::Borrowed(text.as_bytes())); + } + + let text = this.text(); + let text = &text[this.text_range_between_quotes()? - this.syntax().text_range().start()]; + + let mut buf: Vec = Vec::new(); + let mut prev_end = 0; + let mut has_error = false; + let extend_unit = |buf: &mut Vec, unit: MixedUnit| match unit { + MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), + MixedUnit::HighByte(b) => buf.push(b), + }; + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. + _ = unescape_mixed(text, I::MODE, &mut |char_range, unescaped_char| match ( + unescaped_char, + buf.capacity() == 0, + ) { + (Ok(u), false) => extend_unit(&mut buf, u), + (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { + prev_end = char_range.end + } + (Ok(u), true) => { + buf.reserve_exact(text.len()); + buf.extend(text[..prev_end].as_bytes()); + extend_unit(&mut buf, u); + } + (Err(_), _) => has_error = true, + }); + + match (has_error, buf.capacity() == 0) { + (true, _) => None, + (false, true) => Some(Cow::Borrowed(text.as_bytes())), + (false, false) => Some(Cow::Owned(buf)), + } +} + #[derive(Debug, PartialEq, Eq, Copy, Clone)] pub enum Radix { Binary = 2, diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index 5c5b26f525f66..fae7b37e9b2c4 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -136,11 +136,13 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } }; + // Ignores the `Rfc3349` return value from the `unescape_*` functions, thus + // permitting mixed utf8 literals. 
match literal.kind() { ast::LiteralKind::String(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 1, '"') { - unescape_unicode(without_quotes, Mode::Str, &mut |range, char| { + _ = unescape_unicode(without_quotes, Mode::Str, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -151,7 +153,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::ByteString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| { + _ = unescape_mixed(without_quotes, Mode::ByteStr, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -162,7 +164,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::CString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| { + _ = unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -172,7 +174,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Char(_) => { if let Some(without_quotes) = unquote(text, 1, '\'') { - unescape_unicode(without_quotes, Mode::Char, &mut |range, char| { + _ = unescape_unicode(without_quotes, Mode::Char, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -181,7 +183,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Byte(_) => { if let Some(without_quotes) = unquote(text, 2, '\'') { - unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| { + _ = unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| { if let Err(err) = char { push_err(2, range.start, err); } diff --git a/src/tools/tidy/src/ui_tests.rs b/src/tools/tidy/src/ui_tests.rs index 85553d2e3384a..5ce6c6b74c455 100644 --- a/src/tools/tidy/src/ui_tests.rs +++ 
b/src/tools/tidy/src/ui_tests.rs @@ -11,7 +11,7 @@ use std::path::{Path, PathBuf}; const ENTRY_LIMIT: usize = 900; // FIXME: The following limits should be reduced eventually. const ISSUES_ENTRY_LIMIT: usize = 1849; -const ROOT_ENTRY_LIMIT: usize = 870; +const ROOT_ENTRY_LIMIT: usize = 871; const EXPECTED_TEST_FILE_EXTENSIONS: &[&str] = &[ "rs", // test source files diff --git a/tests/ui/attributes/key-value-non-ascii.rs b/tests/ui/attributes/key-value-non-ascii.rs index e14e2fc05ad39..8e0bb6bc50ca6 100644 --- a/tests/ui/attributes/key-value-non-ascii.rs +++ b/tests/ui/attributes/key-value-non-ascii.rs @@ -1,4 +1,4 @@ #![feature(rustc_attrs)] -#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte string literal +#[rustc_dummy = b'ffi'] //~ ERROR non-ASCII character in byte literal fn main() {} diff --git a/tests/ui/attributes/key-value-non-ascii.stderr b/tests/ui/attributes/key-value-non-ascii.stderr index cc01bc46ebd29..e9b6947bf4cba 100644 --- a/tests/ui/attributes/key-value-non-ascii.stderr +++ b/tests/ui/attributes/key-value-non-ascii.stderr @@ -1,13 +1,11 @@ -error: non-ASCII character in byte string literal +error: non-ASCII character in byte literal --> $DIR/key-value-non-ascii.rs:3:19 | -LL | #[rustc_dummy = b"ffi.rs"] - | ^ must be ASCII - | -help: if you meant to use the UTF-8 encoding of 'ffi', use \xHH escapes - | -LL | #[rustc_dummy = b"/xEF/xAC/x83.rs"] - | ~~~~~~~~~~~~ +LL | #[rustc_dummy = b'ffi'] + | ^ + | | + | must be ASCII + | this multibyte character does not fit into a single byte error: aborting due to 1 previous error diff --git a/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.rs b/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.rs new file mode 100644 index 0000000000000..d037ea1a51876 --- /dev/null +++ b/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.rs @@ -0,0 +1,5 @@ +fn main() { + _ = b"a¥🦀"; //~ ERROR mixed utf8 + _ = br"a¥🦀"; //~ ERROR mixed utf8 + _ = b"a\u{a5}\u{1f980}"; //~ ERROR mixed utf8 +} 
diff --git a/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.stderr b/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.stderr new file mode 100644 index 0000000000000..bdff26269b720 --- /dev/null +++ b/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.stderr @@ -0,0 +1,33 @@ +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/feature-gate-mixed-utf8-literals.rs:2:9 + | +LL | _ = b"a¥🦀"; + | ^^^^^^^ + | + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date + +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/feature-gate-mixed-utf8-literals.rs:3:9 + | +LL | _ = br"a¥🦀"; + | ^^^^^^^^ + | + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date + +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/feature-gate-mixed-utf8-literals.rs:4:9 + | +LL | _ = b"a\u{a5}\u{1f980}"; + | ^^^^^^^^^^^^^^^^^^^ + | + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date + +error: aborting due to 3 previous errors + +For more information about this error, try `rustc --explain E0658`. 
diff --git a/tests/ui/mixed-utf8-literals/basic.rs b/tests/ui/mixed-utf8-literals/basic.rs new file mode 100644 index 0000000000000..a62c9316ee81d --- /dev/null +++ b/tests/ui/mixed-utf8-literals/basic.rs @@ -0,0 +1,19 @@ +// check-pass + +#![feature(mixed_utf8_literals)] + +fn main() { + b"a¥🦀"; + b"é"; + b"字"; + + br"a¥🦀"; + br"é"; + br##"é"##; + + b"\u{a66e}"; + b"a\u{a5}\u{1f980}"; + b"\u{a4a4}"; + + b"hello\xff我叫\u{1F980}"; +} diff --git a/tests/ui/parser/byte-string-literals.rs b/tests/ui/parser/byte-string-literals.rs index 30a4f50c4e40b..dae941d342f70 100644 --- a/tests/ui/parser/byte-string-literals.rs +++ b/tests/ui/parser/byte-string-literals.rs @@ -3,7 +3,5 @@ static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape pub fn main() { b"\f"; //~ ERROR unknown byte escape b"\x0Z"; //~ ERROR invalid character in numeric character escape: `Z` - b"é"; //~ ERROR non-ASCII character in byte string literal - br##"é"##; //~ ERROR non-ASCII character in raw byte string literal b"a //~ ERROR unterminated double quote byte string } diff --git a/tests/ui/parser/byte-string-literals.stderr b/tests/ui/parser/byte-string-literals.stderr index 655b6998e85ff..2186b4c2e494c 100644 --- a/tests/ui/parser/byte-string-literals.stderr +++ b/tests/ui/parser/byte-string-literals.stderr @@ -20,31 +20,14 @@ error: invalid character in numeric character escape: `Z` LL | b"\x0Z"; | ^ invalid character in numeric character escape -error: non-ASCII character in byte string literal - --> $DIR/byte-string-literals.rs:6:7 - | -LL | b"é"; - | ^ must be ASCII - | -help: if you meant to use the unicode code point for 'é', use a \xHH escape - | -LL | b"\xE9"; - | ~~~~ - -error: non-ASCII character in raw byte string literal - --> $DIR/byte-string-literals.rs:7:10 - | -LL | br##"é"##; - | ^ must be ASCII - error[E0766]: unterminated double quote byte string - --> $DIR/byte-string-literals.rs:8:6 + --> $DIR/byte-string-literals.rs:6:6 | LL | b"a | ______^ LL | | } | |__^ -error: aborting 
due to 6 previous errors +error: aborting due to 4 previous errors For more information about this error, try `rustc --explain E0766`. diff --git a/tests/ui/parser/issues/issue-23620-invalid-escapes.rs b/tests/ui/parser/issues/issue-23620-invalid-escapes.rs index c1355f0d6fe0c..14df9ad0766aa 100644 --- a/tests/ui/parser/issues/issue-23620-invalid-escapes.rs +++ b/tests/ui/parser/issues/issue-23620-invalid-escapes.rs @@ -1,7 +1,4 @@ fn main() { - let _ = b"\u{a66e}"; - //~^ ERROR unicode escape in byte string - let _ = b'\u{a66e}'; //~^ ERROR unicode escape in byte string @@ -20,10 +17,9 @@ fn main() { let _ = '\xxy'; //~^ ERROR invalid character in numeric character escape: `x` - let _ = b"\u{a4a4} \xf \u"; - //~^ ERROR unicode escape in byte string - //~^^ ERROR invalid character in numeric character escape: ` ` - //~^^^ ERROR incorrect unicode escape sequence + let _ = b"\xf \u"; + //~^ ERROR invalid character in numeric character escape: ` ` + //~^^ ERROR incorrect unicode escape sequence let _ = "\xf \u"; //~^ ERROR invalid character in numeric character escape: ` ` diff --git a/tests/ui/parser/issues/issue-23620-invalid-escapes.stderr b/tests/ui/parser/issues/issue-23620-invalid-escapes.stderr index 88d97c795fc2a..3731cdc102e6f 100644 --- a/tests/ui/parser/issues/issue-23620-invalid-escapes.stderr +++ b/tests/ui/parser/issues/issue-23620-invalid-escapes.stderr @@ -1,21 +1,13 @@ error: unicode escape in byte string --> $DIR/issue-23620-invalid-escapes.rs:2:15 | -LL | let _ = b"\u{a66e}"; - | ^^^^^^^^ unicode escape in byte string - | - = help: unicode escape sequences cannot be used as a byte or in a byte string - -error: unicode escape in byte string - --> $DIR/issue-23620-invalid-escapes.rs:5:15 - | LL | let _ = b'\u{a66e}'; | ^^^^^^^^ unicode escape in byte string | = help: unicode escape sequences cannot be used as a byte or in a byte string error: incorrect unicode escape sequence - --> $DIR/issue-23620-invalid-escapes.rs:8:15 + --> 
$DIR/issue-23620-invalid-escapes.rs:5:15 | LL | let _ = b'\u'; | ^^ incorrect unicode escape sequence @@ -23,59 +15,51 @@ LL | let _ = b'\u'; = help: format of unicode escape sequences is `\u{...}` error: numeric character escape is too short - --> $DIR/issue-23620-invalid-escapes.rs:11:15 + --> $DIR/issue-23620-invalid-escapes.rs:8:15 | LL | let _ = b'\x5'; | ^^^ error: invalid character in numeric character escape: `x` - --> $DIR/issue-23620-invalid-escapes.rs:14:17 + --> $DIR/issue-23620-invalid-escapes.rs:11:17 | LL | let _ = b'\xxy'; | ^ invalid character in numeric character escape error: numeric character escape is too short - --> $DIR/issue-23620-invalid-escapes.rs:17:14 + --> $DIR/issue-23620-invalid-escapes.rs:14:14 | LL | let _ = '\x5'; | ^^^ error: invalid character in numeric character escape: `x` - --> $DIR/issue-23620-invalid-escapes.rs:20:16 + --> $DIR/issue-23620-invalid-escapes.rs:17:16 | LL | let _ = '\xxy'; | ^ invalid character in numeric character escape -error: unicode escape in byte string - --> $DIR/issue-23620-invalid-escapes.rs:23:15 - | -LL | let _ = b"\u{a4a4} \xf \u"; - | ^^^^^^^^ unicode escape in byte string - | - = help: unicode escape sequences cannot be used as a byte or in a byte string - error: invalid character in numeric character escape: ` ` - --> $DIR/issue-23620-invalid-escapes.rs:23:27 + --> $DIR/issue-23620-invalid-escapes.rs:20:18 | -LL | let _ = b"\u{a4a4} \xf \u"; - | ^ invalid character in numeric character escape +LL | let _ = b"\xf \u"; + | ^ invalid character in numeric character escape error: incorrect unicode escape sequence - --> $DIR/issue-23620-invalid-escapes.rs:23:28 + --> $DIR/issue-23620-invalid-escapes.rs:20:19 | -LL | let _ = b"\u{a4a4} \xf \u"; - | ^^ incorrect unicode escape sequence +LL | let _ = b"\xf \u"; + | ^^ incorrect unicode escape sequence | = help: format of unicode escape sequences is `\u{...}` error: invalid character in numeric character escape: ` ` - --> 
$DIR/issue-23620-invalid-escapes.rs:28:17 + --> $DIR/issue-23620-invalid-escapes.rs:24:17 | LL | let _ = "\xf \u"; | ^ invalid character in numeric character escape error: incorrect unicode escape sequence - --> $DIR/issue-23620-invalid-escapes.rs:28:18 + --> $DIR/issue-23620-invalid-escapes.rs:24:18 | LL | let _ = "\xf \u"; | ^^ incorrect unicode escape sequence @@ -83,12 +67,12 @@ LL | let _ = "\xf \u"; = help: format of unicode escape sequences is `\u{...}` error: incorrect unicode escape sequence - --> $DIR/issue-23620-invalid-escapes.rs:32:14 + --> $DIR/issue-23620-invalid-escapes.rs:28:14 | LL | let _ = "\u8f"; | ^^^- | | | help: format of unicode escape sequences uses braces: `\u{8f}` -error: aborting due to 13 previous errors +error: aborting due to 11 previous errors diff --git a/tests/ui/parser/raw/raw-byte-string-literals.rs b/tests/ui/parser/raw/raw-byte-string-literals.rs index 1b859fee596ad..c485fca5523d6 100644 --- a/tests/ui/parser/raw/raw-byte-string-literals.rs +++ b/tests/ui/parser/raw/raw-byte-string-literals.rs @@ -2,6 +2,5 @@ pub fn main() { br"a "; //~ ERROR bare CR not allowed in raw string - br"é"; //~ ERROR non-ASCII character in raw byte string literal br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation } diff --git a/tests/ui/parser/raw/raw-byte-string-literals.stderr b/tests/ui/parser/raw/raw-byte-string-literals.stderr index a2f27d1ed70ae..134067b25e93d 100644 --- a/tests/ui/parser/raw/raw-byte-string-literals.stderr +++ b/tests/ui/parser/raw/raw-byte-string-literals.stderr @@ -4,17 +4,11 @@ error: bare CR not allowed in raw string LL | br"a "; | ^ -error: non-ASCII character in raw byte string literal - --> $DIR/raw-byte-string-literals.rs:5:8 - | -LL | br"é"; - | ^ must be ASCII - error: found invalid character; only `#` is allowed in raw string delimitation: ~ - --> $DIR/raw-byte-string-literals.rs:6:5 + --> $DIR/raw-byte-string-literals.rs:5:5 | LL | br##~"a"~##; | ^^^^^ -error: aborting due to 3 previous errors 
+error: aborting due to 2 previous errors diff --git a/tests/ui/parser/unicode-control-codepoints.rs b/tests/ui/parser/unicode-control-codepoints.rs index df099bb62ad1e..4a7e3aab08a7a 100644 --- a/tests/ui/parser/unicode-control-codepoints.rs +++ b/tests/ui/parser/unicode-control-codepoints.rs @@ -4,8 +4,7 @@ fn main() { println!("us\u{202B}e\u{202A}r"); println!("{:?}", r#"us\u{202B}e\u{202A}r"#); println!("{:?}", b"us\u{202B}e\u{202A}r"); - //~^ ERROR unicode escape in byte string - //~| ERROR unicode escape in byte string + //~^ ERROR mixed utf8 b"" and br"" literals are experimental println!("{:?}", br##"us\u{202B}e\u{202A}r"##); println!("{:?}", "/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "); @@ -14,15 +13,9 @@ fn main() { println!("{:?}", r##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); //~^ ERROR unicode codepoint changing visible direction of text present in literal println!("{:?}", b"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "); - //~^ ERROR non-ASCII character in byte string literal - //~| ERROR non-ASCII character in byte string literal - //~| ERROR non-ASCII character in byte string literal - //~| ERROR non-ASCII character in byte string literal + //~^ ERROR mixed utf8 b"" and br"" literals are experimental println!("{:?}", br##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); - //~^ ERROR non-ASCII character in raw byte string literal - //~| ERROR non-ASCII character in raw byte string literal - //~| ERROR non-ASCII character in raw byte string literal - //~| ERROR non-ASCII character in raw byte string literal + //~^ ERROR mixed utf8 b"" and br"" literals are experimental println!("{:?}", '‮'); //~^ ERROR unicode codepoint changing visible direction of text present in literal } diff --git a/tests/ui/parser/unicode-control-codepoints.stderr b/tests/ui/parser/unicode-control-codepoints.stderr index fc071a9419142..ff3668829e37a 100644 --- a/tests/ui/parser/unicode-control-codepoints.stderr +++ b/tests/ui/parser/unicode-control-codepoints.stderr @@ -1,86 +1,32 @@ 
-error: unicode escape in byte string - --> $DIR/unicode-control-codepoints.rs:6:26 +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/unicode-control-codepoints.rs:6:22 | LL | println!("{:?}", b"us\u{202B}e\u{202A}r"); - | ^^^^^^^^ unicode escape in byte string + | ^^^^^^^^^^^^^^^^^^^^^^^ | - = help: unicode escape sequences cannot be used as a byte or in a byte string + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date -error: unicode escape in byte string - --> $DIR/unicode-control-codepoints.rs:6:35 - | -LL | println!("{:?}", b"us\u{202B}e\u{202A}r"); - | ^^^^^^^^ unicode escape in byte string - | - = help: unicode escape sequences cannot be used as a byte or in a byte string - -error: non-ASCII character in byte string literal - --> $DIR/unicode-control-codepoints.rs:16:26 - | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{202e}' - | -help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes - | -LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only "); - | ~~~~~~~~~~~~ - -error: non-ASCII character in byte string literal - --> $DIR/unicode-control-codepoints.rs:16:30 - | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2066}' - | -help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes - | -LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only "); - | ~~~~~~~~~~~~ - -error: non-ASCII character in byte string literal - --> $DIR/unicode-control-codepoints.rs:16:41 +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/unicode-control-codepoints.rs:15:22 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2069}' + | 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | -help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes - | -LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only "); - | ~~~~~~~~~~~~ - -error: non-ASCII character in byte string literal - --> $DIR/unicode-control-codepoints.rs:16:43 - | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2066}' - | -help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes - | -LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only "); - | ~~~~~~~~~~~~ - -error: non-ASCII character in raw byte string literal - --> $DIR/unicode-control-codepoints.rs:21:29 - | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{202e}' + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date -error: non-ASCII character in raw byte string literal - --> $DIR/unicode-control-codepoints.rs:21:33 +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/unicode-control-codepoints.rs:17:22 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2066}' - -error: non-ASCII character in raw byte string literal - --> $DIR/unicode-control-codepoints.rs:21:44 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2069}' - -error: non-ASCII character in raw byte string literal - --> $DIR/unicode-control-codepoints.rs:21:46 - | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2066}' + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built 
on YYYY-MM-DD; consider upgrading it if it is out of date error: unicode codepoint changing visible direction of text present in comment --> $DIR/unicode-control-codepoints.rs:2:5 @@ -97,7 +43,7 @@ LL | // if access_level != "user" { // Check if admin = help: if their presence wasn't intentional, you can remove them error: unicode codepoint changing visible direction of text present in comment - --> $DIR/unicode-control-codepoints.rs:30:1 + --> $DIR/unicode-control-codepoints.rs:23:1 | LL | //"/* } if isAdmin begin admins only */" | ^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^ @@ -112,7 +58,7 @@ LL | //"/* } if isAdmin begin admins only */" = help: if their presence wasn't intentional, you can remove them error: unicode codepoint changing visible direction of text present in literal - --> $DIR/unicode-control-codepoints.rs:11:22 + --> $DIR/unicode-control-codepoints.rs:10:22 | LL | println!("{:?}", "/* } if isAdmin begin admins only "); | ^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^ @@ -132,7 +78,7 @@ LL | println!("{:?}", "/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} begi | ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ error: unicode codepoint changing visible direction of text present in literal - --> $DIR/unicode-control-codepoints.rs:14:22 + --> $DIR/unicode-control-codepoints.rs:13:22 | LL | println!("{:?}", r##"/* } if isAdmin begin admins only "##); | ^^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^ @@ -151,7 +97,7 @@ LL | println!("{:?}", r##"/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} b | ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ error: unicode codepoint changing visible direction of text present in literal - --> $DIR/unicode-control-codepoints.rs:26:22 + --> $DIR/unicode-control-codepoints.rs:19:22 | LL | println!("{:?}", ''); | ^- @@ -167,7 +113,7 @@ LL | println!("{:?}", '\u{202e}'); | ~~~~~~~~ error: unicode codepoint changing visible direction of text present in doc comment - --> $DIR/unicode-control-codepoints.rs:33:1 + --> $DIR/unicode-control-codepoints.rs:26:1 | LL | 
/** ''); */fn foo() {} | ^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint @@ -177,7 +123,7 @@ LL | /** ''); */fn foo() {} = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}' error: unicode codepoint changing visible direction of text present in doc comment - --> $DIR/unicode-control-codepoints.rs:36:1 + --> $DIR/unicode-control-codepoints.rs:29:1 | LL | / /** LL | | * @@ -188,5 +134,6 @@ LL | | * ''); */fn bar() {} = note: if their presence wasn't intentional, you can remove them = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}' -error: aborting due to 17 previous errors +error: aborting due to 10 previous errors +For more information about this error, try `rustc --explain E0658`. diff --git a/tests/ui/suggestions/multibyte-escapes.rs b/tests/ui/suggestions/multibyte-escapes.rs index c4105186244db..bbb238e044952 100644 --- a/tests/ui/suggestions/multibyte-escapes.rs +++ b/tests/ui/suggestions/multibyte-escapes.rs @@ -10,9 +10,4 @@ fn main() { //~^ ERROR: non-ASCII character in byte literal //~| NOTE: this multibyte character does not fit into a single byte //~| NOTE: must be ASCII - - b"字"; - //~^ ERROR: non-ASCII character in byte string literal - //~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes - //~| NOTE: must be ASCII } diff --git a/tests/ui/suggestions/multibyte-escapes.stderr b/tests/ui/suggestions/multibyte-escapes.stderr index 1e7c43e6538f6..e35d29b923844 100644 --- a/tests/ui/suggestions/multibyte-escapes.stderr +++ b/tests/ui/suggestions/multibyte-escapes.stderr @@ -18,16 +18,5 @@ LL | b'字'; | must be ASCII | this multibyte character does not fit into a single byte -error: non-ASCII character in byte string literal - --> $DIR/multibyte-escapes.rs:14:7 - | -LL | b"字"; - | ^^ must be ASCII - | -help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes - | -LL | 
b"\xE5\xAD\x97"; - | ~~~~~~~~~~~~ - -error: aborting due to 3 previous errors +error: aborting due to 2 previous errors