Skip to content

Commit

Permalink
Reserve guarded string literals (RFC 3593)
Browse files Browse the repository at this point in the history
  • Loading branch information
pitaj committed Oct 9, 2024
1 parent 6f4ae0f commit 321a5db
Show file tree
Hide file tree
Showing 23 changed files with 1,514 additions and 9 deletions.
93 changes: 85 additions & 8 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ pub enum TokenKind {
/// for emoji identifier recovery, as those are not meant to be ever accepted.
InvalidPrefix,

/// Guarded string literal prefix: `#"` or `##`.
///
/// Used for reserving "guarded strings" (RFC 3593) in edition 2024.
/// Split into the component tokens on older editions.
GuardedStrPrefix,

/// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
/// suffix, but may be present here on string and float literals. Users of
/// this type will need to check for and reject that case.
Expand Down Expand Up @@ -191,30 +197,41 @@ pub enum DocStyle {
/// `rustc_ast::ast::LitKind`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
/// "12_u8", "0o100", "0b120i99", "1f32".
/// `12_u8`, `0o100`, `0b120i99`, `1f32`.
Int { base: Base, empty_int: bool },
/// "12.34f32", "1e3", but not "1f32".
/// `12.34f32`, `1e3`, but not `1f32`.
Float { base: Base, empty_exponent: bool },
/// "'a'", "'\\'", "'''", "';"
/// `'a'`, `'\\'`, `'''`, `';`
Char { terminated: bool },
/// "b'a'", "b'\\'", "b'''", "b';"
/// `b'a'`, `b'\\'`, `b'''`, `b';`
Byte { terminated: bool },
/// ""abc"", ""abc"
/// `"abc"`, `"abc`
Str { terminated: bool },
/// "b"abc"", "b"abc"
/// `b"abc"`, `b"abc`
ByteStr { terminated: bool },
/// `c"abc"`, `c"abc`
CStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
/// `cr"abc"`, `cr#"abc"#`, `cr#"a`. `None` indicates an invalid literal.
RawCStr { n_hashes: Option<u8> },
}

/// `#"abc"#`, `##"a"` (fewer closing), or even `#"a` (unterminated).
///
/// Can capture fewer closing hashes than starting hashes,
/// for more efficient lexing and better backwards diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct GuardedStr {
/// Number of leading `#` characters consumed before the opening `"`.
/// The number of closing hashes actually consumed is not recorded.
pub n_hashes: u32,
/// `false` when lexing the string body reported it as unterminated.
pub terminated: bool,
/// Value of `Cursor::pos_within_token` once lexing stopped; callers add it
/// to the token's start position to compute the end of the token.
pub token_len: u32,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
/// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
Expand Down Expand Up @@ -403,6 +420,12 @@ impl Cursor<'_> {
TokenKind::Literal { kind: literal_kind, suffix_start }
}

// Guarded string literal prefix: `#"` or `##`
'#' if matches!(self.first(), '"' | '#') => {
self.bump();
TokenKind::GuardedStrPrefix
}

// One-symbol tokens.
';' => Semi,
',' => Comma,
Expand Down Expand Up @@ -780,6 +803,60 @@ impl Cursor<'_> {
false
}

/// Tries to lex a guarded string literal (`#"..."#`, `##"..."##`, ...) at the
/// current cursor position.
///
/// `rustc_parse::lexer` calls this so that guarded strings can be handled
/// differently depending on the edition.
///
/// Returns `None` when the run of `#` characters is not followed by `"`.
/// In that case the `Cursor` is left wherever lexing stopped; restoring it
/// is the caller's responsibility.
///
/// Otherwise returns a `GuardedStr` holding the number of opening hashes,
/// whether the string body was terminated, and the token length, after
/// calling `reset_pos_within_token`.
pub fn guarded_double_quoted_string(&mut self) -> Option<GuardedStr> {
    debug_assert!(self.prev() != '#');

    // Consume the opening guards; only a `"` may follow them.
    let mut opening_hashes: u32 = 0;
    loop {
        match self.first() {
            '#' => {
                opening_hashes += 1;
                self.bump();
            }
            '"' => break,
            _ => return None,
        }
    }
    self.bump();
    debug_assert!(self.prev() == '"');

    // The body is lexed exactly like a normal string literal so that older
    // editions can later recover the plain string from it.
    let terminated = self.double_quoted_string();

    if terminated {
        // Consume at most as many closing `#` as there were opening ones:
        // for `###"abcde"####` the guarded string ends after the third
        // closing `#`, and the fourth is left to be lexed as its own token.
        let mut remaining = opening_hashes;
        while remaining > 0 && self.first() == '#' {
            self.bump();
            remaining -= 1;
        }

        // This is reserved syntax and always an error, so a mismatch between
        // the opening and closing hash counts does not need to be reported.
        self.eat_literal_suffix();
    }

    let token_len = self.pos_within_token();
    self.reset_pos_within_token();

    Some(GuardedStr { n_hashes: opening_hashes, terminated, token_len })
}

/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
// Wrap the actual function to handle the error with too many hashes.
Expand Down
3 changes: 3 additions & 0 deletions compiler/rustc_lint/messages.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,9 @@ lint_reserved_prefix = prefix `{$prefix}` is unknown
.label = unknown prefix
.suggestion = insert whitespace here to avoid this being parsed as a prefix in Rust 2021
lint_reserved_string = will be parsed as a guarded string in Rust 2024
.suggestion = insert whitespace here to avoid this being parsed as a guarded string in Rust 2024
lint_shadowed_into_iter =
this method call resolves to `<&{$target} as IntoIterator>::into_iter` (due to backwards compatibility), but will resolve to `<{$target} as IntoIterator>::into_iter` in Rust {$edition}
.use_iter_suggestion = use `.iter()` instead of `.into_iter()` to avoid ambiguity
Expand Down
3 changes: 3 additions & 0 deletions compiler/rustc_lint/src/context/diagnostics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ pub(super) fn decorate_lint(sess: &Session, diagnostic: BuiltinLintDiag, diag: &
lints::RawPrefix { label: label_span, suggestion: label_span.shrink_to_hi() }
.decorate_lint(diag);
}
BuiltinLintDiag::ReservedString(suggestion) => {
lints::ReservedString { suggestion }.decorate_lint(diag);
}
BuiltinLintDiag::UnusedBuiltinAttribute { attr_name, macro_name, invoc_span } => {
lints::UnusedBuiltinAttribute { invoc_span, attr_name, macro_name }.decorate_lint(diag);
}
Expand Down
7 changes: 7 additions & 0 deletions compiler/rustc_lint/src/lints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3053,3 +3053,10 @@ pub(crate) enum MutRefSugg {
#[derive(LintDiagnostic)]
#[diag(lint_unqualified_local_imports)]
pub(crate) struct UnqualifiedLocalImportsDiag {}

/// Lint diagnostic for `lint_reserved_string`: syntax that
/// "will be parsed as a guarded string in Rust 2024".
#[derive(LintDiagnostic)]
#[diag(lint_reserved_string)]
pub(crate) struct ReservedString {
// Where the machine-applicable fix inserts a single space.
#[suggestion(code = " ", applicability = "machine-applicable")]
pub suggestion: Span,
}
41 changes: 41 additions & 0 deletions compiler/rustc_lint_defs/src/builtin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ declare_lint_pass! {
RUST_2021_INCOMPATIBLE_OR_PATTERNS,
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
RUST_2021_PRELUDE_COLLISIONS,
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
RUST_2024_INCOMPATIBLE_PAT,
RUST_2024_PRELUDE_COLLISIONS,
SELF_CONSTRUCTOR_FROM_OUTER_ITEM,
Expand Down Expand Up @@ -4996,3 +4997,43 @@ declare_lint! {
Warn,
"detects pointer to integer transmutes in const functions and associated constants",
}

declare_lint! {
/// The `rust_2024_guarded_string_incompatible_syntax` lint detects `#` tokens
/// that will be parsed as part of a guarded string literal in Rust 2024.
///
/// ### Example
///
/// ```rust,edition2021,compile_fail
/// #![deny(rust_2024_guarded_string_incompatible_syntax)]
///
/// macro_rules! m {
/// (# $x:expr #) => ();
/// (# $x:expr) => ();
/// }
///
/// m!(#"hey"#);
/// m!(#"hello");
/// ```
///
/// {{produces}}
///
/// ### Explanation
///
/// Prior to Rust 2024, `#"hey"#` is three tokens: the first `#`
/// followed by the string literal `"hey"` then the final `#`.
/// In Rust 2024, the whole sequence is considered a single token.
///
/// This lint suggests adding whitespace between the leading `#`
/// and the string to keep them separated in Rust 2024.
// Allow this lint -- rustdoc doesn't yet support threading edition into this lint's parser.
#[allow(rustdoc::invalid_rust_codeblocks)]
pub RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
Allow,
"will be parsed as a guarded string in Rust 2024",
@future_incompatible = FutureIncompatibleInfo {
reason: FutureIncompatibilityReason::EditionError(Edition::Edition2024),
reference: "issue #123735 <https://github.com/rust-lang/rust/issues/123735>",
};
crate_level_only
}
2 changes: 2 additions & 0 deletions compiler/rustc_lint_defs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,8 @@ pub enum BuiltinLintDiag {
ReservedPrefix(Span, String),
/// `'r#` in edition < 2021.
RawPrefix(Span),
/// `##` or `#"` in edition < 2024.
ReservedString(Span),
TrailingMacro(bool, Ident),
BreakWithLabelAndLoop(Span),
UnicodeTextFlow(Span, String),
Expand Down
4 changes: 4 additions & 0 deletions compiler/rustc_parse/messages.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,10 @@ parse_require_colon_after_labeled_expression = labeled expression must be follow
.label = the label
.suggestion = add `:` after the label
parse_reserved_string = invalid string literal
.note = unprefixed guarded string literals are reserved for future use since Rust 2024
.suggestion_whitespace = consider inserting whitespace here
parse_return_types_use_thin_arrow = return types are denoted using `->`
.suggestion = use `->` instead
Expand Down
18 changes: 18 additions & 0 deletions compiler/rustc_parse/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2110,6 +2110,24 @@ pub(crate) enum UnknownPrefixSugg {
},
}

/// Error for `parse_reserved_string` ("invalid string literal"), with a note
/// that unprefixed guarded string literals are reserved since Rust 2024.
#[derive(Diagnostic)]
#[diag(parse_reserved_string)]
#[note]
pub(crate) struct ReservedString {
// Span the error is reported on.
#[primary_span]
pub span: Span,
// Optional "insert whitespace" suggestion; `None` emits the error without it.
#[subdiagnostic]
pub sugg: Option<GuardedStringSugg>,
}
/// Suggestion attached to `ReservedString`: insert a single space at the
/// wrapped span. Marked "maybe-incorrect" and rendered in verbose style.
#[derive(Subdiagnostic)]
#[suggestion(
parse_suggestion_whitespace,
code = " ",
applicability = "maybe-incorrect",
style = "verbose"
)]
pub(crate) struct GuardedStringSugg(#[primary_span] pub Span);

#[derive(Diagnostic)]
#[diag(parse_too_many_hashes)]
pub(crate) struct TooManyHashes {
Expand Down
84 changes: 83 additions & 1 deletion compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ use rustc_lexer::unescape::{self, EscapeError, Mode};
use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError};
use rustc_session::lint::BuiltinLintDiag;
use rustc_session::lint::builtin::{
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
};
use rustc_session::parse::ParseSess;
use rustc_span::symbol::Symbol;
Expand Down Expand Up @@ -251,6 +252,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let prefix_span = self.mk_sp(start, lit_start);
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
}
rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
Expand Down Expand Up @@ -781,6 +783,86 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
}
}

/// Detect guarded string literal syntax
///
/// RFC 3593 reserved this syntax for future use. As of Rust 2024,
/// using this syntax produces an error. In earlier editions, however, it
/// only results in an (allowed by default) lint, and is treated as
/// separate tokens.
///
/// `start` is the position of the leading `#`, and `str_before` is the source
/// text from which a fresh `Cursor` is built to re-lex the guarded string
/// (NOTE(review): presumably the text beginning at `start` — confirm at the
/// call site).
///
/// Returns an error literal token in Rust 2024 and later, and `token::Pound`
/// (after rewinding to just past the first `#`) in earlier editions.
fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
let span = self.mk_sp(start, self.pos);
let edition2024 = span.edition().at_least_rust_2024();

// Zero-width span one byte past `start`, i.e. right after the first `#`;
// this is where the "insert whitespace" suggestions point.
let space_pos = start + BytePos(1);
let space_span = self.mk_sp(space_pos, space_pos);

// Re-lex on a scratch cursor so `self.cursor` is only replaced when wanted.
let mut cursor = Cursor::new(str_before);

let (span, unterminated) = match cursor.guarded_double_quoted_string() {
Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
let end = start + BytePos(token_len);
let span = self.mk_sp(start, end);
// Position just past the leading hashes.
let str_start = start + BytePos(n_hashes);

// Only Rust 2024+ adopts the scratch cursor and moves past the whole
// guarded string; older editions rewind further below.
if edition2024 {
self.cursor = cursor;
self.pos = end;
}

let unterminated = if terminated { None } else { Some(str_start) };

(span, unterminated)
}
_ => {
// We should only get here in the `##+` case.
debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");

(span, None)
}
};
if edition2024 {
if let Some(str_start) = unterminated {
// Only a fatal error if string is unterminated.
self.dcx()
.struct_span_fatal(
self.mk_sp(str_start, self.pos),
"unterminated double quote string",
)
.with_code(E0765)
.emit()
}

// No whitespace suggestion for spans that come from a macro expansion.
let sugg = if span.from_expansion() {
None
} else {
Some(errors::GuardedStringSugg(space_span))
};

// In Edition 2024 and later, emit a hard error.
let err = self.dcx().emit_err(errors::ReservedString { span, sugg });

token::Literal(token::Lit {
kind: token::Err(err),
symbol: self.symbol_from_to(start, self.pos),
suffix: None,
})
} else {
// Before Rust 2024, only emit a lint for migration.
self.psess.buffer_lint(
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
span,
ast::CRATE_NODE_ID,
BuiltinLintDiag::ReservedString(space_span),
);

// For backwards compatibility, roll back to after just the first `#`
// and return the `Pound` token.
self.pos = start + BytePos(1);
self.cursor = Cursor::new(&str_before[1..]);
token::Pound
}
}

fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
}
Expand Down
1 change: 1 addition & 0 deletions src/librustdoc/html/highlight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,7 @@ impl<'src> Classifier<'src> {
// Number literals.
LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number,
},
TokenKind::GuardedStrPrefix => return no_highlight(sink),
TokenKind::Ident | TokenKind::RawIdent if lookahead == Some(TokenKind::Bang) => {
self.in_macro = true;
sink(Highlight::EnterSpan { class: Class::Macro(self.new_span(before, text)) });
Expand Down
6 changes: 6 additions & 0 deletions src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,12 @@ impl<'a> Converter<'a> {
}

rustc_lexer::TokenKind::RawIdent => IDENT,

rustc_lexer::TokenKind::GuardedStrPrefix => {
err = "Invalid string literal (reserved syntax)";
ERROR
},

rustc_lexer::TokenKind::Literal { kind, .. } => {
self.extend_literal(token_text.len(), kind);
return;
Expand Down
Loading

0 comments on commit 321a5db

Please sign in to comment.