Skip to content

Commit

Permalink
Rollup merge of #114193 - crlf0710:lexer_unicode15, r=Manishearth
Browse files Browse the repository at this point in the history
Update lexer emoji diagnostics to Unicode 15.0

This replaces the `unic-emoji-char` dependency tree (which hasn't been updated for a while) with the `unicode-properties` crate, which contains Unicode 15.0 data.

Improves diagnostics for emoji characters added in recent years (see tests).

cc #101840

cc ``@Manishearth``
  • Loading branch information
matthiaskrgr authored Jul 31, 2023
2 parents 7c6942a + bca79a2 commit 57c57a5
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 76 deletions.
49 changes: 7 additions & 42 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3786,7 +3786,7 @@ name = "rustc_lexer"
version = "0.1.0"
dependencies = [
"expect-test",
"unic-emoji-char",
"unicode-properties",
"unicode-xid",
]

Expand Down Expand Up @@ -5446,38 +5446,6 @@ dependencies = [
"tempfile",
]

[[package]]
name = "unic-char-property"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
dependencies = [
"unic-char-range",
]

[[package]]
name = "unic-char-range"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"

[[package]]
name = "unic-common"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"

[[package]]
name = "unic-emoji-char"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
dependencies = [
"unic-char-property",
"unic-char-range",
"unic-ucd-version",
]

[[package]]
name = "unic-langid"
version = "0.9.1"
Expand Down Expand Up @@ -5521,15 +5489,6 @@ dependencies = [
"unic-langid-impl",
]

[[package]]
name = "unic-ucd-version"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
dependencies = [
"unic-common",
]

[[package]]
name = "unicase"
version = "2.6.0"
Expand Down Expand Up @@ -5567,6 +5526,12 @@ dependencies = [
"tinyvec",
]

[[package]]
name = "unicode-properties"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7f91c8b21fbbaa18853c3d0801c78f4fc94cdb976699bb03e832e75f7fd22f0"

[[package]]
name = "unicode-script"
version = "0.5.5"
Expand Down
6 changes: 5 additions & 1 deletion compiler/rustc_lexer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ Rust lexer used by rustc. No stability guarantees are provided.
# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
unicode-xid = "0.2.0"
unic-emoji-char = "0.9.0"

[dependencies.unicode-properties]
version = "0.1.0"
default-features = false
features = ["emoji"]

[dev-dependencies]
expect-test = "1.4.0"
11 changes: 4 additions & 7 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub use crate::cursor::Cursor;
use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::EOF_CHAR;
use unicode_properties::UnicodeEmoji;

/// Parsed token.
/// It doesn't contain information about data that has been parsed,
Expand Down Expand Up @@ -428,9 +429,7 @@ impl Cursor<'_> {
Literal { kind, suffix_start }
}
// Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
self.fake_ident_or_unknown_prefix()
}
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
_ => Unknown,
};
let res = Token::new(token_kind, self.pos_within_token());
Expand Down Expand Up @@ -514,9 +513,7 @@ impl Cursor<'_> {
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
self.fake_ident_or_unknown_prefix()
}
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
_ => Ident,
}
}
Expand All @@ -525,7 +522,7 @@ impl Cursor<'_> {
// Start is already eaten, eat the rest of identifier.
self.eat_while(|c| {
unicode_xid::UnicodeXID::is_xid_continue(c)
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
|| (!c.is_ascii() && c.is_emoji_char())
|| c == '\u{200d}'
});
// Known prefixes must have been handled earlier. So if
Expand Down
6 changes: 1 addition & 5 deletions src/tools/tidy/src/deps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,18 +270,14 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
"twox-hash",
"type-map",
"typenum",
"unic-char-property",
"unic-char-range",
"unic-common",
"unic-emoji-char",
"unic-langid",
"unic-langid-impl",
"unic-langid-macros",
"unic-langid-macros-impl",
"unic-ucd-version",
"unicase",
"unicode-ident",
"unicode-normalization",
"unicode-properties",
"unicode-script",
"unicode-security",
"unicode-width",
Expand Down
6 changes: 2 additions & 4 deletions tests/ui/lexer/lex-emoji-identifiers.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
fn invalid_emoji_usages() {
let arrow↔️ = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
// FIXME
let planet🪐 = "basic emoji"; //~ ERROR: unknown start of token
// FIXME
let wireless🛜 = "basic emoji"; //~ ERROR: unknown start of token
let planet🪐 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
let wireless🛜 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
// FIXME
let key1️⃣ = "keycap sequence"; //~ ERROR: unknown start of token
//~^ WARN: identifier contains uncommon Unicode codepoints
Expand Down
34 changes: 17 additions & 17 deletions tests/ui/lexer/lex-emoji-identifiers.stderr
Original file line number Diff line number Diff line change
@@ -1,17 +1,5 @@
error: unknown start of token: \u{1fa90}
--> $DIR/lex-emoji-identifiers.rs:4:15
|
LL | let planet🪐 = "basic emoji";
| ^^

error: unknown start of token: \u{1f6dc}
--> $DIR/lex-emoji-identifiers.rs:6:17
|
LL | let wireless🛜 = "basic emoji";
| ^^

error: unknown start of token: \u{20e3}
--> $DIR/lex-emoji-identifiers.rs:8:14
--> $DIR/lex-emoji-identifiers.rs:6:14
|
LL | let key1️⃣ = "keycap sequence";
| ^
Expand All @@ -22,26 +10,38 @@ error: identifiers cannot contain emoji: `arrow↔️`
LL | let arrow↔️ = "basic emoji";
| ^^^^^^

error: identifiers cannot contain emoji: `planet🪐`
--> $DIR/lex-emoji-identifiers.rs:3:9
|
LL | let planet🪐 = "basic emoji";
| ^^^^^^^^

error: identifiers cannot contain emoji: `wireless🛜`
--> $DIR/lex-emoji-identifiers.rs:4:9
|
LL | let wireless🛜 = "basic emoji";
| ^^^^^^^^^^

error: identifiers cannot contain emoji: `flag🇺🇳`
--> $DIR/lex-emoji-identifiers.rs:10:9
--> $DIR/lex-emoji-identifiers.rs:8:9
|
LL | let flag🇺🇳 = "flag sequence";
| ^^^^^^

error: identifiers cannot contain emoji: `wales🏴`
--> $DIR/lex-emoji-identifiers.rs:11:9
--> $DIR/lex-emoji-identifiers.rs:9:9
|
LL | let wales🏴 = "tag sequence";
| ^^^^^^^

error: identifiers cannot contain emoji: `folded🙏🏿`
--> $DIR/lex-emoji-identifiers.rs:12:9
--> $DIR/lex-emoji-identifiers.rs:10:9
|
LL | let folded🙏🏿 = "modifier sequence";
| ^^^^^^^^^^

warning: identifier contains uncommon Unicode codepoints
--> $DIR/lex-emoji-identifiers.rs:8:9
--> $DIR/lex-emoji-identifiers.rs:6:9
|
LL | let key1️⃣ = "keycap sequence";
| ^^^^
Expand Down

0 comments on commit 57c57a5

Please sign in to comment.