Rollup merge of #114193 - crlf0710:lexer_unicode15, r=Manishearth

Update lexer emoji diagnostics to Unicode 15.0

This replaces the `unic-emoji-char` dependency tree (which hasn't been updated for a while) with the `unicode-properties` crate, which contains Unicode 15.0 data.

Improves diagnostics for emoji characters added in recent years (see tests).

cc #101840

cc ``@Manishearth``
This commit is contained in:
Matthias Krüger 2023-07-31 22:51:15 +02:00 committed by GitHub
commit 57c57a555b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 36 additions and 76 deletions

View file

@ -3786,7 +3786,7 @@ name = "rustc_lexer"
version = "0.1.0"
dependencies = [
"expect-test",
"unic-emoji-char",
"unicode-properties",
"unicode-xid",
]
@ -5446,38 +5446,6 @@ dependencies = [
"tempfile",
]
[[package]]
name = "unic-char-property"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
dependencies = [
"unic-char-range",
]
[[package]]
name = "unic-char-range"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
[[package]]
name = "unic-common"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
[[package]]
name = "unic-emoji-char"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
dependencies = [
"unic-char-property",
"unic-char-range",
"unic-ucd-version",
]
[[package]]
name = "unic-langid"
version = "0.9.1"
@ -5521,15 +5489,6 @@ dependencies = [
"unic-langid-impl",
]
[[package]]
name = "unic-ucd-version"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
dependencies = [
"unic-common",
]
[[package]]
name = "unicase"
version = "2.6.0"
@ -5567,6 +5526,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-properties"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7f91c8b21fbbaa18853c3d0801c78f4fc94cdb976699bb03e832e75f7fd22f0"
[[package]]
name = "unicode-script"
version = "0.5.5"

View file

@ -16,7 +16,11 @@ Rust lexer used by rustc. No stability guarantees are provided.
# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
unicode-xid = "0.2.0"
unic-emoji-char = "0.9.0"
[dependencies.unicode-properties]
version = "0.1.0"
default-features = false
features = ["emoji"]
[dev-dependencies]
expect-test = "1.4.0"

View file

@ -34,6 +34,7 @@ pub use crate::cursor::Cursor;
use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::EOF_CHAR;
use unicode_properties::UnicodeEmoji;
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
@ -428,9 +429,7 @@ impl Cursor<'_> {
Literal { kind, suffix_start }
}
// Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
self.fake_ident_or_unknown_prefix()
}
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
_ => Unknown,
};
let res = Token::new(token_kind, self.pos_within_token());
@ -514,9 +513,7 @@ impl Cursor<'_> {
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
self.fake_ident_or_unknown_prefix()
}
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
_ => Ident,
}
}
@ -525,7 +522,7 @@ impl Cursor<'_> {
// Start is already eaten, eat the rest of identifier.
self.eat_while(|c| {
unicode_xid::UnicodeXID::is_xid_continue(c)
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
|| (!c.is_ascii() && c.is_emoji_char())
|| c == '\u{200d}'
});
// Known prefixes must have been handled earlier. So if

View file

@ -270,18 +270,14 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
"twox-hash",
"type-map",
"typenum",
"unic-char-property",
"unic-char-range",
"unic-common",
"unic-emoji-char",
"unic-langid",
"unic-langid-impl",
"unic-langid-macros",
"unic-langid-macros-impl",
"unic-ucd-version",
"unicase",
"unicode-ident",
"unicode-normalization",
"unicode-properties",
"unicode-script",
"unicode-security",
"unicode-width",

View file

@ -1,9 +1,7 @@
fn invalid_emoji_usages() {
let arrow = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
// FIXME
let planet🪐 = "basic emoji"; //~ ERROR: unknown start of token
// FIXME
let wireless🛜 = "basic emoji"; //~ ERROR: unknown start of token
let planet🪐 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
let wireless🛜 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
// FIXME
let key1 = "keycap sequence"; //~ ERROR: unknown start of token
//~^ WARN: identifier contains uncommon Unicode codepoints

View file

@ -1,17 +1,5 @@
error: unknown start of token: \u{1fa90}
--> $DIR/lex-emoji-identifiers.rs:4:15
|
LL | let planet🪐 = "basic emoji";
| ^^
error: unknown start of token: \u{1f6dc}
--> $DIR/lex-emoji-identifiers.rs:6:17
|
LL | let wireless🛜 = "basic emoji";
| ^^
error: unknown start of token: \u{20e3}
--> $DIR/lex-emoji-identifiers.rs:8:14
--> $DIR/lex-emoji-identifiers.rs:6:14
|
LL | let key1⃣ = "keycap sequence";
| ^
@ -22,26 +10,38 @@ error: identifiers cannot contain emoji: `arrow↔`
LL | let arrow↔ = "basic emoji";
| ^^^^^^
error: identifiers cannot contain emoji: `planet🪐`
--> $DIR/lex-emoji-identifiers.rs:3:9
|
LL | let planet🪐 = "basic emoji";
| ^^^^^^^^
error: identifiers cannot contain emoji: `wireless🛜`
--> $DIR/lex-emoji-identifiers.rs:4:9
|
LL | let wireless🛜 = "basic emoji";
| ^^^^^^^^^^
error: identifiers cannot contain emoji: `flag🇺🇳`
--> $DIR/lex-emoji-identifiers.rs:10:9
--> $DIR/lex-emoji-identifiers.rs:8:9
|
LL | let flag🇺🇳 = "flag sequence";
| ^^^^^^
error: identifiers cannot contain emoji: `wales🏴`
--> $DIR/lex-emoji-identifiers.rs:11:9
--> $DIR/lex-emoji-identifiers.rs:9:9
|
LL | let wales🏴 = "tag sequence";
| ^^^^^^^
error: identifiers cannot contain emoji: `folded🙏🏿`
--> $DIR/lex-emoji-identifiers.rs:12:9
--> $DIR/lex-emoji-identifiers.rs:10:9
|
LL | let folded🙏🏿 = "modifier sequence";
| ^^^^^^^^^^
warning: identifier contains uncommon Unicode codepoints
--> $DIR/lex-emoji-identifiers.rs:8:9
--> $DIR/lex-emoji-identifiers.rs:6:9
|
LL | let key1⃣ = "keycap sequence";
| ^^^^