Rollup merge of #114193 - crlf0710:lexer_unicode15, r=Manishearth
Update lexer emoji diagnostics to Unicode 15.0. This replaces the `unic-emoji-char` dependency tree (which hasn't been updated for a while) with the `unicode-properties` crate, which contains Unicode 15.0 data. It improves diagnostics for emoji characters added in recent years (see tests). cc #101840 cc ``@Manishearth``
This commit is contained in:
commit
57c57a555b
6 changed files with 36 additions and 76 deletions
49
Cargo.lock
49
Cargo.lock
|
@ -3786,7 +3786,7 @@ name = "rustc_lexer"
|
|||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"expect-test",
|
||||
"unic-emoji-char",
|
||||
"unicode-properties",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
|
@ -5446,38 +5446,6 @@ dependencies = [
|
|||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unic-char-property"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
|
||||
dependencies = [
|
||||
"unic-char-range",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unic-char-range"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
|
||||
|
||||
[[package]]
|
||||
name = "unic-common"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
|
||||
|
||||
[[package]]
|
||||
name = "unic-emoji-char"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
|
||||
dependencies = [
|
||||
"unic-char-property",
|
||||
"unic-char-range",
|
||||
"unic-ucd-version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unic-langid"
|
||||
version = "0.9.1"
|
||||
|
@ -5521,15 +5489,6 @@ dependencies = [
|
|||
"unic-langid-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unic-ucd-version"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
|
||||
dependencies = [
|
||||
"unic-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.6.0"
|
||||
|
@ -5567,6 +5526,12 @@ dependencies = [
|
|||
"tinyvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-properties"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7f91c8b21fbbaa18853c3d0801c78f4fc94cdb976699bb03e832e75f7fd22f0"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-script"
|
||||
version = "0.5.5"
|
||||
|
|
|
@ -16,7 +16,11 @@ Rust lexer used by rustc. No stability guarantees are provided.
|
|||
# Note that this crate purposefully does not depend on other rustc crates
|
||||
[dependencies]
|
||||
unicode-xid = "0.2.0"
|
||||
unic-emoji-char = "0.9.0"
|
||||
|
||||
[dependencies.unicode-properties]
|
||||
version = "0.1.0"
|
||||
default-features = false
|
||||
features = ["emoji"]
|
||||
|
||||
[dev-dependencies]
|
||||
expect-test = "1.4.0"
|
||||
|
|
|
@ -34,6 +34,7 @@ pub use crate::cursor::Cursor;
|
|||
use self::LiteralKind::*;
|
||||
use self::TokenKind::*;
|
||||
use crate::cursor::EOF_CHAR;
|
||||
use unicode_properties::UnicodeEmoji;
|
||||
|
||||
/// Parsed token.
|
||||
/// It doesn't contain information about data that has been parsed,
|
||||
|
@ -428,9 +429,7 @@ impl Cursor<'_> {
|
|||
Literal { kind, suffix_start }
|
||||
}
|
||||
// Identifier starting with an emoji. Only lexed for graceful error recovery.
|
||||
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
|
||||
self.fake_ident_or_unknown_prefix()
|
||||
}
|
||||
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
|
||||
_ => Unknown,
|
||||
};
|
||||
let res = Token::new(token_kind, self.pos_within_token());
|
||||
|
@ -514,9 +513,7 @@ impl Cursor<'_> {
|
|||
// we see a prefix here, it is definitely an unknown prefix.
|
||||
match self.first() {
|
||||
'#' | '"' | '\'' => UnknownPrefix,
|
||||
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
|
||||
self.fake_ident_or_unknown_prefix()
|
||||
}
|
||||
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
|
||||
_ => Ident,
|
||||
}
|
||||
}
|
||||
|
@ -525,7 +522,7 @@ impl Cursor<'_> {
|
|||
// Start is already eaten, eat the rest of identifier.
|
||||
self.eat_while(|c| {
|
||||
unicode_xid::UnicodeXID::is_xid_continue(c)
|
||||
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
|
||||
|| (!c.is_ascii() && c.is_emoji_char())
|
||||
|| c == '\u{200d}'
|
||||
});
|
||||
// Known prefixes must have been handled earlier. So if
|
||||
|
|
|
@ -270,18 +270,14 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
|
|||
"twox-hash",
|
||||
"type-map",
|
||||
"typenum",
|
||||
"unic-char-property",
|
||||
"unic-char-range",
|
||||
"unic-common",
|
||||
"unic-emoji-char",
|
||||
"unic-langid",
|
||||
"unic-langid-impl",
|
||||
"unic-langid-macros",
|
||||
"unic-langid-macros-impl",
|
||||
"unic-ucd-version",
|
||||
"unicase",
|
||||
"unicode-ident",
|
||||
"unicode-normalization",
|
||||
"unicode-properties",
|
||||
"unicode-script",
|
||||
"unicode-security",
|
||||
"unicode-width",
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
fn invalid_emoji_usages() {
|
||||
let arrow↔️ = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
|
||||
// FIXME
|
||||
let planet🪐 = "basic emoji"; //~ ERROR: unknown start of token
|
||||
// FIXME
|
||||
let wireless🛜 = "basic emoji"; //~ ERROR: unknown start of token
|
||||
let planet🪐 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
|
||||
let wireless🛜 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
|
||||
// FIXME
|
||||
let key1️⃣ = "keycap sequence"; //~ ERROR: unknown start of token
|
||||
//~^ WARN: identifier contains uncommon Unicode codepoints
|
||||
|
|
|
@ -1,17 +1,5 @@
|
|||
error: unknown start of token: \u{1fa90}
|
||||
--> $DIR/lex-emoji-identifiers.rs:4:15
|
||||
|
|
||||
LL | let planet🪐 = "basic emoji";
|
||||
| ^^
|
||||
|
||||
error: unknown start of token: \u{1f6dc}
|
||||
--> $DIR/lex-emoji-identifiers.rs:6:17
|
||||
|
|
||||
LL | let wireless🛜 = "basic emoji";
|
||||
| ^^
|
||||
|
||||
error: unknown start of token: \u{20e3}
|
||||
--> $DIR/lex-emoji-identifiers.rs:8:14
|
||||
--> $DIR/lex-emoji-identifiers.rs:6:14
|
||||
|
|
||||
LL | let key1️⃣ = "keycap sequence";
|
||||
| ^
|
||||
|
@ -22,26 +10,38 @@ error: identifiers cannot contain emoji: `arrow↔️`
|
|||
LL | let arrow↔️ = "basic emoji";
|
||||
| ^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `planet🪐`
|
||||
--> $DIR/lex-emoji-identifiers.rs:3:9
|
||||
|
|
||||
LL | let planet🪐 = "basic emoji";
|
||||
| ^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `wireless🛜`
|
||||
--> $DIR/lex-emoji-identifiers.rs:4:9
|
||||
|
|
||||
LL | let wireless🛜 = "basic emoji";
|
||||
| ^^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `flag🇺🇳`
|
||||
--> $DIR/lex-emoji-identifiers.rs:10:9
|
||||
--> $DIR/lex-emoji-identifiers.rs:8:9
|
||||
|
|
||||
LL | let flag🇺🇳 = "flag sequence";
|
||||
| ^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `wales🏴`
|
||||
--> $DIR/lex-emoji-identifiers.rs:11:9
|
||||
--> $DIR/lex-emoji-identifiers.rs:9:9
|
||||
|
|
||||
LL | let wales🏴 = "tag sequence";
|
||||
| ^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `folded🙏🏿`
|
||||
--> $DIR/lex-emoji-identifiers.rs:12:9
|
||||
--> $DIR/lex-emoji-identifiers.rs:10:9
|
||||
|
|
||||
LL | let folded🙏🏿 = "modifier sequence";
|
||||
| ^^^^^^^^^^
|
||||
|
||||
warning: identifier contains uncommon Unicode codepoints
|
||||
--> $DIR/lex-emoji-identifiers.rs:8:9
|
||||
--> $DIR/lex-emoji-identifiers.rs:6:9
|
||||
|
|
||||
LL | let key1️⃣ = "keycap sequence";
|
||||
| ^^^^
|
||||
|
|
Loading…
Add table
Reference in a new issue