Update lexer emoji diagnostics to Unicode 15.0

This commit is contained in:
Charles Lew 2023-07-29 08:47:21 +08:00
parent 04abc370b9
commit bca79a26d8
6 changed files with 36 additions and 76 deletions

View file

@ -3785,7 +3785,7 @@ name = "rustc_lexer"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"expect-test", "expect-test",
"unic-emoji-char", "unicode-properties",
"unicode-xid", "unicode-xid",
] ]
@ -5445,38 +5445,6 @@ dependencies = [
"tempfile", "tempfile",
] ]
[[package]]
name = "unic-char-property"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
dependencies = [
"unic-char-range",
]
[[package]]
name = "unic-char-range"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
[[package]]
name = "unic-common"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
[[package]]
name = "unic-emoji-char"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
dependencies = [
"unic-char-property",
"unic-char-range",
"unic-ucd-version",
]
[[package]] [[package]]
name = "unic-langid" name = "unic-langid"
version = "0.9.1" version = "0.9.1"
@ -5520,15 +5488,6 @@ dependencies = [
"unic-langid-impl", "unic-langid-impl",
] ]
[[package]]
name = "unic-ucd-version"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
dependencies = [
"unic-common",
]
[[package]] [[package]]
name = "unicase" name = "unicase"
version = "2.6.0" version = "2.6.0"
@ -5566,6 +5525,12 @@ dependencies = [
"tinyvec", "tinyvec",
] ]
[[package]]
name = "unicode-properties"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7f91c8b21fbbaa18853c3d0801c78f4fc94cdb976699bb03e832e75f7fd22f0"
[[package]] [[package]]
name = "unicode-script" name = "unicode-script"
version = "0.5.5" version = "0.5.5"

View file

@ -16,7 +16,11 @@ Rust lexer used by rustc. No stability guarantees are provided.
# Note that this crate purposefully does not depend on other rustc crates # Note that this crate purposefully does not depend on other rustc crates
[dependencies] [dependencies]
unicode-xid = "0.2.0" unicode-xid = "0.2.0"
unic-emoji-char = "0.9.0"
[dependencies.unicode-properties]
version = "0.1.0"
default-features = false
features = ["emoji"]
[dev-dependencies] [dev-dependencies]
expect-test = "1.4.0" expect-test = "1.4.0"

View file

@ -34,6 +34,7 @@ pub use crate::cursor::Cursor;
use self::LiteralKind::*; use self::LiteralKind::*;
use self::TokenKind::*; use self::TokenKind::*;
use crate::cursor::EOF_CHAR; use crate::cursor::EOF_CHAR;
use unicode_properties::UnicodeEmoji;
/// Parsed token. /// Parsed token.
/// It doesn't contain information about data that has been parsed, /// It doesn't contain information about data that has been parsed,
@ -428,9 +429,7 @@ impl Cursor<'_> {
Literal { kind, suffix_start } Literal { kind, suffix_start }
} }
// Identifier starting with an emoji. Only lexed for graceful error recovery. // Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => { c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
self.fake_ident_or_unknown_prefix()
}
_ => Unknown, _ => Unknown,
}; };
let res = Token::new(token_kind, self.pos_within_token()); let res = Token::new(token_kind, self.pos_within_token());
@ -514,9 +513,7 @@ impl Cursor<'_> {
// we see a prefix here, it is definitely an unknown prefix. // we see a prefix here, it is definitely an unknown prefix.
match self.first() { match self.first() {
'#' | '"' | '\'' => UnknownPrefix, '#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => { c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
self.fake_ident_or_unknown_prefix()
}
_ => Ident, _ => Ident,
} }
} }
@ -525,7 +522,7 @@ impl Cursor<'_> {
// Start is already eaten, eat the rest of identifier. // Start is already eaten, eat the rest of identifier.
self.eat_while(|c| { self.eat_while(|c| {
unicode_xid::UnicodeXID::is_xid_continue(c) unicode_xid::UnicodeXID::is_xid_continue(c)
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c)) || (!c.is_ascii() && c.is_emoji_char())
|| c == '\u{200d}' || c == '\u{200d}'
}); });
// Known prefixes must have been handled earlier. So if // Known prefixes must have been handled earlier. So if

View file

@ -270,18 +270,14 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
"twox-hash", "twox-hash",
"type-map", "type-map",
"typenum", "typenum",
"unic-char-property",
"unic-char-range",
"unic-common",
"unic-emoji-char",
"unic-langid", "unic-langid",
"unic-langid-impl", "unic-langid-impl",
"unic-langid-macros", "unic-langid-macros",
"unic-langid-macros-impl", "unic-langid-macros-impl",
"unic-ucd-version",
"unicase", "unicase",
"unicode-ident", "unicode-ident",
"unicode-normalization", "unicode-normalization",
"unicode-properties",
"unicode-script", "unicode-script",
"unicode-security", "unicode-security",
"unicode-width", "unicode-width",

View file

@ -1,9 +1,7 @@
fn invalid_emoji_usages() { fn invalid_emoji_usages() {
let arrow↔ = "basic emoji"; //~ ERROR: identifiers cannot contain emoji let arrow↔ = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
// FIXME let planet🪐 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
let planet🪐 = "basic emoji"; //~ ERROR: unknown start of token let wireless🛜 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
// FIXME
let wireless🛜 = "basic emoji"; //~ ERROR: unknown start of token
// FIXME // FIXME
let key1⃣ = "keycap sequence"; //~ ERROR: unknown start of token let key1⃣ = "keycap sequence"; //~ ERROR: unknown start of token
//~^ WARN: identifier contains uncommon Unicode codepoints //~^ WARN: identifier contains uncommon Unicode codepoints

View file

@ -1,17 +1,5 @@
error: unknown start of token: \u{1fa90}
--> $DIR/lex-emoji-identifiers.rs:4:15
|
LL | let planet🪐 = "basic emoji";
| ^^
error: unknown start of token: \u{1f6dc}
--> $DIR/lex-emoji-identifiers.rs:6:17
|
LL | let wireless🛜 = "basic emoji";
| ^^
error: unknown start of token: \u{20e3} error: unknown start of token: \u{20e3}
--> $DIR/lex-emoji-identifiers.rs:8:14 --> $DIR/lex-emoji-identifiers.rs:6:14
| |
LL | let key1⃣ = "keycap sequence"; LL | let key1⃣ = "keycap sequence";
| ^ | ^
@ -22,26 +10,38 @@ error: identifiers cannot contain emoji: `arrow↔`
LL | let arrow↔ = "basic emoji"; LL | let arrow↔ = "basic emoji";
| ^^^^^^ | ^^^^^^
error: identifiers cannot contain emoji: `planet🪐`
--> $DIR/lex-emoji-identifiers.rs:3:9
|
LL | let planet🪐 = "basic emoji";
| ^^^^^^^^
error: identifiers cannot contain emoji: `wireless🛜`
--> $DIR/lex-emoji-identifiers.rs:4:9
|
LL | let wireless🛜 = "basic emoji";
| ^^^^^^^^^^
error: identifiers cannot contain emoji: `flag🇺🇳` error: identifiers cannot contain emoji: `flag🇺🇳`
--> $DIR/lex-emoji-identifiers.rs:10:9 --> $DIR/lex-emoji-identifiers.rs:8:9
| |
LL | let flag🇺🇳 = "flag sequence"; LL | let flag🇺🇳 = "flag sequence";
| ^^^^^^ | ^^^^^^
error: identifiers cannot contain emoji: `wales🏴` error: identifiers cannot contain emoji: `wales🏴`
--> $DIR/lex-emoji-identifiers.rs:11:9 --> $DIR/lex-emoji-identifiers.rs:9:9
| |
LL | let wales🏴 = "tag sequence"; LL | let wales🏴 = "tag sequence";
| ^^^^^^^ | ^^^^^^^
error: identifiers cannot contain emoji: `folded🙏🏿` error: identifiers cannot contain emoji: `folded🙏🏿`
--> $DIR/lex-emoji-identifiers.rs:12:9 --> $DIR/lex-emoji-identifiers.rs:10:9
| |
LL | let folded🙏🏿 = "modifier sequence"; LL | let folded🙏🏿 = "modifier sequence";
| ^^^^^^^^^^ | ^^^^^^^^^^
warning: identifier contains uncommon Unicode codepoints warning: identifier contains uncommon Unicode codepoints
--> $DIR/lex-emoji-identifiers.rs:8:9 --> $DIR/lex-emoji-identifiers.rs:6:9
| |
LL | let key1⃣ = "keycap sequence"; LL | let key1⃣ = "keycap sequence";
| ^^^^ | ^^^^