Update lexer emoji diagnostics to Unicode 15.0

2023-07-29 08:47:21 +08:00 · 2023-07-29 08:47:21 +08:00 · bca79a26d8
commit bca79a26d8
parent 04abc370b9
6 changed files with 36 additions and 76 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3785,7 +3785,7 @@ name = "rustc_lexer"
 version = "0.1.0"
 dependencies = [
 "expect-test",
- "unic-emoji-char",
+ "unicode-properties",
 "unicode-xid",
 ]
@@ -5445,38 +5445,6 @@ dependencies = [
 "tempfile",
 ]
-[[package]]
-name = "unic-char-property"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
-dependencies = [
- "unic-char-range",
-]
-[[package]]
-name = "unic-char-range"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
-[[package]]
-name = "unic-common"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
-[[package]]
-name = "unic-emoji-char"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
-dependencies = [
- "unic-char-property",
- "unic-char-range",
- "unic-ucd-version",
-]
 [[package]]
 name = "unic-langid"
 version = "0.9.1"
@@ -5520,15 +5488,6 @@ dependencies = [
 "unic-langid-impl",
 ]
-[[package]]
-name = "unic-ucd-version"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
-dependencies = [
- "unic-common",
-]
 [[package]]
 name = "unicase"
 version = "2.6.0"
@@ -5566,6 +5525,12 @@ dependencies = [
 "tinyvec",
 ]
+[[package]]
+name = "unicode-properties"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7f91c8b21fbbaa18853c3d0801c78f4fc94cdb976699bb03e832e75f7fd22f0"
 [[package]]
 name = "unicode-script"
 version = "0.5.5"
--- a/compiler/rustc_lexer/Cargo.toml
+++ b/compiler/rustc_lexer/Cargo.toml
@@ -16,7 +16,11 @@ Rust lexer used by rustc. No stability guarantees are provided.
 # Note that this crate purposefully does not depend on other rustc crates
 [dependencies]
 unicode-xid = "0.2.0"
-unic-emoji-char = "0.9.0"
+
+[dependencies.unicode-properties]
+version = "0.1.0"
+default-features = false
+features = ["emoji"]
 [dev-dependencies]
 expect-test = "1.4.0"
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -34,6 +34,7 @@ pub use crate::cursor::Cursor;
 use self::LiteralKind::*;
 use self::TokenKind::*;
 use crate::cursor::EOF_CHAR;
+use unicode_properties::UnicodeEmoji;
 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@@ -428,9 +429,7 @@ impl Cursor<'_> {
                Literal { kind, suffix_start }
            }
            // Identifier starting with an emoji. Only lexed for graceful error recovery.
-            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+            c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
-                self.fake_ident_or_unknown_prefix()
-            }
            _ => Unknown,
        };
        let res = Token::new(token_kind, self.pos_within_token());
@@ -514,9 +513,7 @@ impl Cursor<'_> {
        // we see a prefix here, it is definitely an unknown prefix.
        match self.first() {
            '#' | '"' | '\'' => UnknownPrefix,
-            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+            c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
-                self.fake_ident_or_unknown_prefix()
-            }
            _ => Ident,
        }
    }
@@ -525,7 +522,7 @@ impl Cursor<'_> {
        // Start is already eaten, eat the rest of identifier.
        self.eat_while(|c| {
            unicode_xid::UnicodeXID::is_xid_continue(c)
-                || (!c.is_ascii() && unic_emoji_char::is_emoji(c))
+                || (!c.is_ascii() && c.is_emoji_char())
                || c == '\u{200d}'
        });
        // Known prefixes must have been handled earlier. So if
--- a/src/tools/tidy/src/deps.rs
+++ b/src/tools/tidy/src/deps.rs
@@ -270,18 +270,14 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
    "twox-hash",
    "type-map",
    "typenum",
-    "unic-char-property",
-    "unic-char-range",
-    "unic-common",
-    "unic-emoji-char",
    "unic-langid",
    "unic-langid-impl",
    "unic-langid-macros",
    "unic-langid-macros-impl",
-    "unic-ucd-version",
    "unicase",
    "unicode-ident",
    "unicode-normalization",
+    "unicode-properties",
    "unicode-script",
    "unicode-security",
    "unicode-width",
--- a/tests/ui/lexer/lex-emoji-identifiers.rs
+++ b/tests/ui/lexer/lex-emoji-identifiers.rs
@@ -1,9 +1,7 @@
 fn invalid_emoji_usages() {
    let arrow↔️ = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
-    // FIXME
+    let planet🪐 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
-    let planet🪐 = "basic emoji"; //~ ERROR: unknown start of token
+    let wireless🛜 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
-    // FIXME
-    let wireless🛜 = "basic emoji"; //~ ERROR: unknown start of token
    // FIXME
    let key1️⃣ = "keycap sequence"; //~ ERROR: unknown start of token
                                    //~^ WARN: identifier contains uncommon Unicode codepoints
--- a/tests/ui/lexer/lex-emoji-identifiers.stderr
+++ b/tests/ui/lexer/lex-emoji-identifiers.stderr
@@ -1,17 +1,5 @@
-error: unknown start of token: \u{1fa90}
-  --> $DIR/lex-emoji-identifiers.rs:4:15
-   |
-LL |     let planet🪐 = "basic emoji";
-   |               ^^
-error: unknown start of token: \u{1f6dc}
-  --> $DIR/lex-emoji-identifiers.rs:6:17
-   |
-LL |     let wireless🛜 = "basic emoji";
-   |                 ^^
 error: unknown start of token: \u{20e3}
-  --> $DIR/lex-emoji-identifiers.rs:8:14
+  --> $DIR/lex-emoji-identifiers.rs:6:14
   |
 LL |     let key1️⃣ = "keycap sequence";
   |             ^
@@ -22,26 +10,38 @@ error: identifiers cannot contain emoji: `arrow↔️`
 LL |     let arrow↔️ = "basic emoji";
   |         ^^^^^^
+error: identifiers cannot contain emoji: `planet🪐`
+  --> $DIR/lex-emoji-identifiers.rs:3:9
+   |
+LL |     let planet🪐 = "basic emoji";
+   |         ^^^^^^^^
+error: identifiers cannot contain emoji: `wireless🛜`
+  --> $DIR/lex-emoji-identifiers.rs:4:9
+   |
+LL |     let wireless🛜 = "basic emoji";
+   |         ^^^^^^^^^^
 error: identifiers cannot contain emoji: `flag🇺🇳`
-  --> $DIR/lex-emoji-identifiers.rs:10:9
+  --> $DIR/lex-emoji-identifiers.rs:8:9
   |
 LL |     let flag🇺🇳 = "flag sequence";
   |         ^^^^^^
 error: identifiers cannot contain emoji: `wales🏴`
-  --> $DIR/lex-emoji-identifiers.rs:11:9
+  --> $DIR/lex-emoji-identifiers.rs:9:9
   |
 LL |     let wales🏴 = "tag sequence";
   |         ^^^^^^^
 error: identifiers cannot contain emoji: `folded🙏🏿`
-  --> $DIR/lex-emoji-identifiers.rs:12:9
+  --> $DIR/lex-emoji-identifiers.rs:10:9
   |
 LL |     let folded🙏🏿 = "modifier sequence";
   |         ^^^^^^^^^^
 warning: identifier contains uncommon Unicode codepoints
-  --> $DIR/lex-emoji-identifiers.rs:8:9
+  --> $DIR/lex-emoji-identifiers.rs:6:9
   |
 LL |     let key1️⃣ = "keycap sequence";
   |         ^^^^