From 21224e6ee07b917f7996b27f05c57327f806a026 Mon Sep 17 00:00:00 2001 From: Esteban Kuber Date: Thu, 9 Sep 2021 17:05:03 +0000 Subject: [PATCH] Account for confusable codepoints when recovering emoji identifiers --- compiler/rustc_parse/src/lexer/mod.rs | 15 +++++++++++++-- compiler/rustc_parse/src/lexer/unicode_chars.rs | 2 +- src/test/ui/parser/emoji-identifiers.rs | 3 ++- src/test/ui/parser/emoji-identifiers.stderr | 17 ++++++++++++++--- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index c4a3dd9bfda..9403e0af595 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,3 +1,4 @@ +use crate::lexer::unicode_chars::UNICODE_ARRAY; use rustc_ast::ast::{self, AttrStyle}; use rustc_ast::token::{self, CommentKind, Token, TokenKind}; use rustc_ast::tokenstream::{Spacing, TokenStream}; @@ -222,7 +223,17 @@ impl<'a> StringReader<'a> { } token::Ident(sym, is_raw_ident) } - rustc_lexer::TokenKind::InvalidIdent => { + rustc_lexer::TokenKind::InvalidIdent + // Do not recover an identifier with emojis if the codepoint is a confusable + // with a recoverable substitution token, like `โž–`. + if UNICODE_ARRAY + .iter() + .find(|&&(c, _, _)| { + let sym = self.str_from(start); + sym.chars().count() == 1 && c == sym.chars().next().unwrap() + }) + .is_none() => + { let sym = nfc_normalize(self.str_from(start)); let span = self.mk_sp(start, self.pos); self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span); @@ -299,7 +310,7 @@ impl<'a> StringReader<'a> { rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), - rustc_lexer::TokenKind::Unknown => { + rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { let c = self.str_from(start).chars().next().unwrap(); let mut err = self.struct_fatal_span_char(start, self.pos, "unknown start of token", c); diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs index 3eebc088f3f..ccd11f06bc5 100644 --- a/compiler/rustc_parse/src/lexer/unicode_chars.rs +++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs @@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder}; use rustc_span::{symbol::kw, BytePos, Pos, Span}; #[rustfmt::skip] // for line breaks -const UNICODE_ARRAY: &[(char, &str, char)] = &[ +pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[ ('โ€จ', "Line Separator", ' '), ('โ€ฉ', "Paragraph Separator", ' '), ('แš€', "Ogham Space mark", ' '), diff --git a/src/test/ui/parser/emoji-identifiers.rs b/src/test/ui/parser/emoji-identifiers.rs index 485fb29af9c..e07e0573e75 100644 --- a/src/test/ui/parser/emoji-identifiers.rs +++ b/src/test/ui/parser/emoji-identifiers.rs @@ -10,6 +10,7 @@ fn i_like_to_๐Ÿ˜…_a_lot() -> ๐Ÿ‘€ { //~ ERROR identifiers cannot contain emojis //~^ ERROR identifiers cannot contain emojis } fn main() { - let _ = i_like_to_๐Ÿ˜„_a_lot(); //~ ERROR cannot find function `i_like_to_๐Ÿ˜„_a_lot` in this scope + let _ = i_like_to_๐Ÿ˜„_a_lot() โž– 4; //~ ERROR cannot find function `i_like_to_๐Ÿ˜„_a_lot` in this scope //~^ ERROR identifiers cannot contain emojis + //~| ERROR unknown start of token: \u{2796} } diff --git a/src/test/ui/parser/emoji-identifiers.stderr b/src/test/ui/parser/emoji-identifiers.stderr index 84c3e3962ff..a73681d9196 100644 --- a/src/test/ui/parser/emoji-identifiers.stderr +++ b/src/test/ui/parser/emoji-identifiers.stderr @@ -1,16 +1,27 @@ +error: unknown start of token: \u{2796} + --> $DIR/emoji-identifiers.rs:13:33 + | +LL | let _ = i_like_to_๐Ÿ˜„_a_lot() โž– 4; + | ^^ + | +help: Unicode character 'โž–' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not + | +LL | let _ = i_like_to_๐Ÿ˜„_a_lot() - 4; + | ~ + error[E0425]: cannot find function `i_like_to_๐Ÿ˜„_a_lot` in this scope --> $DIR/emoji-identifiers.rs:13:13 | LL | fn i_like_to_๐Ÿ˜…_a_lot() -> ๐Ÿ‘€ { | ----------------------------- similarly named function `i_like_to_๐Ÿ˜…_a_lot` defined here ... -LL | let _ = i_like_to_๐Ÿ˜„_a_lot(); +LL | let _ = i_like_to_๐Ÿ˜„_a_lot() โž– 4; | ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_๐Ÿ˜…_a_lot` error: identifiers cannot contain emojis: `i_like_to_๐Ÿ˜„_a_lot` --> $DIR/emoji-identifiers.rs:13:13 | -LL | let _ = i_like_to_๐Ÿ˜„_a_lot(); +LL | let _ = i_like_to_๐Ÿ˜„_a_lot() โž– 4; | ^^^^^^^^^^^^^^^^^^ error: identifiers cannot contain emojis: `full_of_โœจ` @@ -66,7 +77,7 @@ LL | ๐Ÿ‘€::full_ofโœจ() | function or associated item not found in `๐Ÿ‘€` | help: there is an associated function with a similar name: `full_of_โœจ` -error: aborting due to 8 previous errors +error: aborting due to 9 previous errors Some errors have detailed explanations: E0425, E0599. For more information about an error, try `rustc --explain E0425`.