Rollup merge of #133201 - nnethercote:rm-TokenKind-InvalidPrefix, r=compiler-errors

Remove `TokenKind::InvalidPrefix`

It's not needed. Best reviewed one commit at a time.

r? `@estebank`
This commit is contained in:
Matthias Krüger 2024-11-19 22:24:47 +01:00 committed by GitHub
commit 841243f319
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 55 additions and 65 deletions

View file

@ -57,11 +57,10 @@ impl Token {
/// Enum representing common lexeme types. /// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind { pub enum TokenKind {
// Multi-char tokens: /// A line comment, e.g. `// comment`.
/// "// comment"
LineComment { doc_style: Option<DocStyle> }, LineComment { doc_style: Option<DocStyle> },
/// `/* block comment */` /// A block comment, e.g. `/* block comment */`.
/// ///
/// Block comments can be recursive, so a sequence like `/* /* */` /// Block comments can be recursive, so a sequence like `/* /* */`
/// will not be considered terminated and will result in a parsing error. /// will not be considered terminated and will result in a parsing error.
@ -70,18 +69,17 @@ pub enum TokenKind {
/// Any whitespace character sequence. /// Any whitespace character sequence.
Whitespace, Whitespace,
/// "ident" or "continue" /// An identifier or keyword, e.g. `ident` or `continue`.
///
/// At this step, keywords are also considered identifiers.
Ident, Ident,
/// Like the above, but containing invalid unicode codepoints. /// An identifier that is invalid because it contains emoji.
InvalidIdent, InvalidIdent,
/// "r#ident" /// A raw identifier, e.g. "r#ident".
RawIdent, RawIdent,
/// An unknown prefix, like `foo#`, `foo'`, `foo"`. /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
/// literal prefixes that contain emoji, which are considered "invalid".
/// ///
/// Note that only the /// Note that only the
/// prefix (`foo`) is included in the token, not the separator (which is /// prefix (`foo`) is included in the token, not the separator (which is
@ -93,87 +91,83 @@ pub enum TokenKind {
/// An unknown prefix in a lifetime, like `'foo#`. /// An unknown prefix in a lifetime, like `'foo#`.
/// ///
/// Note that like above, only the `'` and prefix are included in the token /// Like `UnknownPrefix`, only the `'` and prefix are included in the token
/// and not the separator. /// and not the separator.
UnknownPrefixLifetime, UnknownPrefixLifetime,
/// `'r#lt`, which in edition < 2021 is split into several tokens: `'r # lt`. /// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into
/// several tokens: `'r` and `#` and `foo`.
RawLifetime, RawLifetime,
/// Similar to the above, but *always* an error on every edition. This is used
/// for emoji identifier recovery, as those are not meant to be ever accepted.
InvalidPrefix,
/// Guarded string literal prefix: `#"` or `##`. /// Guarded string literal prefix: `#"` or `##`.
/// ///
/// Used for reserving "guarded strings" (RFC 3598) in edition 2024. /// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
/// Split into the component tokens on older editions. /// Split into the component tokens on older editions.
GuardedStrPrefix, GuardedStrPrefix,
/// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
/// suffix, but may be present here on string and float literals. Users of /// suffix, but may be present here on string and float literals. Users of
/// this type will need to check for and reject that case. /// this type will need to check for and reject that case.
/// ///
/// See [LiteralKind] for more details. /// See [LiteralKind] for more details.
Literal { kind: LiteralKind, suffix_start: u32 }, Literal { kind: LiteralKind, suffix_start: u32 },
/// "'a" /// A lifetime, e.g. `'a`.
Lifetime { starts_with_number: bool }, Lifetime { starts_with_number: bool },
// One-char tokens: /// `;`
/// ";"
Semi, Semi,
/// "," /// `,`
Comma, Comma,
/// "." /// `.`
Dot, Dot,
/// "(" /// `(`
OpenParen, OpenParen,
/// ")" /// `)`
CloseParen, CloseParen,
/// "{" /// `{`
OpenBrace, OpenBrace,
/// "}" /// `}`
CloseBrace, CloseBrace,
/// "[" /// `[`
OpenBracket, OpenBracket,
/// "]" /// `]`
CloseBracket, CloseBracket,
/// "@" /// `@`
At, At,
/// "#" /// `#`
Pound, Pound,
/// "~" /// `~`
Tilde, Tilde,
/// "?" /// `?`
Question, Question,
/// ":" /// `:`
Colon, Colon,
/// "$" /// `$`
Dollar, Dollar,
/// "=" /// `=`
Eq, Eq,
/// "!" /// `!`
Bang, Bang,
/// "<" /// `<`
Lt, Lt,
/// ">" /// `>`
Gt, Gt,
/// "-" /// `-`
Minus, Minus,
/// "&" /// `&`
And, And,
/// "|" /// `|`
Or, Or,
/// "+" /// `+`
Plus, Plus,
/// "*" /// `*`
Star, Star,
/// "/" /// `/`
Slash, Slash,
/// "^" /// `^`
Caret, Caret,
/// "%" /// `%`
Percent, Percent,
/// Unknown token, not expected by the lexer, e.g. "№" /// Unknown token, not expected by the lexer, e.g. "№"
@ -468,7 +462,7 @@ impl Cursor<'_> {
Literal { kind, suffix_start } Literal { kind, suffix_start }
} }
// Identifier starting with an emoji. Only lexed for graceful error recovery. // Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(), c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
_ => Unknown, _ => Unknown,
}; };
let res = Token::new(token_kind, self.pos_within_token()); let res = Token::new(token_kind, self.pos_within_token());
@ -552,24 +546,22 @@ impl Cursor<'_> {
// we see a prefix here, it is definitely an unknown prefix. // we see a prefix here, it is definitely an unknown prefix.
match self.first() { match self.first() {
'#' | '"' | '\'' => UnknownPrefix, '#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(), c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
_ => Ident, _ => Ident,
} }
} }
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind { fn invalid_ident(&mut self) -> TokenKind {
// Start is already eaten, eat the rest of identifier. // Start is already eaten, eat the rest of identifier.
self.eat_while(|c| { self.eat_while(|c| {
unicode_xid::UnicodeXID::is_xid_continue(c) const ZERO_WIDTH_JOINER: char = '\u{200d}';
|| (!c.is_ascii() && c.is_emoji_char()) is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
|| c == '\u{200d}'
}); });
// Known prefixes must have been handled earlier. So if // An invalid identifier followed by '#' or '"' or '\'' could be
// we see a prefix here, it is definitely an unknown prefix. // interpreted as an invalid literal prefix. We don't bother doing that
match self.first() { // because the treatment of invalid identifiers and invalid prefixes
'#' | '"' | '\'' => InvalidPrefix, // would be the same.
_ => InvalidIdent, InvalidIdent
}
} }
fn c_or_byte_string( fn c_or_byte_string(

View file

@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let ident = Symbol::intern(lifetime_name); let ident = Symbol::intern(lifetime_name);
token::Lifetime(ident, IdentIsRaw::No) token::Lifetime(ident, IdentIsRaw::No)
} }
rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix rustc_lexer::TokenKind::InvalidIdent
// Do not recover an identifier with emoji if the codepoint is a confusable // Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like `➖`. // with a recoverable substitution token, like `➖`.
if !UNICODE_ARRAY.iter().any(|&(c, _, _)| { if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
rustc_lexer::TokenKind::Unknown rustc_lexer::TokenKind::Unknown
| rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidIdent => {
| rustc_lexer::TokenKind::InvalidPrefix => {
// Don't emit diagnostics for sequences of the same invalid token // Don't emit diagnostics for sequences of the same invalid token
if swallow_next_invalid > 0 { if swallow_next_invalid > 0 {
swallow_next_invalid -= 1; swallow_next_invalid -= 1;

View file

@ -861,10 +861,9 @@ impl<'src> Classifier<'src> {
}, },
Some(c) => c, Some(c) => c,
}, },
TokenKind::RawIdent TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
| TokenKind::UnknownPrefix Class::Ident(self.new_span(before, text))
| TokenKind::InvalidPrefix }
| TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)),
TokenKind::Lifetime { .. } TokenKind::Lifetime { .. }
| TokenKind::RawLifetime | TokenKind::RawLifetime
| TokenKind::UnknownPrefixLifetime => Class::Lifetime, | TokenKind::UnknownPrefixLifetime => Class::Lifetime,

View file

@ -183,7 +183,7 @@ impl<'a> Converter<'a> {
rustc_lexer::TokenKind::Ident => { rustc_lexer::TokenKind::Ident => {
SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT) SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
} }
rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => { rustc_lexer::TokenKind::InvalidIdent => {
err = "Ident contains invalid characters"; err = "Ident contains invalid characters";
IDENT IDENT
} }