From 3520bba13690e843704e77bd2ada89131b81051e Mon Sep 17 00:00:00 2001 From: clubby789 Date: Thu, 19 Jan 2023 02:24:51 +0000 Subject: [PATCH 1/2] Use strings for homoglyph replacements --- .../rustc_parse/src/lexer/unicode_chars.rs | 592 +++++++++--------- 1 file changed, 296 insertions(+), 296 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs index 65479b341d7..2332216f09b 100644 --- a/compiler/rustc_parse/src/lexer/unicode_chars.rs +++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs @@ -7,329 +7,329 @@ use rustc_errors::{Applicability, Diagnostic}; use rustc_span::{symbol::kw, BytePos, Pos, Span}; #[rustfmt::skip] // for line breaks -pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[ - ('
', "Line Separator", ' '), - ('
', "Paragraph Separator", ' '), - (' ', "Ogham Space mark", ' '), - (' ', "En Quad", ' '), - (' ', "Em Quad", ' '), - (' ', "En Space", ' '), - (' ', "Em Space", ' '), - (' ', "Three-Per-Em Space", ' '), - (' ', "Four-Per-Em Space", ' '), - (' ', "Six-Per-Em Space", ' '), - (' ', "Punctuation Space", ' '), - (' ', "Thin Space", ' '), - (' ', "Hair Space", ' '), - (' ', "Medium Mathematical Space", ' '), - (' ', "No-Break Space", ' '), - (' ', "Figure Space", ' '), - (' ', "Narrow No-Break Space", ' '), - (' ', "Ideographic Space", ' '), +pub(crate) const UNICODE_ARRAY: &[(char, &str, &str)] = &[ + ('
', "Line Separator", " "), + ('
', "Paragraph Separator", " "), + (' ', "Ogham Space mark", " "), + (' ', "En Quad", " "), + (' ', "Em Quad", " "), + (' ', "En Space", " "), + (' ', "Em Space", " "), + (' ', "Three-Per-Em Space", " "), + (' ', "Four-Per-Em Space", " "), + (' ', "Six-Per-Em Space", " "), + (' ', "Punctuation Space", " "), + (' ', "Thin Space", " "), + (' ', "Hair Space", " "), + (' ', "Medium Mathematical Space", " "), + (' ', "No-Break Space", " "), + (' ', "Figure Space", " "), + (' ', "Narrow No-Break Space", " "), + (' ', "Ideographic Space", " "), - ('ߺ', "Nko Lajanyalan", '_'), - ('﹍', "Dashed Low Line", '_'), - ('﹎', "Centreline Low Line", '_'), - ('﹏', "Wavy Low Line", '_'), - ('_', "Fullwidth Low Line", '_'), + ('ߺ', "Nko Lajanyalan", "_"), + ('﹍', "Dashed Low Line", "_"), + ('﹎', "Centreline Low Line", "_"), + ('﹏', "Wavy Low Line", "_"), + ('_', "Fullwidth Low Line", "_"), - ('‐', "Hyphen", '-'), - ('‑', "Non-Breaking Hyphen", '-'), - ('‒', "Figure Dash", '-'), - ('–', "En Dash", '-'), - ('—', "Em Dash", '-'), - ('﹘', "Small Em Dash", '-'), - ('۔', "Arabic Full Stop", '-'), - ('⁃', "Hyphen Bullet", '-'), - ('˗', "Modifier Letter Minus Sign", '-'), - ('−', "Minus Sign", '-'), - ('➖', "Heavy Minus Sign", '-'), - ('Ⲻ', "Coptic Letter Dialect-P Ni", '-'), - ('ー', "Katakana-Hiragana Prolonged Sound Mark", '-'), - ('-', "Fullwidth Hyphen-Minus", '-'), - ('―', "Horizontal Bar", '-'), - ('─', "Box Drawings Light Horizontal", '-'), - ('━', "Box Drawings Heavy Horizontal", '-'), - ('㇐', "CJK Stroke H", '-'), - ('ꟷ', "Latin Epigraphic Letter Sideways I", '-'), - ('ᅳ', "Hangul Jungseong Eu", '-'), - ('ㅡ', "Hangul Letter Eu", '-'), - ('一', "CJK Unified Ideograph-4E00", '-'), - ('⼀', "Kangxi Radical One", '-'), + ('‐', "Hyphen", "-"), + ('‑', "Non-Breaking Hyphen", "-"), + ('‒', "Figure Dash", "-"), + ('–', "En Dash", "-"), + ('—', "Em Dash", "-"), + ('﹘', "Small Em Dash", "-"), + ('۔', "Arabic Full Stop", "-"), + ('⁃', "Hyphen Bullet", "-"), + ('˗', "Modifier Letter Minus Sign", "-"), + ('−', "Minus Sign", "-"), + ('➖', "Heavy Minus Sign", "-"), + ('Ⲻ', "Coptic Letter Dialect-P Ni", "-"), + ('ー', "Katakana-Hiragana Prolonged Sound Mark", "-"), + ('-', "Fullwidth Hyphen-Minus", "-"), + ('―', "Horizontal Bar", "-"), + ('─', "Box Drawings Light Horizontal", "-"), + ('━', "Box Drawings Heavy Horizontal", "-"), + ('㇐', "CJK Stroke H", "-"), + ('ꟷ', "Latin Epigraphic Letter Sideways I", "-"), + ('ᅳ', "Hangul Jungseong Eu", "-"), + ('ㅡ', "Hangul Letter Eu", "-"), + ('一', "CJK Unified Ideograph-4E00", "-"), + ('⼀', "Kangxi Radical One", "-"), - ('؍', "Arabic Date Separator", ','), - ('٫', "Arabic Decimal Separator", ','), - ('‚', "Single Low-9 Quotation Mark", ','), - ('¸', "Cedilla", ','), - ('ꓹ', "Lisu Letter Tone Na Po", ','), - (',', "Fullwidth Comma", ','), + ('؍', "Arabic Date Separator", ","), + ('٫', "Arabic Decimal Separator", ","), + ('‚', "Single Low-9 Quotation Mark", ","), + ('¸', "Cedilla", ","), + ('ꓹ', "Lisu Letter Tone Na Po", ","), + (',', "Fullwidth Comma", ","), - (';', "Greek Question Mark", ';'), - (';', "Fullwidth Semicolon", ';'), - ('︔', "Presentation Form For Vertical Semicolon", ';'), + (';', "Greek Question Mark", ";"), + (';', "Fullwidth Semicolon", ";"), + ('︔', "Presentation Form For Vertical Semicolon", ";"), - ('ः', "Devanagari Sign Visarga", ':'), - ('ઃ', "Gujarati Sign Visarga", ':'), - (':', "Fullwidth Colon", ':'), - ('։', "Armenian Full Stop", ':'), - ('܃', "Syriac Supralinear Colon", ':'), - ('܄', "Syriac Sublinear Colon", ':'), - ('᛬', "Runic Multiple Punctuation", ':'), - ('︰', "Presentation Form For Vertical Two Dot Leader", ':'), - ('᠃', "Mongolian Full Stop", ':'), - ('᠉', "Mongolian Manchu Full Stop", ':'), - ('⁚', "Two Dot Punctuation", ':'), - ('׃', "Hebrew Punctuation Sof Pasuq", ':'), - ('˸', "Modifier Letter Raised Colon", ':'), - ('꞉', "Modifier Letter Colon", ':'), - ('∶', "Ratio", ':'), - ('ː', "Modifier Letter Triangular Colon", ':'), - ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'), - ('︓', "Presentation Form For Vertical Colon", ':'), + ('ः', "Devanagari Sign Visarga", ":"), + ('ઃ', "Gujarati Sign Visarga", ":"), + (':', "Fullwidth Colon", ":"), + ('։', "Armenian Full Stop", ":"), + ('܃', "Syriac Supralinear Colon", ":"), + ('܄', "Syriac Sublinear Colon", ":"), + ('᛬', "Runic Multiple Punctuation", ":"), + ('︰', "Presentation Form For Vertical Two Dot Leader", ":"), + ('᠃', "Mongolian Full Stop", ":"), + ('᠉', "Mongolian Manchu Full Stop", ":"), + ('⁚', "Two Dot Punctuation", ":"), + ('׃', "Hebrew Punctuation Sof Pasuq", ":"), + ('˸', "Modifier Letter Raised Colon", ":"), + ('꞉', "Modifier Letter Colon", ":"), + ('∶', "Ratio", ":"), + ('ː', "Modifier Letter Triangular Colon", ":"), + ('ꓽ', "Lisu Letter Tone Mya Jeu", ":"), + ('︓', "Presentation Form For Vertical Colon", ":"), - ('!', "Fullwidth Exclamation Mark", '!'), - ('ǃ', "Latin Letter Retroflex Click", '!'), - ('ⵑ', "Tifinagh Letter Tuareg Yang", '!'), - ('︕', "Presentation Form For Vertical Exclamation Mark", '!'), + ('!', "Fullwidth Exclamation Mark", "!"), + ('ǃ', "Latin Letter Retroflex Click", "!"), + ('ⵑ', "Tifinagh Letter Tuareg Yang", "!"), + ('︕', "Presentation Form For Vertical Exclamation Mark", "!"), - ('ʔ', "Latin Letter Glottal Stop", '?'), - ('Ɂ', "Latin Capital Letter Glottal Stop", '?'), - ('ॽ', "Devanagari Letter Glottal Stop", '?'), - ('Ꭾ', "Cherokee Letter He", '?'), - ('ꛫ', "Bamum Letter Ntuu", '?'), - ('?', "Fullwidth Question Mark", '?'), - ('︖', "Presentation Form For Vertical Question Mark", '?'), + ('ʔ', "Latin Letter Glottal Stop", "?"), + ('Ɂ', "Latin Capital Letter Glottal Stop", "?"), + ('ॽ', "Devanagari Letter Glottal Stop", "?"), + ('Ꭾ', "Cherokee Letter He", "?"), + ('ꛫ', "Bamum Letter Ntuu", "?"), + ('?', "Fullwidth Question Mark", "?"), + ('︖', "Presentation Form For Vertical Question Mark", "?"), - ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'), - ('․', "One Dot Leader", '.'), - ('܁', "Syriac Supralinear Full Stop", '.'), - ('܂', "Syriac Sublinear Full Stop", '.'), - ('꘎', "Vai Full Stop", '.'), - ('𐩐', "Kharoshthi Punctuation Dot", '.'), - ('٠', "Arabic-Indic Digit Zero", '.'), - ('۰', "Extended Arabic-Indic Digit Zero", '.'), - ('ꓸ', "Lisu Letter Tone Mya Ti", '.'), - ('·', "Middle Dot", '.'), - ('・', "Katakana Middle Dot", '.'), - ('・', "Halfwidth Katakana Middle Dot", '.'), - ('᛫', "Runic Single Punctuation", '.'), - ('·', "Greek Ano Teleia", '.'), - ('⸱', "Word Separator Middle Dot", '.'), - ('𐄁', "Aegean Word Separator Dot", '.'), - ('•', "Bullet", '.'), - ('‧', "Hyphenation Point", '.'), - ('∙', "Bullet Operator", '.'), - ('⋅', "Dot Operator", '.'), - ('ꞏ', "Latin Letter Sinological Dot", '.'), - ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'), - ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'), - ('.', "Fullwidth Full Stop", '.'), - ('。', "Ideographic Full Stop", '.'), - ('︒', "Presentation Form For Vertical Ideographic Full Stop", '.'), + ('𝅭', "Musical Symbol Combining Augmentation Dot", "."), + ('․', "One Dot Leader", "."), + ('܁', "Syriac Supralinear Full Stop", "."), + ('܂', "Syriac Sublinear Full Stop", "."), + ('꘎', "Vai Full Stop", "."), + ('𐩐', "Kharoshthi Punctuation Dot", "."), + ('٠', "Arabic-Indic Digit Zero", "."), + ('۰', "Extended Arabic-Indic Digit Zero", "."), + ('ꓸ', "Lisu Letter Tone Mya Ti", "."), + ('·', "Middle Dot", "."), + ('・', "Katakana Middle Dot", "."), + ('・', "Halfwidth Katakana Middle Dot", "."), + ('᛫', "Runic Single Punctuation", "."), + ('·', "Greek Ano Teleia", "."), + ('⸱', "Word Separator Middle Dot", "."), + ('𐄁', "Aegean Word Separator Dot", "."), + ('•', "Bullet", "."), + ('‧', "Hyphenation Point", "."), + ('∙', "Bullet Operator", "."), + ('⋅', "Dot Operator", "."), + ('ꞏ', "Latin Letter Sinological Dot", "."), + ('ᐧ', "Canadian Syllabics Final Middle Dot", "."), + ('ᐧ', "Canadian Syllabics Final Middle Dot", "."), + ('.', "Fullwidth Full Stop", "."), + ('。', "Ideographic Full Stop", "."), + ('︒', "Presentation Form For Vertical Ideographic Full Stop", "."), - ('՝', "Armenian Comma", '\''), - (''', "Fullwidth Apostrophe", '\''), - ('‘', "Left Single Quotation Mark", '\''), - ('’', "Right Single Quotation Mark", '\''), - ('‛', "Single High-Reversed-9 Quotation Mark", '\''), - ('′', "Prime", '\''), - ('‵', "Reversed Prime", '\''), - ('՚', "Armenian Apostrophe", '\''), - ('׳', "Hebrew Punctuation Geresh", '\''), - ('`', "Grave Accent", '\''), - ('`', "Greek Varia", '\''), - ('`', "Fullwidth Grave Accent", '\''), - ('´', "Acute Accent", '\''), - ('΄', "Greek Tonos", '\''), - ('´', "Greek Oxia", '\''), - ('᾽', "Greek Koronis", '\''), - ('᾿', "Greek Psili", '\''), - ('῾', "Greek Dasia", '\''), - ('ʹ', "Modifier Letter Prime", '\''), - ('ʹ', "Greek Numeral Sign", '\''), - ('ˈ', "Modifier Letter Vertical Line", '\''), - ('ˊ', "Modifier Letter Acute Accent", '\''), - ('ˋ', "Modifier Letter Grave Accent", '\''), - ('˴', "Modifier Letter Middle Grave Accent", '\''), - ('ʻ', "Modifier Letter Turned Comma", '\''), - ('ʽ', "Modifier Letter Reversed Comma", '\''), - ('ʼ', "Modifier Letter Apostrophe", '\''), - ('ʾ', "Modifier Letter Right Half Ring", '\''), - ('ꞌ', "Latin Small Letter Saltillo", '\''), - ('י', "Hebrew Letter Yod", '\''), - ('ߴ', "Nko High Tone Apostrophe", '\''), - ('ߵ', "Nko Low Tone Apostrophe", '\''), - ('ᑊ', "Canadian Syllabics West-Cree P", '\''), - ('ᛌ', "Runic Letter Short-Twig-Sol S", '\''), - ('𖽑', "Miao Sign Aspiration", '\''), - ('𖽒', "Miao Sign Reformed Voicing", '\''), + ('՝', "Armenian Comma", "\'"), + (''', "Fullwidth Apostrophe", "\'"), + ('‘', "Left Single Quotation Mark", "\'"), + ('’', "Right Single Quotation Mark", "\'"), + ('‛', "Single High-Reversed-9 Quotation Mark", "\'"), + ('′', "Prime", "\'"), + ('‵', "Reversed Prime", "\'"), + ('՚', "Armenian Apostrophe", "\'"), + ('׳', "Hebrew Punctuation Geresh", "\'"), + ('`', "Grave Accent", "\'"), + ('`', "Greek Varia", "\'"), + ('`', "Fullwidth Grave Accent", "\'"), + ('´', "Acute Accent", "\'"), + ('΄', "Greek Tonos", "\'"), + ('´', "Greek Oxia", "\'"), + ('᾽', "Greek Koronis", "\'"), + ('᾿', "Greek Psili", "\'"), + ('῾', "Greek Dasia", "\'"), + ('ʹ', "Modifier Letter Prime", "\'"), + ('ʹ', "Greek Numeral Sign", "\'"), + ('ˈ', "Modifier Letter Vertical Line", "\'"), + ('ˊ', "Modifier Letter Acute Accent", "\'"), + ('ˋ', "Modifier Letter Grave Accent", "\'"), + ('˴', "Modifier Letter Middle Grave Accent", "\'"), + ('ʻ', "Modifier Letter Turned Comma", "\'"), + ('ʽ', "Modifier Letter Reversed Comma", "\'"), + ('ʼ', "Modifier Letter Apostrophe", "\'"), + ('ʾ', "Modifier Letter Right Half Ring", "\'"), + ('ꞌ', "Latin Small Letter Saltillo", "\'"), + ('י', "Hebrew Letter Yod", "\'"), + ('ߴ', "Nko High Tone Apostrophe", "\'"), + ('ߵ', "Nko Low Tone Apostrophe", "\'"), + ('ᑊ', "Canadian Syllabics West-Cree P", "\'"), + ('ᛌ', "Runic Letter Short-Twig-Sol S", "\'"), + ('𖽑', "Miao Sign Aspiration", "\'"), + ('𖽒', "Miao Sign Reformed Voicing", "\'"), - ('᳓', "Vedic Sign Nihshvasa", '"'), - ('"', "Fullwidth Quotation Mark", '"'), - ('“', "Left Double Quotation Mark", '"'), - ('”', "Right Double Quotation Mark", '"'), - ('‟', "Double High-Reversed-9 Quotation Mark", '"'), - ('″', "Double Prime", '"'), - ('‶', "Reversed Double Prime", '"'), - ('〃', "Ditto Mark", '"'), - ('״', "Hebrew Punctuation Gershayim", '"'), - ('˝', "Double Acute Accent", '"'), - ('ʺ', "Modifier Letter Double Prime", '"'), - ('˶', "Modifier Letter Middle Double Acute Accent", '"'), - ('˵', "Modifier Letter Middle Double Grave Accent", '"'), - ('ˮ', "Modifier Letter Double Apostrophe", '"'), - ('ײ', "Hebrew Ligature Yiddish Double Yod", '"'), - ('❞', "Heavy Double Comma Quotation Mark Ornament", '"'), - ('❝', "Heavy Double Turned Comma Quotation Mark Ornament", '"'), + ('᳓', "Vedic Sign Nihshvasa", "\""), + ('"', "Fullwidth Quotation Mark", "\""), + ('“', "Left Double Quotation Mark", "\""), + ('”', "Right Double Quotation Mark", "\""), + ('‟', "Double High-Reversed-9 Quotation Mark", "\""), + ('″', "Double Prime", "\""), + ('‶', "Reversed Double Prime", "\""), + ('〃', "Ditto Mark", "\""), + ('״', "Hebrew Punctuation Gershayim", "\""), + ('˝', "Double Acute Accent", "\""), + ('ʺ', "Modifier Letter Double Prime", "\""), + ('˶', "Modifier Letter Middle Double Acute Accent", "\""), + ('˵', "Modifier Letter Middle Double Grave Accent", "\""), + ('ˮ', "Modifier Letter Double Apostrophe", "\""), + ('ײ', "Hebrew Ligature Yiddish Double Yod", "\""), + ('❞', "Heavy Double Comma Quotation Mark Ornament", "\""), + ('❝', "Heavy Double Turned Comma Quotation Mark Ornament", "\""), - ('(', "Fullwidth Left Parenthesis", '('), - ('❨', "Medium Left Parenthesis Ornament", '('), - ('﴾', "Ornate Left Parenthesis", '('), + ('(', "Fullwidth Left Parenthesis", "("), + ('❨', "Medium Left Parenthesis Ornament", "("), + ('﴾', "Ornate Left Parenthesis", "("), - (')', "Fullwidth Right Parenthesis", ')'), - ('❩', "Medium Right Parenthesis Ornament", ')'), - ('﴿', "Ornate Right Parenthesis", ')'), + (')', "Fullwidth Right Parenthesis", ")"), + ('❩', "Medium Right Parenthesis Ornament", ")"), + ('﴿', "Ornate Right Parenthesis", ")"), - ('[', "Fullwidth Left Square Bracket", '['), - ('❲', "Light Left Tortoise Shell Bracket Ornament", '['), - ('「', "Left Corner Bracket", '['), - ('『', "Left White Corner Bracket", '['), - ('【', "Left Black Lenticular Bracket", '['), - ('〔', "Left Tortoise Shell Bracket", '['), - ('〖', "Left White Lenticular Bracket", '['), - ('〘', "Left White Tortoise Shell Bracket", '['), - ('〚', "Left White Square Bracket", '['), + ('[', "Fullwidth Left Square Bracket", "["), + ('❲', "Light Left Tortoise Shell Bracket Ornament", "["), + ('「', "Left Corner Bracket", "["), + ('『', "Left White Corner Bracket", "["), + ('【', "Left Black Lenticular Bracket", "["), + ('〔', "Left Tortoise Shell Bracket", "["), + ('〖', "Left White Lenticular Bracket", "["), + ('〘', "Left White Tortoise Shell Bracket", "["), + ('〚', "Left White Square Bracket", "["), - (']', "Fullwidth Right Square Bracket", ']'), - ('❳', "Light Right Tortoise Shell Bracket Ornament", ']'), - ('」', "Right Corner Bracket", ']'), - ('』', "Right White Corner Bracket", ']'), - ('】', "Right Black Lenticular Bracket", ']'), - ('〕', "Right Tortoise Shell Bracket", ']'), - ('〗', "Right White Lenticular Bracket", ']'), - ('〙', "Right White Tortoise Shell Bracket", ']'), - ('〛', "Right White Square Bracket", ']'), + (']', "Fullwidth Right Square Bracket", "]"), + ('❳', "Light Right Tortoise Shell Bracket Ornament", "]"), + ('」', "Right Corner Bracket", "]"), + ('』', "Right White Corner Bracket", "]"), + ('】', "Right Black Lenticular Bracket", "]"), + ('〕', "Right Tortoise Shell Bracket", "]"), + ('〗', "Right White Lenticular Bracket", "]"), + ('〙', "Right White Tortoise Shell Bracket", "]"), + ('〛', "Right White Square Bracket", "]"), - ('❴', "Medium Left Curly Bracket Ornament", '{'), - ('𝄔', "Musical Symbol Brace", '{'), - ('{', "Fullwidth Left Curly Bracket", '{'), + ('❴', "Medium Left Curly Bracket Ornament", "{"), + ('𝄔', "Musical Symbol Brace", "{"), + ('{', "Fullwidth Left Curly Bracket", "{"), - ('❵', "Medium Right Curly Bracket Ornament", '}'), - ('}', "Fullwidth Right Curly Bracket", '}'), + ('❵', "Medium Right Curly Bracket Ornament", "}"), + ('}', "Fullwidth Right Curly Bracket", "}"), - ('⁎', "Low Asterisk", '*'), - ('٭', "Arabic Five Pointed Star", '*'), - ('∗', "Asterisk Operator", '*'), - ('𐌟', "Old Italic Letter Ess", '*'), - ('*', "Fullwidth Asterisk", '*'), + ('⁎', "Low Asterisk", "*"), + ('٭', "Arabic Five Pointed Star", "*"), + ('∗', "Asterisk Operator", "*"), + ('𐌟', "Old Italic Letter Ess", "*"), + ('*', "Fullwidth Asterisk", "*"), - ('᜵', "Philippine Single Punctuation", '/'), - ('⁁', "Caret Insertion Point", '/'), - ('∕', "Division Slash", '/'), - ('⁄', "Fraction Slash", '/'), - ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'), - ('⟋', "Mathematical Rising Diagonal", '/'), - ('⧸', "Big Solidus", '/'), - ('𝈺', "Greek Instrumental Notation Symbol-47", '/'), - ('㇓', "CJK Stroke Sp", '/'), - ('〳', "Vertical Kana Repeat Mark Upper Half", '/'), - ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'), - ('ノ', "Katakana Letter No", '/'), - ('丿', "CJK Unified Ideograph-4E3F", '/'), - ('⼃', "Kangxi Radical Slash", '/'), - ('/', "Fullwidth Solidus", '/'), + ('᜵', "Philippine Single Punctuation", "/"), + ('⁁', "Caret Insertion Point", "/"), + ('∕', "Division Slash", "/"), + ('⁄', "Fraction Slash", "/"), + ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", "/"), + ('⟋', "Mathematical Rising Diagonal", "/"), + ('⧸', "Big Solidus", "/"), + ('𝈺', "Greek Instrumental Notation Symbol-47", "/"), + ('㇓', "CJK Stroke Sp", "/"), + ('〳', "Vertical Kana Repeat Mark Upper Half", "/"), + ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", "/"), + ('ノ', "Katakana Letter No", "/"), + ('丿', "CJK Unified Ideograph-4E3F", "/"), + ('⼃', "Kangxi Radical Slash", "/"), + ('/', "Fullwidth Solidus", "/"), - ('\', "Fullwidth Reverse Solidus", '\\'), - ('﹨', "Small Reverse Solidus", '\\'), - ('∖', "Set Minus", '\\'), - ('⟍', "Mathematical Falling Diagonal", '\\'), - ('⧵', "Reverse Solidus Operator", '\\'), - ('⧹', "Big Reverse Solidus", '\\'), - ('⧹', "Greek Vocal Notation Symbol-16", '\\'), - ('⧹', "Greek Instrumental Symbol-48", '\\'), - ('㇔', "CJK Stroke D", '\\'), - ('丶', "CJK Unified Ideograph-4E36", '\\'), - ('⼂', "Kangxi Radical Dot", '\\'), - ('、', "Ideographic Comma", '\\'), - ('ヽ', "Katakana Iteration Mark", '\\'), + ('\', "Fullwidth Reverse Solidus", "\\"), + ('﹨', "Small Reverse Solidus", "\\"), + ('∖', "Set Minus", "\\"), + ('⟍', "Mathematical Falling Diagonal", "\\"), + ('⧵', "Reverse Solidus Operator", "\\"), + ('⧹', "Big Reverse Solidus", "\\"), + ('⧹', "Greek Vocal Notation Symbol-16", "\\"), + ('⧹', "Greek Instrumental Symbol-48", "\\"), + ('㇔', "CJK Stroke D", "\\"), + ('丶', "CJK Unified Ideograph-4E36", "\\"), + ('⼂', "Kangxi Radical Dot", "\\"), + ('、', "Ideographic Comma", "\\"), + ('ヽ', "Katakana Iteration Mark", "\\"), - ('ꝸ', "Latin Small Letter Um", '&'), - ('&', "Fullwidth Ampersand", '&'), + ('ꝸ', "Latin Small Letter Um", "&"), + ('&', "Fullwidth Ampersand", "&"), - ('᛭', "Runic Cross Punctuation", '+'), - ('➕', "Heavy Plus Sign", '+'), - ('𐊛', "Lycian Letter H", '+'), - ('﬩', "Hebrew Letter Alternative Plus Sign", '+'), - ('+', "Fullwidth Plus Sign", '+'), + ('᛭', "Runic Cross Punctuation", "+"), + ('➕', "Heavy Plus Sign", "+"), + ('𐊛', "Lycian Letter H", "+"), + ('﬩', "Hebrew Letter Alternative Plus Sign", "+"), + ('+', "Fullwidth Plus Sign", "+"), - ('‹', "Single Left-Pointing Angle Quotation Mark", '<'), - ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'), - ('˂', "Modifier Letter Left Arrowhead", '<'), - ('𝈶', "Greek Instrumental Symbol-40", '<'), - ('ᐸ', "Canadian Syllabics Pa", '<'), - ('ᚲ', "Runic Letter Kauna", '<'), - ('❬', "Medium Left-Pointing Angle Bracket Ornament", '<'), - ('⟨', "Mathematical Left Angle Bracket", '<'), - ('〈', "Left-Pointing Angle Bracket", '<'), - ('〈', "Left Angle Bracket", '<'), - ('㇛', "CJK Stroke Pd", '<'), - ('く', "Hiragana Letter Ku", '<'), - ('𡿨', "CJK Unified Ideograph-21FE8", '<'), - ('《', "Left Double Angle Bracket", '<'), - ('<', "Fullwidth Less-Than Sign", '<'), + ('‹', "Single Left-Pointing Angle Quotation Mark", "<"), + ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", "<"), + ('˂', "Modifier Letter Left Arrowhead", "<"), + ('𝈶', "Greek Instrumental Symbol-40", "<"), + ('ᐸ', "Canadian Syllabics Pa", "<"), + ('ᚲ', "Runic Letter Kauna", "<"), + ('❬', "Medium Left-Pointing Angle Bracket Ornament", "<"), + ('⟨', "Mathematical Left Angle Bracket", "<"), + ('〈', "Left-Pointing Angle Bracket", "<"), + ('〈', "Left Angle Bracket", "<"), + ('㇛', "CJK Stroke Pd", "<"), + ('く', "Hiragana Letter Ku", "<"), + ('𡿨', "CJK Unified Ideograph-21FE8", "<"), + ('《', "Left Double Angle Bracket", "<"), + ('<', "Fullwidth Less-Than Sign", "<"), - ('᐀', "Canadian Syllabics Hyphen", '='), - ('⹀', "Double Hyphen", '='), - ('゠', "Katakana-Hiragana Double Hyphen", '='), - ('꓿', "Lisu Punctuation Full Stop", '='), - ('=', "Fullwidth Equals Sign", '='), + ('᐀', "Canadian Syllabics Hyphen", "="), + ('⹀', "Double Hyphen", "="), + ('゠', "Katakana-Hiragana Double Hyphen", "="), + ('꓿', "Lisu Punctuation Full Stop", "="), + ('=', "Fullwidth Equals Sign", "="), - ('›', "Single Right-Pointing Angle Quotation Mark", '>'), - ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'), - ('˃', "Modifier Letter Right Arrowhead", '>'), - ('𝈷', "Greek Instrumental Symbol-42", '>'), - ('ᐳ', "Canadian Syllabics Po", '>'), - ('𖼿', "Miao Letter Archaic Zza", '>'), - ('❭', "Medium Right-Pointing Angle Bracket Ornament", '>'), - ('⟩', "Mathematical Right Angle Bracket", '>'), - ('〉', "Right-Pointing Angle Bracket", '>'), - ('〉', "Right Angle Bracket", '>'), - ('》', "Right Double Angle Bracket", '>'), - ('>', "Fullwidth Greater-Than Sign", '>'), + ('›', "Single Right-Pointing Angle Quotation Mark", ">"), + ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", ">"), + ('˃', "Modifier Letter Right Arrowhead", ">"), + ('𝈷', "Greek Instrumental Symbol-42", ">"), + ('ᐳ', "Canadian Syllabics Po", ">"), + ('𖼿', "Miao Letter Archaic Zza", ">"), + ('❭', "Medium Right-Pointing Angle Bracket Ornament", ">"), + ('⟩', "Mathematical Right Angle Bracket", ">"), + ('〉', "Right-Pointing Angle Bracket", ">"), + ('〉', "Right Angle Bracket", ">"), + ('》', "Right Double Angle Bracket", ">"), + ('>', "Fullwidth Greater-Than Sign", ">"), ]; // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, instead of // keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`. // However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add // fancier error recovery to it, as there will be less overall work to do this way. -const ASCII_ARRAY: &[(char, &str, Option)] = &[ - (' ', "Space", None), - ('_', "Underscore", Some(token::Ident(kw::Underscore, false))), - ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))), - (',', "Comma", Some(token::Comma)), - (';', "Semicolon", Some(token::Semi)), - (':', "Colon", Some(token::Colon)), - ('!', "Exclamation Mark", Some(token::Not)), - ('?', "Question Mark", Some(token::Question)), - ('.', "Period", Some(token::Dot)), - ('(', "Left Parenthesis", Some(token::OpenDelim(Delimiter::Parenthesis))), - (')', "Right Parenthesis", Some(token::CloseDelim(Delimiter::Parenthesis))), - ('[', "Left Square Bracket", Some(token::OpenDelim(Delimiter::Bracket))), - (']', "Right Square Bracket", Some(token::CloseDelim(Delimiter::Bracket))), - ('{', "Left Curly Brace", Some(token::OpenDelim(Delimiter::Brace))), - ('}', "Right Curly Brace", Some(token::CloseDelim(Delimiter::Brace))), - ('*', "Asterisk", Some(token::BinOp(token::Star))), - ('/', "Slash", Some(token::BinOp(token::Slash))), - ('\\', "Backslash", None), - ('&', "Ampersand", Some(token::BinOp(token::And))), - ('+', "Plus Sign", Some(token::BinOp(token::Plus))), - ('<', "Less-Than Sign", Some(token::Lt)), - ('=', "Equals Sign", Some(token::Eq)), - ('>', "Greater-Than Sign", Some(token::Gt)), +const ASCII_ARRAY: &[(&str, &str, Option)] = &[ + (" ", "Space", None), + ("_", "Underscore", Some(token::Ident(kw::Underscore, false))), + ("-", "Minus/Hyphen", Some(token::BinOp(token::Minus))), + (",", "Comma", Some(token::Comma)), + (";", "Semicolon", Some(token::Semi)), + (":", "Colon", Some(token::Colon)), + ("!", "Exclamation Mark", Some(token::Not)), + ("?", "Question Mark", Some(token::Question)), + (".", "Period", Some(token::Dot)), + ("(", "Left Parenthesis", Some(token::OpenDelim(Delimiter::Parenthesis))), + (")", "Right Parenthesis", Some(token::CloseDelim(Delimiter::Parenthesis))), + ("[", "Left Square Bracket", Some(token::OpenDelim(Delimiter::Bracket))), + ("]", "Right Square Bracket", Some(token::CloseDelim(Delimiter::Bracket))), + ("{", "Left Curly Brace", Some(token::OpenDelim(Delimiter::Brace))), + ("}", "Right Curly Brace", Some(token::CloseDelim(Delimiter::Brace))), + ("*", "Asterisk", Some(token::BinOp(token::Star))), + ("/", "Slash", Some(token::BinOp(token::Slash))), + ("\\", "Backslash", None), + ("&", "Ampersand", Some(token::BinOp(token::And))), + ("+", "Plus Sign", Some(token::BinOp(token::Plus))), + ("<", "Less-Than Sign", Some(token::Lt)), + ("=", "Equals Sign", Some(token::Eq)), + (">", "Greater-Than Sign", Some(token::Gt)), // FIXME: Literals are already lexed by this point, so we can't recover gracefully just by // spitting the correct token out. - ('\'', "Single Quote", None), - ('"', "Quotation Mark", None), + ("\'", "Single Quote", None), + ("\"", "Quotation Mark", None), ]; pub(super) fn check_for_substitution<'a>( @@ -339,11 +339,11 @@ pub(super) fn check_for_substitution<'a>( err: &mut Diagnostic, count: usize, ) -> Option { - let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?; + let &(_, u_name, ascii_str) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?; let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count)); - let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else { + let Some((_, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(s, _, _)| s == ascii_str) else { let msg = format!("substitution character not found for '{}'", ch); reader.sess.span_diagnostic.span_bug_no_panic(span, &msg); return None; @@ -354,7 +354,7 @@ pub(super) fn check_for_substitution<'a>( let msg = format!( "Unicode characters '“' (Left Double Quotation Mark) and \ '”' (Right Double Quotation Mark) look like '{}' ({}), but are not", - ascii_char, ascii_name + ascii_str, ascii_name ); err.span_suggestion( Span::with_root_ctxt( @@ -368,12 +368,12 @@ pub(super) fn check_for_substitution<'a>( } else { let msg = format!( "Unicode character '{}' ({}) looks like '{}' ({}), but it is not", - ch, u_name, ascii_char, ascii_name + ch, u_name, ascii_str, ascii_name ); err.span_suggestion( span, &msg, - ascii_char.to_string().repeat(count), + ascii_str.to_string().repeat(count), Applicability::MaybeIncorrect, ); } From 1487aa9f9d7edbe660272f7adb5e57ef31deab9d Mon Sep 17 00:00:00 2001 From: clubby789 Date: Thu, 19 Jan 2023 02:25:24 +0000 Subject: [PATCH 2/2] Add double-equals homoglyph --- compiler/rustc_parse/src/lexer/unicode_chars.rs | 2 ++ tests/ui/parser/unicode-chars.rs | 3 +++ tests/ui/parser/unicode-chars.stderr | 13 ++++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs index 2332216f09b..34d003ccfa7 100644 --- a/compiler/rustc_parse/src/lexer/unicode_chars.rs +++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs @@ -296,6 +296,7 @@ pub(crate) const UNICODE_ARRAY: &[(char, &str, &str)] = &[ ('〉', "Right Angle Bracket", ">"), ('》', "Right Double Angle Bracket", ">"), ('>', "Fullwidth Greater-Than Sign", ">"), + ('⩵', "Two Consecutive Equals Signs", "==") ]; // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, instead of @@ -325,6 +326,7 @@ const ASCII_ARRAY: &[(&str, &str, Option)] = &[ ("+", "Plus Sign", Some(token::BinOp(token::Plus))), ("<", "Less-Than Sign", Some(token::Lt)), ("=", "Equals Sign", Some(token::Eq)), + ("==", "Double Equals Sign", Some(token::EqEq)), (">", "Greater-Than Sign", Some(token::Gt)), // FIXME: Literals are already lexed by this point, so we can't recover gracefully just by // spitting the correct token out. diff --git a/tests/ui/parser/unicode-chars.rs b/tests/ui/parser/unicode-chars.rs index f0122561f46..cd25c756689 100644 --- a/tests/ui/parser/unicode-chars.rs +++ b/tests/ui/parser/unicode-chars.rs @@ -6,4 +6,7 @@ fn main() { //~^ ERROR unknown start of token: \u{a0} //~^^ NOTE character appears 3 more times //~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not + let _ = 1 ⩵ 2; + //~^ ERROR unknown start of token + //~^^ HELP Unicode character '⩵' (Two Consecutive Equals Signs) looks like '==' (Double Equals Sign), but it is not } diff --git a/tests/ui/parser/unicode-chars.stderr b/tests/ui/parser/unicode-chars.stderr index b1d4a0af711..086de5ec099 100644 --- a/tests/ui/parser/unicode-chars.stderr +++ b/tests/ui/parser/unicode-chars.stderr @@ -21,5 +21,16 @@ help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is LL | let x = 0; | ++++ -error: aborting due to 2 previous errors +error: unknown start of token: \u{2a75} + --> $DIR/unicode-chars.rs:9:15 + | +LL | let _ = 1 ⩵ 2; + | ^ + | +help: Unicode character '⩵' (Two Consecutive Equals Signs) looks like '==' (Double Equals Sign), but it is not + | +LL | let _ = 1 == 2; + | ~~ + +error: aborting due to 3 previous errors