Update the list of confusable characters

Also reorder and space the list to make it clearer for future updates and to come closer to the original list. Thanks @est31 for the instructions. Fixes #43629. r? @est31
2017-08-06 17:36:50 +02:00 · 2017-08-06 17:36:50 +02:00 · 4e2ddcb879
commit 4e2ddcb879
parent a9c24fd579
1 changed files with 125 additions and 19 deletions
--- a/src/libsyntax/parse/lexer/unicode_chars.rs
+++ b/src/libsyntax/parse/lexer/unicode_chars.rs
@ -1,4 +1,4 @@
-// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
+// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
@ -9,15 +9,16 @@
 // except according to those terms.

 // Characters and their corresponding confusables were collected from
-// http://www.unicode.org/Public/security/revision-06/confusables.txt
+// http://www.unicode.org/Public/security/10.0.0/confusables.txt

 use syntax_pos::{Span, NO_EXPANSION};
 use errors::DiagnosticBuilder;
 use super::StringReader;

 const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
-    (' ', "No-Break Space", ' '),
-    (' ', "Ogham Space Mark", ' '),
+    (' ', "Line Separator", ' '),
+    (' ', "Paragraph Separator", ' '),
+    (' ', "Ogham Space mark", ' '),
    (' ', "En Quad", ' '),
    (' ', "Em Quad", ' '),
    (' ', "En Space", ' '),
@ -25,39 +26,63 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    (' ', "Three-Per-Em Space", ' '),
    (' ', "Four-Per-Em Space", ' '),
    (' ', "Six-Per-Em Space", ' '),
-    (' ', "Figure Space", ' '),
    (' ', "Punctuation Space", ' '),
    (' ', "Thin Space", ' '),
    (' ', "Hair Space", ' '),
-    (' ', "Narrow No-Break Space", ' '),
    (' ', "Medium Mathematical Space", ' '),
+    (' ', "No-Break Space", ' '),
+    (' ', "Figure Space", ' '),
+    (' ', "Narrow No-Break Space", ' '),
    ('　', "Ideographic Space", ' '),
+
    ('ߺ', "Nko Lajanyalan", '_'),
    ('﹍', "Dashed Low Line", '_'),
    ('﹎', "Centreline Low Line", '_'),
    ('﹏', "Wavy Low Line", '_'),
+    ('＿', "Fullwidth Low Line", '-'),
+
    ('‐', "Hyphen", '-'),
    ('‑', "Non-Breaking Hyphen", '-'),
    ('‒', "Figure Dash", '-'),
    ('–', "En Dash", '-'),
    ('—', "Em Dash", '-'),
    ('﹘', "Small Em Dash", '-'),
+    ('۔', "Arabic Full Stop", '-'),
    ('⁃', "Hyphen Bullet", '-'),
    ('˗', "Modifier Letter Minus Sign", '-'),
    ('−', "Minus Sign", '-'),
+    ('➖', "Heavy Minus Sign", '-'),
+    ('Ⲻ', "Coptic Letter Dialect-P Ni", '-'),
    ('ー', "Katakana-Hiragana Prolonged Sound Mark", '-'),
+    ('－', "Fullwidth Hyphen-Minus", '-'),
+    ('―', "Horizontal Bar", '-'),
+    ('─', "Box Drawings Light Horizontal", '-'),
+    ('━', "Box Drawings Heavy Horizontal", '-'),
+    ('㇐', "CJK Stroke H", '-'),
+    ('ꟷ', "Latin Epigraphic Letter Dideways", '-'),
+    ('ᅳ', "Hangul Jungseong Eu", '-'),
+    ('ㅡ', "Hangul Letter Eu", '-'),
+    ('一', "CJK Unified Ideograph-4E00", '-'),
+    ('⼀', "Kangxi Radical One", '-'),
+
+    ('؍', "Arabic Date Separator", ','),
    ('٫', "Arabic Decimal Separator", ','),
    ('‚', "Single Low-9 Quotation Mark", ','),
+    ('¸', "Cedilla", ','),
    ('ꓹ', "Lisu Letter Tone Na Po", ','),
    ('，', "Fullwidth Comma", ','),
+
    (';', "Greek Question Mark", ';'),
    ('；', "Fullwidth Semicolon", ';'),
+    ('︔', "Presentation Form For Vertical Semicolon", ';'),
+
    ('ः', "Devanagari Sign Visarga", ':'),
    ('ઃ', "Gujarati Sign Visarga", ':'),
    ('：', "Fullwidth Colon", ':'),
    ('։', "Armenian Full Stop", ':'),
    ('܃', "Syriac Supralinear Colon", ':'),
    ('܄', "Syriac Sublinear Colon", ':'),
+    ('᛬', "Runic Multiple Ponctuation", ':'),
    ('︰', "Presentation Form For Vertical Two Dot Leader", ':'),
    ('᠃', "Mongolian Full Stop", ':'),
    ('᠉', "Mongolian Manchu Full Stop", ':'),
@ -68,25 +93,48 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('∶', "Ratio", ':'),
    ('ː', "Modifier Letter Triangular Colon", ':'),
    ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'),
+    ('︓', "Presentation Form For Vertical Colon", ':'),
+
    ('！', "Fullwidth Exclamation Mark", '!'),
    ('ǃ', "Latin Letter Retroflex Click", '!'),
+    ('ⵑ', "Tifinagh Letter Tuareg Yang", '!'),
+    ('︕', "Presentation Form For Vertical Exclamation Mark", '!'),
+
    ('ʔ', "Latin Letter Glottal Stop", '?'),
+    ('Ɂ', "Latin Capital Letter Glottal Stop", '?'),
    ('ॽ', "Devanagari Letter Glottal Stop", '?'),
    ('Ꭾ', "Cherokee Letter He", '?'),
+    ('ꛫ', "Bamum Letter Ntuu", '?'),
    ('？', "Fullwidth Question Mark", '?'),
+    ('︖', "Presentation Form For Vertical Question Mark", '?'),
+
    ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'),
    ('․', "One Dot Leader", '.'),
-    ('۔', "Arabic Full Stop", '.'),
    ('܁', "Syriac Supralinear Full Stop", '.'),
    ('܂', "Syriac Sublinear Full Stop", '.'),
    ('꘎', "Vai Full Stop", '.'),
    ('𐩐', "Kharoshthi Punctuation Dot", '.'),
-    ('·', "Middle Dot", '.'),
    ('٠', "Arabic-Indic Digit Zero", '.'),
    ('۰', "Extended Arabic-Indic Digit Zero", '.'),
    ('ꓸ', "Lisu Letter Tone Mya Ti", '.'),
-    ('。', "Ideographic Full Stop", '.'),
+    ('·', "Middle Dot", '.'),
    ('・', "Katakana Middle Dot", '.'),
+    ('･', "Halfwidth Katakana Middle Dot", '.'),
+    ('᛫', "Runic Single Punctuation", '.'),
+    ('·', "Greek Ano Teleia", '.'),
+    ('⸱', "Word Separator Middle Dot", '.'),
+    ('𐄁', "Aegean Word Separator Dot", '.'),
+    ('•', "Bullet", '.'),
+    ('‧', "Hyphenation Point", '.'),
+    ('∙', "Bullet Operator", '.'),
+    ('⋅', "Dot Operator", '.'),
+    ('ꞏ', "Latin Letter Sinological Dot", '.'),
+    ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'),
+    ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'),
+    ('．', "Fullwidth Full Stop", '.'),
+    ('。', "Ideographic Full Stop", '.'),
+    ('︒', "Presentation Form For Vertical Ideographic Full Stop", '.'),
+
    ('՝', "Armenian Comma", '\''),
    ('＇', "Fullwidth Apostrophe", '\''),
    ('‘', "Left Single Quotation Mark", '\''),
@ -96,8 +144,10 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('‵', "Reversed Prime", '\''),
    ('՚', "Armenian Apostrophe", '\''),
    ('׳', "Hebrew Punctuation Geresh", '\''),
+    ('`', "Greek Accent", '\''),
    ('`', "Greek Varia", '\''),
    ('｀', "Fullwidth Grave Accent", '\''),
+    ('´', "Acute Accent", '\''),
    ('΄', "Greek Tonos", '\''),
    ('´', "Greek Oxia", '\''),
    ('᾽', "Greek Koronis", '\''),
@ -105,6 +155,7 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('῾', "Greek Dasia", '\''),
    ('ʹ', "Modifier Letter Prime", '\''),
    ('ʹ', "Greek Numeral Sign", '\''),
+    ('ˈ', "Modifier Letter Vertical Line", '\''),
    ('ˊ', "Modifier Letter Acute Accent", '\''),
    ('ˋ', "Modifier Letter Grave Accent", '\''),
    ('˴', "Modifier Letter Middle Grave Accent", '\''),
@ -116,6 +167,12 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('י', "Hebrew Letter Yod", '\''),
    ('ߴ', "Nko High Tone Apostrophe", '\''),
    ('ߵ', "Nko Low Tone Apostrophe", '\''),
+    ('ᑊ', "Canadian Syllabics West-Cree P", '\''),
+    ('ᛌ', "Runic Letter Short-Twig-Sol S", '\''),
+    ('𖽑', "Miao Sign Aspiration", '\''),
+    ('𖽒', "Miao Sign Reformed Voicing", '\''),
+
+    ('᳓', "Vedic Sign Nihshvasa", '"'),
    ('＂', "Fullwidth Quotation Mark", '"'),
    ('“', "Left Double Quotation Mark", '"'),
    ('”', "Right Double Quotation Mark", '"'),
@ -132,12 +189,15 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('ײ', "Hebrew Ligature Yiddish Double Yod", '"'),
    ('❞', "Heavy Double Comma Quotation Mark Ornament", '"'),
    ('❝', "Heavy Double Turned Comma Quotation Mark Ornament", '"'),
+
+    ('（', "Fullwidth Left Parenthesis", '('),
    ('❨', "Medium Left Parenthesis Ornament", '('),
    ('﴾', "Ornate Left Parenthesis", '('),
-    ('（', "Fullwidth Left Parenthesis", '('),
+
+    ('）', "Fullwidth Right Parenthesis", ')'),
    ('❩', "Medium Right Parenthesis Ornament", ')'),
    ('﴿', "Ornate Right Parenthesis", ')'),
-    ('）', "Fullwidth Right Parenthesis", ')'),
+
    ('［', "Fullwidth Left Square Bracket", '['),
    ('❲', "Light Left Tortoise Shell Bracket Ornament", '['),
    ('「', "Left Corner Bracket", '['),
@ -147,6 +207,7 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('〖', "Left White Lenticular Bracket", '['),
    ('〘', "Left White Tortoise Shell Bracket", '['),
    ('〚', "Left White Square Bracket", '['),
+
    ('］', "Fullwidth Right Square Bracket", ']'),
    ('❳', "Light Right Tortoise Shell Bracket Ornament", ']'),
    ('」', "Right Corner Bracket", ']'),
@ -156,11 +217,20 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('〗', "Right White Lenticular Bracket", ']'),
    ('〙', "Right White Tortoise Shell Bracket", ']'),
    ('〛', "Right White Square Bracket", ']'),
+
    ('❴', "Medium Left Curly Bracket Ornament", '{'),
+    ('𝄔', "Musical Symbol Brace", '{'),
+    ('｛', "Fullwidth Left Curly Bracket", '{'),
+
    ('❵', "Medium Right Curly Bracket Ornament", '}'),
+    ('｝', "Fullwidth Right Curly Bracket", '}'),
+
    ('⁎', "Low Asterisk", '*'),
    ('٭', "Arabic Five Pointed Star", '*'),
    ('∗', "Asterisk Operator", '*'),
+    ('𐌟', "Old Italic Letter Ess", '*'),
+    ('＊', "Fullwidth Asterisk", '*'),
+
    ('᜵', "Philippine Single Punctuation", '/'),
    ('⁁', "Caret Insertion Point", '/'),
    ('∕', "Division Slash", '/'),
@ -168,37 +238,73 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'),
    ('⟋', "Mathematical Rising Diagonal", '/'),
    ('⧸', "Big Solidus", '/'),
-    ('㇓', "Cjk Stroke Sp", '/'),
+    ('𝈺', "Greek Instrumental Notation Symbol-47", '/'),
+    ('㇓', "CJK Stroke Sp", '/'),
    ('〳', "Vertical Kana Repeat Mark Upper Half", '/'),
-    ('丿', "Cjk Unified Ideograph-4E3F", '/'),
+    ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'),
+    ('ノ', "Katakana Letter No", '/'),
+    ('丿', "CJK Unified Ideograph-4E3F", '/'),
    ('⼃', "Kangxi Radical Slash", '/'),
+    ('／', "Fullwidth Solidus", '/'),
+
    ('＼', "Fullwidth Reverse Solidus", '\\'),
    ('﹨', "Small Reverse Solidus", '\\'),
    ('∖', "Set Minus", '\\'),
    ('⟍', "Mathematical Falling Diagonal", '\\'),
    ('⧵', "Reverse Solidus Operator", '\\'),
    ('⧹', "Big Reverse Solidus", '\\'),
+    ('⧹', "Greek Vocal Notation Symbol-16", '\\'),
+    ('⧹', "Greek Instrumental Symbol-48", '\\'),
+    ('㇔', "CJK Stroke D", '\\'),
+    ('丶', "CJK Unified Ideograph-4E36", '\\'),
+    ('⼂', "Kangxi Radical Dot", '\\'),
    ('、', "Ideographic Comma", '\\'),
    ('ヽ', "Katakana Iteration Mark", '\\'),
-    ('㇔', "Cjk Stroke D", '\\'),
-    ('丶', "Cjk Unified Ideograph-4E36", '\\'),
-    ('⼂', "Kangxi Radical Dot", '\\'),
+
    ('ꝸ', "Latin Small Letter Um", '&'),
+    ('＆', "Fullwidth Ampersand", '&'),
+
+    ('᛭', "Runic Cros Punctuation", '+'),
+    ('➕', "Heavy Plus Sign", '+'),
+    ('𐊛', "Lycian Letter H", '+'),
    ('﬩', "Hebrew Letter Alternative Plus Sign", '+'),
+    ('＋', "Fullwidth Plus Sign", '+'),
+
    ('‹', "Single Left-Pointing Angle Quotation Mark", '<'),
    ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'),
    ('˂', "Modifier Letter Left Arrowhead", '<'),
+    ('𝈶', "Greek Instrumental Symbol-40", '<'),
+    ('ᐸ', "Canadian Syllabics Pa", '<'),
+    ('ᚲ', "Runic Letter Kauna", '<'),
+    ('❬', "Medium Left-Pointing Angle Bracket Ornament", '<'),
+    ('⟨', "Mathematical Left Angle Bracket", '<'),
+    ('〈', "Left-Pointing Angle Bracket", '<'),
    ('〈', "Left Angle Bracket", '<'),
+    ('㇛', "CJK Stroke Pd", '<'),
+    ('く', "Hiragana Letter Ku", '<'),
+    ('𡿨', "CJK Unified Ideograph-21FE8", '<'),
    ('《', "Left Double Angle Bracket", '<'),
+    ('＜', "Fullwidth Less-Than Sign", '<'),
+
+    ('᐀', "Canadian Syllabics Hyphen", '='),
+    ('⹀', "Double Hyphen", '='),
+    ('゠', "Katakana-Hiragana Double Hyphen", '='),
    ('꓿', "Lisu Punctuation Full Stop", '='),
+    ('＝', "Fullwidth Equals Sign", '='),
+
    ('›', "Single Right-Pointing Angle Quotation Mark", '>'),
    ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'),
    ('˃', "Modifier Letter Right Arrowhead", '>'),
+    ('𝈷', "Greek Instrumental Symbol-42", '>'),
+    ('ᐳ', "Canadian Syllabics Po", '>'),
+    ('𖼿', "Miao Letter Archaic Zza", '>'),
+    ('❭', "Medium Right-Pointing Angle Bracket Ornament", '>'),
+    ('⟩', "Mathematical Right Angle Bracket", '>'),
+    ('〉', "Right-Pointing Angle Bracket", '>'),
    ('〉', "Right Angle Bracket", '>'),
    ('》', "Right Double Angle Bracket", '>'),
-    ('Ⲻ', "Coptic Capital Letter Dialect-P Ni", '-'),
-    ('Ɂ', "Latin Capital Letter Glottal Stop", '?'),
-    ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'), ];
+    ('＞', "Fullwidth Greater-Than Sign", '>'), ];
+

 const ASCII_ARRAY: &'static [(char, &'static str)] = &[
    (' ', "Space"),