Auto merge of #90559 - rusticstuff:optimize-bidi-detection, r=davidtwco

Optimize bidi character detection. Should fix most of the performance regression of the bidi character detection (#90514), to be confirmed with a perf run.
2021-11-06 16:25:00 +00:00 · 2021-11-06 16:25:00 +00:00 · 5ec7d1dad6
commit 5ec7d1dad6
parent 3326f19e89 39110beab0
5 changed files with 46 additions and 16 deletions
--- a/compiler/rustc_ast/src/lib.rs
+++ b/compiler/rustc_ast/src/lib.rs
@ -16,6 +16,7 @@
 #![feature(nll)]
 #![feature(min_specialization)]
 #![recursion_limit = "256"]
 #![feature(slice_internals)]
 #[macro_use]
 extern crate rustc_macros;
@ -25,6 +26,7 @@ pub mod util {
    pub mod comments;
    pub mod literal;
    pub mod parser;
    pub mod unicode;
 }
 pub mod ast;
--- a/compiler/rustc_ast/src/util/unicode.rs
+++ b/compiler/rustc_ast/src/util/unicode.rs
@ -0,0 +1,35 @@
 /// Unicode bidirectional text-flow control characters.
 ///
 /// The list holds the embedding, override and isolate controls together with their
 /// terminators:
 /// U+202A (LRE), U+202B (RLE), U+202D (LRO), U+202E (RLO), U+2066 (LRI), U+2067 (RLI),
 /// U+2068 (FSI), U+202C (PDF) and U+2069 (PDI).
 pub const TEXT_FLOW_CONTROL_CHARS: &[char] = &[
    '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
    '\u{2069}',
 ];
 /// Returns `true` if `s` contains at least one of the characters listed in
 /// `TEXT_FLOW_CONTROL_CHARS`.
 ///
 /// The check works on the raw UTF-8 bytes instead of decoding `char`s: every character of
 /// interest is encoded as three bytes whose first byte is `0xE2`, so the scan hops from one
 /// `0xE2` byte to the next with `memchr` and only looks at the two bytes following each hit.
 #[inline]
 pub fn contains_text_flow_control_chars(s: &str) -> bool {
    // Code points      -> UTF-8 encoding
    // U+202A..=U+202E  -> E2 80 AA..=AE
    // U+2066..=U+2069  -> E2 81 A6..=A9
    let mut rest = s.as_bytes();
    while let Some(pos) = core::slice::memchr::memchr(0xE2, rest) {
        // `s` is valid UTF-8, so the lead byte E2 is always followed by two continuation
        // bytes; the three-byte window below therefore stays inside `rest`.
        let tail = &rest[pos + 1..pos + 3];
        if matches!(tail, [0x80, 0xAA..=0xAE] | [0x81, 0xA6..=0xA9]) {
            return true;
        }
        // Not a match: resume the search right after this three-byte sequence.
        rest = &rest[pos + 3..];
    }
    false
 }
--- a/compiler/rustc_lint/src/context.rs
+++ b/compiler/rustc_lint/src/context.rs
@ -16,9 +16,9 @@
 use self::TargetLint::*;
 use crate::hidden_unicode_codepoints::UNICODE_TEXT_FLOW_CHARS;
 use crate::levels::{is_known_lint_tool, LintLevelsBuilder};
 use crate::passes::{EarlyLintPassObject, LateLintPassObject};
 use ast::util::unicode::TEXT_FLOW_CONTROL_CHARS;
 use rustc_ast as ast;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::sync;
@ -602,7 +602,7 @@ pub trait LintContext: Sized {
                    let spans: Vec<_> = content
                        .char_indices()
                        .filter_map(|(i, c)| {
-                            UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
+                            TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
                                let lo = span.lo() + BytePos(2 + i as u32);
                                (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
                            })
--- a/compiler/rustc_lint/src/hidden_unicode_codepoints.rs
+++ b/compiler/rustc_lint/src/hidden_unicode_codepoints.rs
@ -1,4 +1,5 @@
 use crate::{EarlyContext, EarlyLintPass, LintContext};
 use ast::util::unicode::{contains_text_flow_control_chars, TEXT_FLOW_CONTROL_CHARS};
 use rustc_ast as ast;
 use rustc_errors::{Applicability, SuggestionStyle};
 use rustc_span::{BytePos, Span, Symbol};
@ -37,11 +38,6 @@ declare_lint! {
 declare_lint_pass!(HiddenUnicodeCodepoints => [TEXT_DIRECTION_CODEPOINT_IN_LITERAL]);
 crate const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
    '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
    '\u{2069}',
 ];
 impl HiddenUnicodeCodepoints {
    fn lint_text_direction_codepoint(
        &self,
@ -57,7 +53,7 @@ impl HiddenUnicodeCodepoints {
            .as_str()
            .char_indices()
            .filter_map(|(i, c)| {
-                UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
+                TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
                    let lo = span.lo() + BytePos(i as u32 + padding);
                    (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
                })
@ -131,7 +127,7 @@ impl HiddenUnicodeCodepoints {
 impl EarlyLintPass for HiddenUnicodeCodepoints {
    fn check_attribute(&mut self, cx: &EarlyContext<'_>, attr: &ast::Attribute) {
        if let ast::AttrKind::DocComment(_, comment) = attr.kind {
-            if comment.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
+            if contains_text_flow_control_chars(&comment.as_str()) {
                self.lint_text_direction_codepoint(cx, comment, attr.span, 0, false, "doc comment");
            }
        }
@ -142,7 +138,7 @@ impl EarlyLintPass for HiddenUnicodeCodepoints {
        let (text, span, padding) = match &expr.kind {
            ast::ExprKind::Lit(ast::Lit { token, kind, span }) => {
                let text = token.symbol;
-                if !text.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
+                if !contains_text_flow_control_chars(&text.as_str()) {
                    return;
                }
                let padding = match kind {
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@ -1,6 +1,7 @@
 use rustc_ast::ast::{self, AttrStyle};
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
 use rustc_ast::tokenstream::{Spacing, TokenStream};
 use rustc_ast::util::unicode::contains_text_flow_control_chars;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError, PResult};
 use rustc_lexer::unescape::{self, Mode};
 use rustc_lexer::{Base, DocStyle, RawStrError};
@ -137,12 +138,8 @@ impl<'a> StringReader<'a> {
        // Opening delimiter of the length 2 is not included into the comment text.
        let content_start = start + BytePos(2);
        let content = self.str_from(content_start);
-        let span = self.mk_sp(start, self.pos);
+        if contains_text_flow_control_chars(content) {
-        const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
+            let span = self.mk_sp(start, self.pos);
            '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}',
            '\u{202C}', '\u{2069}',
        ];
        if content.contains(UNICODE_TEXT_FLOW_CHARS) {
            self.sess.buffer_lint_with_diagnostic(
                &TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
                span,