From 4484085b18df4b10243b503a21602bb71836e8b3 Mon Sep 17 00:00:00 2001 From: Laiho Date: Mon, 9 Sep 2024 00:58:52 +0300 Subject: [PATCH] Add fast path for ascii to ascii in str::replace --- library/alloc/src/str.rs | 25 ++++++++++++++++++++++++- library/alloc/src/string.rs | 7 ++++++- library/core/src/str/pattern.rs | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 32212b61c6e..709d11c52a0 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -19,7 +19,7 @@ pub use core::str::SplitInclusive; pub use core::str::SplitWhitespace; #[stable(feature = "rust1", since = "1.0.0")] pub use core::str::pattern; -use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; +use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher, Utf8Pattern}; #[stable(feature = "rust1", since = "1.0.0")] pub use core::str::{Bytes, CharIndices, Chars, from_utf8, from_utf8_mut}; #[stable(feature = "str_escape", since = "1.34.0")] @@ -268,6 +268,18 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn replace(&self, from: P, to: &str) -> String { + // Fast path for ASCII to ASCII case. + + if let Some(from_byte) = match from.as_utf8_pattern() { + Some(Utf8Pattern::StringPattern([from_byte])) => Some(*from_byte), + Some(Utf8Pattern::CharPattern(c)) => c.as_ascii().map(|ascii_char| ascii_char.to_u8()), + _ => None, + } { + if let [to_byte] = to.as_bytes() { + return unsafe { replace_ascii(self.as_bytes(), from_byte, *to_byte) }; + } + } + let mut result = String::new(); let mut last_end = 0; for (start, part) in self.match_indices(from) { @@ -661,3 +673,14 @@ fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8) -> Vec { out } +#[inline] +#[cfg(not(test))] +#[cfg(not(no_global_oom_handling))] +#[allow(dead_code)] +/// Faster implementation of string replacement for ASCII to ASCII cases. +/// Should produce fast vectorized code. +unsafe fn replace_ascii(utf8_bytes: &[u8], from: u8, to: u8) -> String { + let result: Vec = utf8_bytes.iter().map(|b| if *b == from { to } else { *b }).collect(); + // SAFETY: We replaced ascii with ascii on valid utf8 strings. + unsafe { String::from_utf8_unchecked(result) } +} diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index ee878e879e9..0d1600a28aa 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -53,7 +53,7 @@ use core::ops::AddAssign; #[cfg(not(no_global_oom_handling))] use core::ops::Bound::{Excluded, Included, Unbounded}; use core::ops::{self, Range, RangeBounds}; -use core::str::pattern::Pattern; +use core::str::pattern::{Pattern, Utf8Pattern}; use core::{fmt, hash, ptr, slice}; #[cfg(not(no_global_oom_handling))] @@ -2424,6 +2424,11 @@ impl<'b> Pattern for &'b String { { self[..].strip_suffix_of(haystack) } + + #[inline] + fn as_utf8_pattern(&self) -> Option> { + Some(Utf8Pattern::StringPattern(self.as_bytes())) + } } macro_rules! impl_eq { diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index 9f1294d7606..eb60effe813 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -160,6 +160,19 @@ pub trait Pattern: Sized { None } } + + /// Returns the pattern as utf-8 bytes if possible. + fn as_utf8_pattern(&self) -> Option>; +} +/// Result of calling [`Pattern::as_utf8_pattern()`]. +/// Can be used for inspecting the contents of a [`Pattern`] in cases +/// where the underlying representation can be represented as UTF-8. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum Utf8Pattern<'a> { + /// Type returned by String and str types. + StringPattern(&'a [u8]), + /// Type returned by char types. + CharPattern(char), } // Searcher @@ -599,6 +612,11 @@ impl Pattern for char { { self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack) } + + #[inline] + fn as_utf8_pattern(&self) -> Option> { + Some(Utf8Pattern::CharPattern(*self)) + } } ///////////////////////////////////////////////////////////////////////////// @@ -657,6 +675,11 @@ impl Pattern for MultiCharEqPattern { fn into_searcher(self, haystack: &str) -> MultiCharEqSearcher<'_, C> { MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() } } + + #[inline] + fn as_utf8_pattern(&self) -> Option> { + None + } } unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { @@ -747,6 +770,11 @@ macro_rules! pattern_methods { { ($pmap)(self).strip_suffix_of(haystack) } + + #[inline] + fn as_utf8_pattern(&self) -> Option> { + None + } }; } @@ -1022,6 +1050,11 @@ impl<'b> Pattern for &'b str { None } } + + #[inline] + fn as_utf8_pattern(&self) -> Option> { + Some(Utf8Pattern::StringPattern(self.as_bytes())) + } } /////////////////////////////////////////////////////////////////////////////