about summary refs log tree commit diff
diff options
context:
space:
mode:
authorLaiho <emillaiho@hotmail.fi>2024-09-09 00:58:52 +0300
committerLaiho <emillaiho@hotmail.fi>2024-09-23 19:24:06 +0300
commit4484085b18df4b10243b503a21602bb71836e8b3 (patch)
tree640fca292fed985ba7e81a84756bf2f5bb1bbc55
parenta772336fb3fbd1fe4493077fcfe04e0221296a99 (diff)
downloadrust-4484085b18df4b10243b503a21602bb71836e8b3.tar.gz
rust-4484085b18df4b10243b503a21602bb71836e8b3.zip
Add fast path for ascii to ascii in str::replace
-rw-r--r--library/alloc/src/str.rs25
-rw-r--r--library/alloc/src/string.rs7
-rw-r--r--library/core/src/str/pattern.rs33
3 files changed, 63 insertions, 2 deletions
diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs
index 32212b61c6e..709d11c52a0 100644
--- a/library/alloc/src/str.rs
+++ b/library/alloc/src/str.rs
@@ -19,7 +19,7 @@ pub use core::str::SplitInclusive;
 pub use core::str::SplitWhitespace;
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use core::str::pattern;
-use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher};
+use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher, Utf8Pattern};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use core::str::{Bytes, CharIndices, Chars, from_utf8, from_utf8_mut};
 #[stable(feature = "str_escape", since = "1.34.0")]
@@ -268,6 +268,18 @@ impl str {
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
     pub fn replace<P: Pattern>(&self, from: P, to: &str) -> String {
+        // Fast path for ASCII to ASCII case.
+
+        if let Some(from_byte) = match from.as_utf8_pattern() {
+            Some(Utf8Pattern::StringPattern([from_byte])) => Some(*from_byte),
+            Some(Utf8Pattern::CharPattern(c)) => c.as_ascii().map(|ascii_char| ascii_char.to_u8()),
+            _ => None,
+        } {
+            if let [to_byte] = to.as_bytes() {
+                return unsafe { replace_ascii(self.as_bytes(), from_byte, *to_byte) };
+            }
+        }
+
         let mut result = String::new();
         let mut last_end = 0;
         for (start, part) in self.match_indices(from) {
@@ -661,3 +673,14 @@ fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8) -> Vec<u8> {
 
     out
 }
+#[inline]
+#[cfg(not(test))]
+#[cfg(not(no_global_oom_handling))]
+#[allow(dead_code)]
+/// Faster implementation of string replacement for ASCII to ASCII cases.
+/// Should produce fast vectorized code.
+unsafe fn replace_ascii(utf8_bytes: &[u8], from: u8, to: u8) -> String {
+    let result: Vec<u8> = utf8_bytes.iter().map(|b| if *b == from { to } else { *b }).collect();
+    // SAFETY: We replaced ascii with ascii on valid utf8 strings.
+    unsafe { String::from_utf8_unchecked(result) }
+}
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
index ee878e879e9..0d1600a28aa 100644
--- a/library/alloc/src/string.rs
+++ b/library/alloc/src/string.rs
@@ -53,7 +53,7 @@ use core::ops::AddAssign;
 #[cfg(not(no_global_oom_handling))]
 use core::ops::Bound::{Excluded, Included, Unbounded};
 use core::ops::{self, Range, RangeBounds};
-use core::str::pattern::Pattern;
+use core::str::pattern::{Pattern, Utf8Pattern};
 use core::{fmt, hash, ptr, slice};
 
 #[cfg(not(no_global_oom_handling))]
@@ -2424,6 +2424,11 @@ impl<'b> Pattern for &'b String {
     {
         self[..].strip_suffix_of(haystack)
     }
+
+    #[inline]
+    fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
+        Some(Utf8Pattern::StringPattern(self.as_bytes()))
+    }
 }
 
 macro_rules! impl_eq {
diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs
index 9f1294d7606..eb60effe813 100644
--- a/library/core/src/str/pattern.rs
+++ b/library/core/src/str/pattern.rs
@@ -160,6 +160,19 @@ pub trait Pattern: Sized {
             None
         }
     }
+
+    /// Returns the pattern as utf-8 bytes if possible.
+    fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>>;
+}
+/// Result of calling [`Pattern::as_utf8_pattern()`].
+/// Can be used for inspecting the contents of a [`Pattern`] in cases
+/// where the underlying representation can be represented as UTF-8.
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
+pub enum Utf8Pattern<'a> {
+    /// Type returned by String and str types.
+    StringPattern(&'a [u8]),
+    /// Type returned by char types.
+    CharPattern(char),
 }
 
 // Searcher
@@ -599,6 +612,11 @@ impl Pattern for char {
     {
         self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack)
     }
+
+    #[inline]
+    fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
+        Some(Utf8Pattern::CharPattern(*self))
+    }
 }
 
 /////////////////////////////////////////////////////////////////////////////
@@ -657,6 +675,11 @@ impl<C: MultiCharEq> Pattern for MultiCharEqPattern<C> {
     fn into_searcher(self, haystack: &str) -> MultiCharEqSearcher<'_, C> {
         MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() }
     }
+
+    #[inline]
+    fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
+        None
+    }
 }
 
 unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
@@ -747,6 +770,11 @@ macro_rules! pattern_methods {
         {
             ($pmap)(self).strip_suffix_of(haystack)
         }
+
+        #[inline]
+        fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
+            None
+        }
     };
 }
 
@@ -1022,6 +1050,11 @@ impl<'b> Pattern for &'b str {
             None
         }
     }
+
+    #[inline]
+    fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
+        Some(Utf8Pattern::StringPattern(self.as_bytes()))
+    }
 }
 
 /////////////////////////////////////////////////////////////////////////////