Rollup merge of #74066 - thomcc:optimize-is-ascii, r=nagisa

Optimize is_ascii for str and [u8].

This optimizes the `is_ascii` function for `[u8]` and `str`. I've been surprised for a while that this hadn't been done, so I just did it.

Benchmarks comparing before/after look like:

```
test ascii::long_readonly::is_ascii_slice_iter_all ... bench: 174 ns/iter (+/- 79) = 40172 MB/s
test ascii::long_readonly::is_ascii_slice_libcore ... bench: 16 ns/iter (+/- 5) = 436875 MB/s
test ascii::medium_readonly::is_ascii_slice_iter_all ... bench: 12 ns/iter (+/- 3) = 2666 MB/s
test ascii::medium_readonly::is_ascii_slice_libcore ... bench: 2 ns/iter (+/- 0) = 16000 MB/s
test ascii::short_readonly::is_ascii_slice_iter_all ... bench: 3 ns/iter (+/- 0) = 2333 MB/s
test ascii::short_readonly::is_ascii_slice_libcore ... bench: 4 ns/iter (+/- 0) = 1750 MB/s
```

(Taken on an x86_64 MacBook with a 2.9 GHz 6-core Intel Core i9.)

Here `is_ascii_slice_iter_all` is the old version, and `is_ascii_slice_libcore` is the new one.

I tried to document the code well, so hopefully it's understandable. It has fairly exhaustive tests ensuring size/align doesn't get violated -- because `miri` doesn't really help a lot for this sort of code right now, I tried to `debug_assert` all the safety invariants I'm depending on. (Of course, none of these assertions is required for correctness or soundness -- they just allow us to test that this sort of pointer manipulation is sound.)

Anyway, thanks. Let me know if you have questions or desired changes.
author: Manish Goregaokar <manishsmail@gmail.com> 2020-07-11 08:53:16 -0700
committer: GitHub <noreply@github.com> 2020-07-11 08:53:16 -0700
commit: 1979fa86f9fd8cc53384d2dabe775bcbf012a5ad (patch)
tree: 44744e4fcffb6dd96a97cecb0c96357805f19275 /src/libcore/tests
parent: 084ac77cf29e786df7251392bed0b6e6c7ea8786 (diff)
parent: a150dcc872b4f003c4a0e4cd7bb0e7c51ec791b2 (diff)
download: rust-1979fa86f9fd8cc53384d2dabe775bcbf012a5ad.tar.gz
rust-1979fa86f9fd8cc53384d2dabe775bcbf012a5ad.zip
1 files changed, 56 insertions, 0 deletions
diff --git a/src/libcore/tests/ascii.rs b/src/libcore/tests/ascii.rs
index 71275d40c46..57f2de16b2b 100644
--- a/src/libcore/tests/ascii.rs
+++ b/src/libcore/tests/ascii.rs
@@ -343,3 +343,59 @@ fn test_is_ascii_control() {
         " ",
     );
 }
+
+// `is_ascii` does a good amount of pointer manipulation and has
+// alignment-dependent computation. This is all sanity-checked via
+// `debug_assert!`s, so we test various sizes/alignments thoroughly versus an
+// "obviously correct" baseline function.
+#[test]
+fn test_is_ascii_align_size_thoroughly() {
+    // The "obviously-correct" baseline mentioned above.
+    fn is_ascii_baseline(s: &[u8]) -> bool {
+        s.iter().all(|b| b.is_ascii())
+    }
+
+    // Helper to repeat `l` copies of `b0` followed by `l` copies of `b1`.
+    fn repeat_concat(b0: u8, b1: u8, l: usize) -> Vec<u8> {
+        use core::iter::repeat;
+        repeat(b0).take(l).chain(repeat(b1).take(l)).collect()
+    }
+
+    // Miri is too slow for much of this, and in miri `align_offset` always
+    // returns `usize::max_value()` anyway (at the moment), so we just test
+    // lightly.
+    let iter = if cfg!(miri) { 0..5 } else { 0..100 };
+
+    for i in iter {
+        #[cfg(not(miri))]
+        let cases = &[
+            b"a".repeat(i),
+            b"\0".repeat(i),
+            b"\x7f".repeat(i),
+            b"\x80".repeat(i),
+            b"\xff".repeat(i),
+            repeat_concat(b'a', 0x80u8, i),
+            repeat_concat(0x80u8, b'a', i),
+        ];
+
+        #[cfg(miri)]
+        let cases = &[repeat_concat(b'a', 0x80u8, i)];
+
+        for case in cases {
+            for pos in 0..=case.len() {
+                // Potentially misaligned head
+                let prefix = &case[pos..];
+                assert_eq!(is_ascii_baseline(prefix), prefix.is_ascii(),);
+
+                // Potentially misaligned tail
+                let suffix = &case[..case.len() - pos];
+
+                assert_eq!(is_ascii_baseline(suffix), suffix.is_ascii(),);
+
+                // Both head and tail are potentially misaligned
+                let mid = &case[(pos / 2)..(case.len() - (pos / 2))];
+                assert_eq!(is_ascii_baseline(mid), mid.is_ascii(),);
+            }
+        }
+    }
+}
author	Manish Goregaokar <manishsmail@gmail.com>	2020-07-11 08:53:16 -0700
committer	GitHub <noreply@github.com>	2020-07-11 08:53:16 -0700
commit	1979fa86f9fd8cc53384d2dabe775bcbf012a5ad (patch)
tree	44744e4fcffb6dd96a97cecb0c96357805f19275 /src/libcore/tests
parent	084ac77cf29e786df7251392bed0b6e6c7ea8786 (diff)
parent	a150dcc872b4f003c4a0e4cd7bb0e7c51ec791b2 (diff)
download	rust-1979fa86f9fd8cc53384d2dabe775bcbf012a5ad.tar.gz rust-1979fa86f9fd8cc53384d2dabe775bcbf012a5ad.zip