diff options
| author | bors <bors@rust-lang.org> | 2020-08-16 23:52:32 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2020-08-16 23:52:32 +0000 |
| commit | 4bb4b96ee7eef87134293ff7e4c02f699b805d5e (patch) | |
| tree | d25054b3f866152c72cd244d51fb88abdffec9ba | |
| parent | 7e6d6e5f535321c2223f044caba16f97b825009c (diff) | |
| parent | 8ec348afdd35752677914ba28146d7b82e901817 (diff) | |
| download | rust-4bb4b96ee7eef87134293ff7e4c02f699b805d5e.tar.gz rust-4bb4b96ee7eef87134293ff7e4c02f699b805d5e.zip | |
Auto merge of #74562 - pickfire:is_ascii_branchless, r=nagisa
Remove branch in optimized is_ascii Performs slightly better in short or medium bytes by eliminating the last branch check on `byte_pos == len` and always check the last byte as it is always at most one `usize`. Benchmark, before `libcore`, after `libcore_new`. It improves medium and short by 1ns but regresses unaligned_tail by 2ns, either way we can get unaligned_tail have a tiny chance of 1/8 on a 64 bit machine. I don't think we should bet on that, the probability is worse than dice. ``` test long::case00_libcore ... bench: 38 ns/iter (+/- 1) = 183947 MB/s test long::case00_libcore_new ... bench: 38 ns/iter (+/- 1) = 183947 MB/s test long::case01_iter_all ... bench: 227 ns/iter (+/- 6) = 30792 MB/s test long::case02_align_to ... bench: 40 ns/iter (+/- 1) = 174750 MB/s test long::case03_align_to_unrolled ... bench: 19 ns/iter (+/- 1) = 367894 MB/s test medium::case00_libcore ... bench: 5 ns/iter (+/- 0) = 6400 MB/s test medium::case00_libcore_new ... bench: 4 ns/iter (+/- 0) = 8000 MB/s test medium::case01_iter_all ... bench: 20 ns/iter (+/- 1) = 1600 MB/s test medium::case02_align_to ... bench: 6 ns/iter (+/- 0) = 5333 MB/s test medium::case03_align_to_unrolled ... bench: 5 ns/iter (+/- 0) = 6400 MB/s test short::case00_libcore ... bench: 7 ns/iter (+/- 0) = 1000 MB/s test short::case00_libcore_new ... bench: 6 ns/iter (+/- 0) = 1166 MB/s test short::case01_iter_all ... bench: 5 ns/iter (+/- 0) = 1400 MB/s test short::case02_align_to ... bench: 5 ns/iter (+/- 0) = 1400 MB/s test short::case03_align_to_unrolled ... bench: 5 ns/iter (+/- 1) = 1400 MB/s test unaligned_both::case00_libcore ... bench: 4 ns/iter (+/- 0) = 7500 MB/s test unaligned_both::case00_libcore_new ... bench: 4 ns/iter (+/- 0) = 7500 MB/s test unaligned_both::case01_iter_all ... bench: 26 ns/iter (+/- 0) = 1153 MB/s test unaligned_both::case02_align_to ... bench: 13 ns/iter (+/- 2) = 2307 MB/s test unaligned_both::case03_align_to_unrolled ... bench: 11 ns/iter (+/- 0) = 2727 MB/s test unaligned_head::case00_libcore ... bench: 5 ns/iter (+/- 0) = 6200 MB/s test unaligned_head::case00_libcore_new ... bench: 5 ns/iter (+/- 0) = 6200 MB/s test unaligned_head::case01_iter_all ... bench: 19 ns/iter (+/- 1) = 1631 MB/s test unaligned_head::case02_align_to ... bench: 10 ns/iter (+/- 0) = 3100 MB/s test unaligned_head::case03_align_to_unrolled ... bench: 14 ns/iter (+/- 0) = 2214 MB/s test unaligned_tail::case00_libcore ... bench: 3 ns/iter (+/- 0) = 10333 MB/s test unaligned_tail::case00_libcore_new ... bench: 5 ns/iter (+/- 0) = 6200 MB/s test unaligned_tail::case01_iter_all ... bench: 19 ns/iter (+/- 0) = 1631 MB/s test unaligned_tail::case02_align_to ... bench: 10 ns/iter (+/- 0) = 3100 MB/s test unaligned_tail::case03_align_to_unrolled ... bench: 13 ns/iter (+/- 0) = 2384 MB/s ``` Rough (unfair) maths on improvements for fun: 1ns * 7/8 - 2ns * 1/8 = 0.625ns Inspired by fish and zsh clever trick to highlight missing linefeeds (⏎) and branchless implementation of binary_search in rust. cc @thomcc https://github.com/rust-lang/rust/pull/74066 r? @nagisa
| -rw-r--r-- | library/core/src/slice/mod.rs | 15 |
1 files changed, 6 insertions, 9 deletions
diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs index f2477ed6a6e..59c536bcff9 100644 --- a/library/core/src/slice/mod.rs +++ b/library/core/src/slice/mod.rs @@ -3029,7 +3029,7 @@ fn contains_nonascii(v: usize) -> bool { /// /// - Read the first word with an unaligned load. /// - Align the pointer, read subsequent words until end with aligned loads. -/// - If there's a tail, the last `usize` from `s` with an unaligned load. +/// - Read the last `usize` from `s` with an unaligned load. /// /// If any of these loads produces something for which `contains_nonascii` /// (above) returns true, then we know the answer is false. @@ -3077,7 +3077,10 @@ fn is_ascii(s: &[u8]) -> bool { // `align_offset` though. debug_assert_eq!((word_ptr as usize) % mem::align_of::<usize>(), 0); - while byte_pos <= len - USIZE_SIZE { + // Read subsequent words until the last aligned word, excluding the last + // aligned word by itself to be done in tail check later, to ensure that + // tail is always one `usize` at most to extra branch `byte_pos == len`. + while byte_pos < len - USIZE_SIZE { debug_assert!( // Sanity check that the read is in bounds (word_ptr as usize + USIZE_SIZE) <= (start.wrapping_add(len) as usize) && @@ -3098,15 +3101,9 @@ fn is_ascii(s: &[u8]) -> bool { word_ptr = unsafe { word_ptr.add(1) }; } - // If we have anything left over, it should be at-most 1 usize worth of bytes, - // which we check with a read_unaligned. - if byte_pos == len { - return true; - } - // Sanity check to ensure there really is only one `usize` left. This should // be guaranteed by our loop condition. - debug_assert!(byte_pos < len && len - byte_pos < USIZE_SIZE); + debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE); // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start. let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() }; |
