diff options
| author | bors <bors@rust-lang.org> | 2024-10-01 22:12:44 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2024-10-01 22:12:44 +0000 |
| commit | bfe5e8cef698ccc4fca655b4cdbabf78fed43816 (patch) | |
| tree | aae0b5d84cea2c8f0cc8b5a49ff1822d6d44dd60 | |
| parent | 06bb8364aaffefb0ce67e5f5445e66ec99c1f66e (diff) | |
| parent | 1562bf790974f0061c47d1a83b7a75cc507a744d (diff) | |
| download | rust-bfe5e8cef698ccc4fca655b4cdbabf78fed43816.tar.gz rust-bfe5e8cef698ccc4fca655b4cdbabf78fed43816.zip | |
Auto merge of #128204 - GuillaumeGomez:integers-opti, r=workingjubilee
Small optimization for integers Display implementation
This is a first pass to try to speed up a bit integers `Display` implementation. The idea behind this is to reduce the stack usage for the buffer storing the output (shouldn't be visible in bench normally) and some small specialization which benefits a lot to smaller integers like `u8` and `i8`.
Here are the results of the benchmarks:
| bench name | current std | with this PR |
|-|-|-|
| bench_std_fmt::bench_i16_0 | 16.45 ns/iter (+/- 0.25) | 16.50 ns/iter (+/- 0.15) |
| bench_std_fmt::bench_i16_max | 17.83 ns/iter (+/- 0.66) | 17.58 ns/iter (+/- 0.10) |
| bench_std_fmt::bench_i16_min | 20.97 ns/iter (+/- 0.49) | 20.50 ns/iter (+/- 0.28) |
| bench_std_fmt::bench_i32_0 | 16.63 ns/iter (+/- 0.06) | 16.62 ns/iter (+/- 0.07) |
| bench_std_fmt::bench_i32_max | 19.79 ns/iter (+/- 0.43) | 19.55 ns/iter (+/- 0.14) |
| bench_std_fmt::bench_i32_min | 22.97 ns/iter (+/- 0.50) | 22.08 ns/iter (+/- 0.08) |
| bench_std_fmt::bench_i64_0 | 16.63 ns/iter (+/- 0.39) | 16.69 ns/iter (+/- 0.44) |
| bench_std_fmt::bench_i64_half | 19.60 ns/iter (+/- 0.05) | 19.10 ns/iter (+/- 0.05) |
| bench_std_fmt::bench_i64_max | 25.22 ns/iter (+/- 0.34) | 24.43 ns/iter (+/- 0.02) |
| bench_std_fmt::bench_i8_0 | 16.27 ns/iter (+/- 0.32) | 15.80 ns/iter (+/- 0.17) |
| bench_std_fmt::bench_i8_max | 16.71 ns/iter (+/- 0.09) | 16.25 ns/iter (+/- 0.01) |
| bench_std_fmt::bench_i8_min | 20.07 ns/iter (+/- 0.22) | 19.80 ns/iter (+/- 0.30) |
| bench_std_fmt::bench_u128_0 | 21.37 ns/iter (+/- 0.24) | 21.35 ns/iter (+/- 0.35) |
| bench_std_fmt::bench_u128_max | 48.13 ns/iter (+/- 0.20) | 48.78 ns/iter (+/- 0.29) |
| bench_std_fmt::bench_u16_0 | 16.48 ns/iter (+/- 0.46) | 16.03 ns/iter (+/- 0.39) |
| bench_std_fmt::bench_u16_max | 17.31 ns/iter (+/- 0.32) | 17.41 ns/iter (+/- 0.32) |
| bench_std_fmt::bench_u16_min | 16.40 ns/iter (+/- 0.45) | 16.02 ns/iter (+/- 0.39) |
| bench_std_fmt::bench_u32_0 | 16.17 ns/iter (+/- 0.04) | 16.29 ns/iter (+/- 0.16) |
| bench_std_fmt::bench_u32_max | 19.00 ns/iter (+/- 0.10) | 19.16 ns/iter (+/- 0.28) |
| bench_std_fmt::bench_u32_min | 16.16 ns/iter (+/- 0.09) | 16.28 ns/iter (+/- 0.11) |
| bench_std_fmt::bench_u64_0 | 16.22 ns/iter (+/- 0.22) | 16.14 ns/iter (+/- 0.18) |
| bench_std_fmt::bench_u64_half | 19.25 ns/iter (+/- 0.07) | 18.95 ns/iter (+/- 0.05) |
| bench_std_fmt::bench_u64_max | 24.31 ns/iter (+/- 0.08) | 24.18 ns/iter (+/- 0.08) |
| bench_std_fmt::bench_u8_0 | 15.76 ns/iter (+/- 0.08) | 15.66 ns/iter (+/- 0.08) |
| bench_std_fmt::bench_u8_max | 16.53 ns/iter (+/- 0.03) | 16.29 ns/iter (+/- 0.02) |
| bench_std_fmt::bench_u8_min | 15.77 ns/iter (+/- 0.06) | 15.67 ns/iter (+/- 0.02) |
The source code is:
<details>
<summary>source code</summary>
```rust
#![feature(test)]
#![allow(non_snake_case)]
#![allow(clippy::cast_lossless)]
extern crate test;
macro_rules! benches {
($($name:ident($value:expr))*) => {
mod bench_std_fmt {
use std::io::Write;
use test::{Bencher, black_box};
$(
#[bench]
fn $name(b: &mut Bencher) {
let mut buf = Vec::with_capacity(40);
b.iter(|| {
buf.clear();
write!(&mut buf, "{}", black_box($value)).unwrap();
black_box(&buf);
});
}
)*
}
}
}
benches! {
bench_u64_0(0u64)
bench_u64_half(u32::max_value() as u64)
bench_u64_max(u64::max_value())
bench_i64_0(0i64)
bench_i64_half(i32::max_value() as i64)
bench_i64_max(i64::max_value())
bench_u16_0(0u16)
bench_u16_min(u16::min_value())
bench_u16_max(u16::max_value())
bench_i16_0(0i16)
bench_i16_min(i16::min_value())
bench_i16_max(i16::max_value())
bench_u128_0(0u128)
bench_u128_max(u128::max_value())
bench_i8_0(0i8)
bench_i8_min(i8::min_value())
bench_i8_max(i8::max_value())
bench_u8_0(0u8)
bench_u8_min(u8::min_value())
bench_u8_max(u8::max_value())
bench_u32_0(0u32)
bench_u32_min(u32::min_value())
bench_u32_max(u32::max_value())
bench_i32_0(0i32)
bench_i32_min(i32::min_value())
bench_i32_max(i32::max_value())
}
```
</details>
And then I ran the equivalent code (source code below) in callgrind with [callgrind_differ](https://github.com/Ethiraric/callgrind_differ) to generate a nice output and here's the result:
```
core::fmt::num::imp::<impl core::fmt::Display for i16>::fmt | 1300000 | - 70000 - 5.385% 1230000
core::fmt::num::imp::<impl core::fmt::Display for i32>::fmt | 1910000 | - 100000 - 5.236% 1810000
core::fmt::num::imp::<impl core::fmt::Display for i64>::fmt | 2430000 | - 110000 - 4.527% 2320000
core::fmt::num::imp::<impl core::fmt::Display for i8>::fmt | 1080000 | - 170000 - 15.741% 910000
core::fmt::num::imp::<impl core::fmt::Display for u16>::fmt | 960000 | + 10000 + 1.042% 970000
core::fmt::num::imp::<impl core::fmt::Display for u32>::fmt | 1300000 | + 30000 + 2.308% 1330000
core::fmt::num::imp::<impl core::fmt::Display for u8>::fmt | 820000 | - 30000 - 3.659% 790000
```
<details>
<summary>Source code</summary>
```rust
#![feature(test)]
extern crate test;
use std::io::{stdout, Write};
use std::io::StdoutLock;
use test::black_box;
macro_rules! benches {
($handle:ident, $buf:ident, $($name:ident($value:expr))*) => {
$(
fn $name(handle: &mut StdoutLock, buf: &mut Vec<u8>) {
for _ in 0..10000 {
buf.clear();
write!(buf, "{}", black_box($value)).unwrap();
handle.write_all(buf);
}
}
$name(&mut $handle, &mut $buf);
)*
}
}
fn main() {
let mut handle = stdout().lock();
let mut buf = Vec::with_capacity(40);
benches! {
handle, buf,
bench_u64_0(0u64)
bench_u64_half(u32::max_value() as u64)
bench_u64_max(u64::max_value())
bench_i64_0(0i64)
bench_i64_half(i32::max_value() as i64)
bench_i64_max(i64::max_value())
bench_u16_0(0u16)
bench_u16_min(u16::min_value())
bench_u16_max(u16::max_value())
bench_i16_0(0i16)
bench_i16_min(i16::min_value())
bench_i16_max(i16::max_value())
bench_u128_0(0u128)
bench_u128_max(u128::max_value())
bench_i8_0(0i8)
bench_i8_min(i8::min_value())
bench_i8_max(i8::max_value())
bench_u8_0(0u8)
bench_u8_min(u8::min_value())
bench_u8_max(u8::max_value())
bench_i32_0(0i32)
bench_i32_min(i32::min_value())
bench_i32_max(i32::max_value())
bench_u32_0(0u32)
bench_u32_min(u32::min_value())
bench_u32_max(u32::max_value())
}
}
```
</details>
The next step would be to specialize the `ToString` implementation so it doesn't go through the `Display` trait. I'm not sure if it will improve anything but I think it's worth a try.
r? `@Amanieu`
| -rw-r--r-- | library/core/src/fmt/num.rs | 212 |
1 files changed, 131 insertions, 81 deletions
diff --git a/library/core/src/fmt/num.rs b/library/core/src/fmt/num.rs index e7726da8d91..aecd725eca5 100644 --- a/library/core/src/fmt/num.rs +++ b/library/core/src/fmt/num.rs @@ -208,75 +208,119 @@ static DEC_DIGITS_LUT: &[u8; 200] = b"0001020304050607080910111213141516171819\ 8081828384858687888990919293949596979899"; macro_rules! impl_Display { - ($($t:ident),* as $u:ident via $conv_fn:ident named $name:ident) => { - #[cfg(not(feature = "optimize_for_size"))] - fn $name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // 2^128 is about 3*10^38, so 39 gives an extra byte of space - let mut buf = [MaybeUninit::<u8>::uninit(); 39]; - let mut curr = buf.len(); - let buf_ptr = MaybeUninit::slice_as_mut_ptr(&mut buf); - let lut_ptr = DEC_DIGITS_LUT.as_ptr(); + ($($t:ident $(as $positive:ident)? named $name:ident,)* ; as $u:ident via $conv_fn:ident named $gen_name:ident) => { - // SAFETY: Since `d1` and `d2` are always less than or equal to `198`, we - // can copy from `lut_ptr[d1..d1 + 1]` and `lut_ptr[d2..d2 + 1]`. To show - // that it's OK to copy into `buf_ptr`, notice that at the beginning - // `curr == buf.len() == 39 > log(n)` since `n < 2^128 < 10^39`, and at - // each step this is kept the same as `n` is divided. Since `n` is always - // non-negative, this means that `curr > 0` so `buf_ptr[curr..curr + 1]` - // is safe to access. - unsafe { - // need at least 16 bits for the 4-characters-at-a-time to work. - assert!(crate::mem::size_of::<$u>() >= 2); + $( + #[stable(feature = "rust1", since = "1.0.0")] + impl fmt::Display for $t { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // If it's a signed integer. + $( + let is_nonnegative = *self >= 0; - // eagerly decode 4 characters at a time - while n >= 10000 { - let rem = (n % 10000) as usize; - n /= 10000; + #[cfg(not(feature = "optimize_for_size"))] + { + if !is_nonnegative { + // convert the negative num to positive by summing 1 to its 2s complement + return (!self as $positive).wrapping_add(1)._fmt(false, f); + } + } + #[cfg(feature = "optimize_for_size")] + { + if !is_nonnegative { + // convert the negative num to positive by summing 1 to its 2s complement + return $gen_name((!self.$conv_fn()).wrapping_add(1), false, f); + } + } + )? + // If it's a positive integer. + #[cfg(not(feature = "optimize_for_size"))] + { + self._fmt(true, f) + } + #[cfg(feature = "optimize_for_size")] + { + $gen_name(self.$conv_fn(), true, f) + } + } + } - let d1 = (rem / 100) << 1; - let d2 = (rem % 100) << 1; - curr -= 4; + #[cfg(not(feature = "optimize_for_size"))] + impl $t { + fn _fmt(mut self: $t, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result { + const SIZE: usize = $t::MAX.ilog(10) as usize + 1; + let mut buf = [MaybeUninit::<u8>::uninit(); SIZE]; + let mut curr = SIZE; + let buf_ptr = MaybeUninit::slice_as_mut_ptr(&mut buf); + let lut_ptr = DEC_DIGITS_LUT.as_ptr(); + + // SAFETY: Since `d1` and `d2` are always less than or equal to `198`, we + // can copy from `lut_ptr[d1..d1 + 1]` and `lut_ptr[d2..d2 + 1]`. To show + // that it's OK to copy into `buf_ptr`, notice that at the beginning + // `curr == buf.len() == 39 > log(n)` since `n < 2^128 < 10^39`, and at + // each step this is kept the same as `n` is divided. Since `n` is always + // non-negative, this means that `curr > 0` so `buf_ptr[curr..curr + 1]` + // is safe to access. + unsafe { + // need at least 16 bits for the 4-characters-at-a-time to work. + #[allow(overflowing_literals)] + #[allow(unused_comparisons)] + // This block will be removed for smaller types at compile time and in the worst + // case, it will prevent to have the `10000` literal to overflow for `i8` and `u8`. + if core::mem::size_of::<$t>() >= 2 { + // eagerly decode 4 characters at a time + while self >= 10000 { + let rem = (self % 10000) as usize; + self /= 10000; + + let d1 = (rem / 100) << 1; + let d2 = (rem % 100) << 1; + curr -= 4; + + // We are allowed to copy to `buf_ptr[curr..curr + 3]` here since + // otherwise `curr < 0`. But then `n` was originally at least `10000^10` + // which is `10^40 > 2^128 > n`. + ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(curr), 2); + ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(curr + 2), 2); + } + } - // We are allowed to copy to `buf_ptr[curr..curr + 3]` here since - // otherwise `curr < 0`. But then `n` was originally at least `10000^10` - // which is `10^40 > 2^128 > n`. - ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2); - ptr::copy_nonoverlapping(lut_ptr.add(d2), buf_ptr.add(curr + 2), 2); - } + // if we reach here numbers are <= 9999, so at most 4 chars long + let mut n = self as usize; // possibly reduce 64bit math - // if we reach here numbers are <= 9999, so at most 4 chars long - let mut n = n as usize; // possibly reduce 64bit math + // decode 2 more chars, if > 2 chars + if n >= 100 { + let d1 = (n % 100) << 1; + n /= 100; + curr -= 2; + ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2); + } - // decode 2 more chars, if > 2 chars - if n >= 100 { - let d1 = (n % 100) << 1; - n /= 100; - curr -= 2; - ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2); + // if we reach here numbers are <= 100, so at most 2 chars long + // The biggest it can be is 99, and 99 << 1 == 198, so a `u8` is enough. + // decode last 1 or 2 chars + if n < 10 { + curr -= 1; + *buf_ptr.add(curr) = (n as u8) + b'0'; + } else { + let d1 = n << 1; + curr -= 2; + ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2); + } } - // decode last 1 or 2 chars - if n < 10 { - curr -= 1; - *buf_ptr.add(curr) = (n as u8) + b'0'; - } else { - let d1 = n << 1; - curr -= 2; - ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2); - } + // SAFETY: `curr` > 0 (since we made `buf` large enough), and all the chars are valid + // UTF-8 since `DEC_DIGITS_LUT` is + let buf_slice = unsafe { + str::from_utf8_unchecked( + slice::from_raw_parts(buf_ptr.add(curr), buf.len() - curr)) + }; + f.pad_integral(is_nonnegative, "", buf_slice) } - - // SAFETY: `curr` > 0 (since we made `buf` large enough), and all the chars are valid - // UTF-8 since `DEC_DIGITS_LUT` is - let buf_slice = unsafe { - str::from_utf8_unchecked( - slice::from_raw_parts(buf_ptr.add(curr), buf.len() - curr)) - }; - f.pad_integral(is_nonnegative, "", buf_slice) - } + })* #[cfg(feature = "optimize_for_size")] - fn $name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn $gen_name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result { // 2^128 is about 3*10^38, so 39 gives an extra byte of space let mut buf = [MaybeUninit::<u8>::uninit(); 39]; let mut curr = buf.len(); @@ -306,21 +350,6 @@ macro_rules! impl_Display { }; f.pad_integral(is_nonnegative, "", buf_slice) } - - $(#[stable(feature = "rust1", since = "1.0.0")] - impl fmt::Display for $t { - #[allow(unused_comparisons)] - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let is_nonnegative = *self >= 0; - let n = if is_nonnegative { - self.$conv_fn() - } else { - // convert the negative num to positive by summing 1 to it's 2 complement - (!self.$conv_fn()).wrapping_add(1) - }; - $name(n, is_nonnegative, f) - } - })* }; } @@ -374,7 +403,6 @@ macro_rules! impl_Exp { (n, exponent, exponent, added_precision) }; - // 39 digits (worst case u128) + . = 40 // Since `curr` always decreases by the number of digits copied, this means // that `curr >= 0`. let mut buf = [MaybeUninit::<u8>::uninit(); 40]; @@ -469,7 +497,7 @@ macro_rules! impl_Exp { let n = if is_nonnegative { self.$conv_fn() } else { - // convert the negative num to positive by summing 1 to it's 2 complement + // convert the negative num to positive by summing 1 to its 2s complement (!self.$conv_fn()).wrapping_add(1) }; $name(n, is_nonnegative, false, f) @@ -484,7 +512,7 @@ macro_rules! impl_Exp { let n = if is_nonnegative { self.$conv_fn() } else { - // convert the negative num to positive by summing 1 to it's 2 complement + // convert the negative num to positive by summing 1 to its 2s complement (!self.$conv_fn()).wrapping_add(1) }; $name(n, is_nonnegative, true, f) @@ -499,8 +527,17 @@ macro_rules! impl_Exp { mod imp { use super::*; impl_Display!( - i8, u8, i16, u16, i32, u32, i64, u64, usize, isize - as u64 via to_u64 named fmt_u64 + i8 as u8 named fmt_i8, + u8 named fmt_u8, + i16 as u16 named fmt_i16, + u16 named fmt_u16, + i32 as u32 named fmt_i32, + u32 named fmt_u32, + i64 as u64 named fmt_i64, + u64 named fmt_u64, + isize as usize named fmt_isize, + usize named fmt_usize, + ; as u64 via to_u64 named fmt_u64 ); impl_Exp!( i8, u8, i16, u16, i32, u32, i64, u64, usize, isize @@ -511,8 +548,21 @@ mod imp { #[cfg(not(any(target_pointer_width = "64", target_arch = "wasm32")))] mod imp { use super::*; - impl_Display!(i8, u8, i16, u16, i32, u32, isize, usize as u32 via to_u32 named fmt_u32); - impl_Display!(i64, u64 as u64 via to_u64 named fmt_u64); + impl_Display!( + i8 as u8 named fmt_i8, + u8 named fmt_u8, + i16 as u16 named fmt_i16, + u16 named fmt_u16, + i32 as u32 named fmt_i32, + u32 named fmt_u32, + isize as usize named fmt_isize, + usize named fmt_usize, + ; as u32 via to_u32 named fmt_u32); + impl_Display!( + i64 as u64 named fmt_i64, + u64 named fmt_u64, + ; as u64 via to_u64 named fmt_u64); + impl_Exp!(i8, u8, i16, u16, i32, u32, isize, usize as u32 via to_u32 named exp_u32); impl_Exp!(i64, u64 as u64 via to_u64 named exp_u64); } @@ -619,7 +669,7 @@ impl fmt::Display for i128 { let n = if is_nonnegative { self.to_u128() } else { - // convert the negative num to positive by summing 1 to it's 2 complement + // convert the negative num to positive by summing 1 to its 2s complement (!self.to_u128()).wrapping_add(1) }; fmt_u128(n, is_nonnegative, f) |
