about summary refs log tree commit diff
diff options
context:
space:
mode:
authorbors <bors@rust-lang.org>2021-10-12 03:18:54 +0000
committerbors <bors@rust-lang.org>2021-10-12 03:18:54 +0000
commitffdf18d1447b57faafded06a887b6dae1f7293c7 (patch)
treec73ef89236e7d026ab35fb4b2ad723638e7fcfab
parent97e3b30285af8526f8d21072a0c6d3f231e74876 (diff)
parent57c623570a16fd6d550627ee8fed1eb7882521f3 (diff)
downloadrust-ffdf18d1447b57faafded06a887b6dae1f7293c7.tar.gz
rust-ffdf18d1447b57faafded06a887b6dae1f7293c7.zip
Auto merge of #88788 - falk-hueffner:speedup-int-log10-branchless, r=joshtriplett
Speedup int log10 branchless

This is achieved with a branchless bit-twiddling implementation of the case x < 100_000, and using this as building block.

Benchmark on an Intel i7-8700K (Coffee Lake):

```
name                                   old ns/iter  new ns/iter  diff ns/iter   diff %  speedup
num::int_log::u8_log10_predictable     165          169                     4    2.42%   x 0.98
num::int_log::u8_log10_random          438          423                   -15   -3.42%   x 1.04
num::int_log::u8_log10_random_small    438          423                   -15   -3.42%   x 1.04
num::int_log::u16_log10_predictable    633          417                  -216  -34.12%   x 1.52
num::int_log::u16_log10_random         908          471                  -437  -48.13%   x 1.93
num::int_log::u16_log10_random_small   945          471                  -474  -50.16%   x 2.01
num::int_log::u32_log10_predictable    1,496        1,340                -156  -10.43%   x 1.12
num::int_log::u32_log10_random         1,076        873                  -203  -18.87%   x 1.23
num::int_log::u32_log10_random_small   1,145        874                  -271  -23.67%   x 1.31
num::int_log::u64_log10_predictable    4,005        3,171                -834  -20.82%   x 1.26
num::int_log::u64_log10_random         1,247        1,021                -226  -18.12%   x 1.22
num::int_log::u64_log10_random_small   1,265        921                  -344  -27.19%   x 1.37
num::int_log::u128_log10_predictable   39,667       39,579                -88   -0.22%   x 1.00
num::int_log::u128_log10_random        6,456        6,696                 240    3.72%   x 0.96
num::int_log::u128_log10_random_small  4,108        3,903                -205   -4.99%   x 1.05
```

Benchmark on an M1 Mac Mini:

```
name                                   old ns/iter  new ns/iter  diff ns/iter   diff %  speedup
num::int_log::u8_log10_predictable     143          130                   -13   -9.09%   x 1.10
num::int_log::u8_log10_random          375          325                   -50  -13.33%   x 1.15
num::int_log::u8_log10_random_small    376          325                   -51  -13.56%   x 1.16
num::int_log::u16_log10_predictable    500          322                  -178  -35.60%   x 1.55
num::int_log::u16_log10_random         794          405                  -389  -48.99%   x 1.96
num::int_log::u16_log10_random_small   1,035        405                  -630  -60.87%   x 2.56
num::int_log::u32_log10_predictable    1,144        894                  -250  -21.85%   x 1.28
num::int_log::u32_log10_random         832          786                   -46   -5.53%   x 1.06
num::int_log::u32_log10_random_small   832          787                   -45   -5.41%   x 1.06
num::int_log::u64_log10_predictable    2,681        2,057                -624  -23.27%   x 1.30
num::int_log::u64_log10_random         1,015        806                  -209  -20.59%   x 1.26
num::int_log::u64_log10_random_small   1,004        795                  -209  -20.82%   x 1.26
num::int_log::u128_log10_predictable   56,825       56,526               -299   -0.53%   x 1.01
num::int_log::u128_log10_random        9,056        8,861                -195   -2.15%   x 1.02
num::int_log::u128_log10_random_small  1,528        1,527                  -1   -0.07%   x 1.00
```

The 128 bit case remains ridiculously slow because llvm fails to optimize division by a constant 128-bit value to multiplications. This could be worked around but it seems preferable to fix this in llvm.

From u32 up, table lookup (like suggested [here](https://github.com/rust-lang/rust/issues/70887#issuecomment-881099813)) is still faster, but requires a hardware `leading_zeros` to be viable, and might clog up the cache.
-rw-r--r--library/core/benches/lib.rs1
-rw-r--r--library/core/benches/num/int_log/mod.rs58
-rw-r--r--library/core/benches/num/mod.rs1
-rw-r--r--library/core/src/num/int_log10.rs107
-rw-r--r--library/core/tests/num/int_log.rs3
5 files changed, 114 insertions, 56 deletions
diff --git a/library/core/benches/lib.rs b/library/core/benches/lib.rs
index 82c1789c24d..d5e1ec083f9 100644
--- a/library/core/benches/lib.rs
+++ b/library/core/benches/lib.rs
@@ -1,6 +1,7 @@
 // wasm32 does not support benches (no time).
 #![cfg(not(target_arch = "wasm32"))]
 #![feature(flt2dec)]
+#![feature(int_log)]
 #![feature(test)]
 
 extern crate test;
diff --git a/library/core/benches/num/int_log/mod.rs b/library/core/benches/num/int_log/mod.rs
new file mode 100644
index 00000000000..6a219bcdd55
--- /dev/null
+++ b/library/core/benches/num/int_log/mod.rs
@@ -0,0 +1,58 @@
+use rand::Rng;
+use test::{black_box, Bencher};
+
+macro_rules! int_log_bench {
+    ($t:ty, $predictable:ident, $random:ident, $random_small:ident) => {
+        #[bench]
+        fn $predictable(bench: &mut Bencher) {
+            bench.iter(|| {
+                for n in 0..(<$t>::BITS / 8) {
+                    for i in 1..=(100 as $t) {
+                        let x = black_box(i << (n * 8));
+                        black_box(x.log10());
+                    }
+                }
+            });
+        }
+
+        #[bench]
+        fn $random(bench: &mut Bencher) {
+            let mut rng = rand::thread_rng();
+            /* Exponentially distributed random numbers from the whole range of the type.  */
+            let numbers: Vec<$t> = (0..256)
+                .map(|_| {
+                    let x = rng.gen::<$t>() >> rng.gen_range(0, <$t>::BITS);
+                    if x != 0 { x } else { 1 }
+                })
+                .collect();
+            bench.iter(|| {
+                for x in &numbers {
+                    black_box(black_box(x).log10());
+                }
+            });
+        }
+
+        #[bench]
+        fn $random_small(bench: &mut Bencher) {
+            let mut rng = rand::thread_rng();
+            /* Exponentially distributed random numbers from the range 0..256.  */
+            let numbers: Vec<$t> = (0..256)
+                .map(|_| {
+                    let x = (rng.gen::<u8>() >> rng.gen_range(0, u8::BITS)) as $t;
+                    if x != 0 { x } else { 1 }
+                })
+                .collect();
+            bench.iter(|| {
+                for x in &numbers {
+                    black_box(black_box(x).log10());
+                }
+            });
+        }
+    };
+}
+
+int_log_bench! {u8, u8_log10_predictable, u8_log10_random, u8_log10_random_small}
+int_log_bench! {u16, u16_log10_predictable, u16_log10_random, u16_log10_random_small}
+int_log_bench! {u32, u32_log10_predictable, u32_log10_random, u32_log10_random_small}
+int_log_bench! {u64, u64_log10_predictable, u64_log10_random, u64_log10_random_small}
+int_log_bench! {u128, u128_log10_predictable, u128_log10_random, u128_log10_random_small}
diff --git a/library/core/benches/num/mod.rs b/library/core/benches/num/mod.rs
index 852d4e481e2..2f9cad2725d 100644
--- a/library/core/benches/num/mod.rs
+++ b/library/core/benches/num/mod.rs
@@ -1,5 +1,6 @@
 mod dec2flt;
 mod flt2dec;
+mod int_log;
 
 use std::str::FromStr;
 use test::Bencher;
diff --git a/library/core/src/num/int_log10.rs b/library/core/src/num/int_log10.rs
index e4599067f85..398bb07a07e 100644
--- a/library/core/src/num/int_log10.rs
+++ b/library/core/src/num/int_log10.rs
@@ -1,76 +1,71 @@
 mod unchecked {
     // 0 < val <= u8::MAX
     pub const fn u8(val: u8) -> u32 {
-        if val >= 100 {
-            2
-        } else if val >= 10 {
-            1
-        } else {
-            0
-        }
+        let val = val as u32;
+
+        // For better performance, avoid branches by assembling the solution
+        // in the bits above the low 8 bits.
+
+        // Adding c1 to val gives 10 in the top bits for val < 10, 11 for val >= 10
+        const C1: u32 = 0b11_00000000 - 10; // 758
+        // Adding c2 to val gives 01 in the top bits for val < 100, 10 for val >= 100
+        const C2: u32 = 0b10_00000000 - 100; // 412
+
+        // Value of top bits:
+        //            +c1  +c2  1&2
+        //     0..=9   10   01   00 = 0
+        //   10..=99   11   01   01 = 1
+        // 100..=255   11   10   10 = 2
+        ((val + C1) & (val + C2)) >> 8
     }
 
-    // 0 < val <= u16::MAX
-    pub const fn u16(val: u16) -> u32 {
-        if val >= 10_000 {
-            4
-        } else if val >= 1000 {
-            3
-        } else if val >= 100 {
-            2
-        } else if val >= 10 {
-            1
-        } else {
-            0
-        }
+    // 0 < val < 100_000
+    const fn less_than_5(val: u32) -> u32 {
+        // Similar to u8, when adding one of these constants to val,
+        // we get two possible bit patterns above the low 17 bits,
+        // depending on whether val is below or above the threshold.
+        const C1: u32 = 0b011_00000000000000000 - 10; // 393206
+        const C2: u32 = 0b100_00000000000000000 - 100; // 524188
+        const C3: u32 = 0b111_00000000000000000 - 1000; // 916504
+        const C4: u32 = 0b100_00000000000000000 - 10000; // 514288
+
+        // Value of top bits:
+        //                +c1  +c2  1&2  +c3  +c4  3&4   ^
+        //         0..=9  010  011  010  110  011  010  000 = 0
+        //       10..=99  011  011  011  110  011  010  001 = 1
+        //     100..=999  011  100  000  110  011  010  010 = 2
+        //   1000..=9999  011  100  000  111  011  011  011 = 3
+        // 10000..=99999  011  100  000  111  100  100  100 = 4
+        (((val + C1) & (val + C2)) ^ ((val + C3) & (val + C4))) >> 17
     }
 
-    // 0 < val < 100_000_000
-    const fn less_than_8(mut val: u32) -> u32 {
-        let mut log = 0;
-        if val >= 10_000 {
-            val /= 10_000;
-            log += 4;
-        }
-        log + if val >= 1000 {
-            3
-        } else if val >= 100 {
-            2
-        } else if val >= 10 {
-            1
-        } else {
-            0
-        }
+    // 0 < val <= u16::MAX
+    pub const fn u16(val: u16) -> u32 {
+        less_than_5(val as u32)
     }
 
     // 0 < val <= u32::MAX
     pub const fn u32(mut val: u32) -> u32 {
         let mut log = 0;
-        if val >= 100_000_000 {
-            val /= 100_000_000;
-            log += 8;
-        }
-        log + less_than_8(val)
-    }
-
-    // 0 < val < 10_000_000_000_000_000
-    const fn less_than_16(mut val: u64) -> u32 {
-        let mut log = 0;
-        if val >= 100_000_000 {
-            val /= 100_000_000;
-            log += 8;
+        if val >= 100_000 {
+            val /= 100_000;
+            log += 5;
         }
-        log + less_than_8(val as u32)
+        log + less_than_5(val)
     }
 
     // 0 < val <= u64::MAX
     pub const fn u64(mut val: u64) -> u32 {
         let mut log = 0;
-        if val >= 10_000_000_000_000_000 {
-            val /= 10_000_000_000_000_000;
-            log += 16;
+        if val >= 10_000_000_000 {
+            val /= 10_000_000_000;
+            log += 10;
+        }
+        if val >= 100_000 {
+            val /= 100_000;
+            log += 5;
         }
-        log + less_than_16(val)
+        log + less_than_5(val as u32)
     }
 
     // 0 < val <= u128::MAX
@@ -79,13 +74,13 @@ mod unchecked {
         if val >= 100_000_000_000_000_000_000_000_000_000_000 {
             val /= 100_000_000_000_000_000_000_000_000_000_000;
             log += 32;
-            return log + less_than_8(val as u32);
+            return log + u32(val as u32);
         }
         if val >= 10_000_000_000_000_000 {
             val /= 10_000_000_000_000_000;
             log += 16;
         }
-        log + less_than_16(val as u64)
+        log + u64(val as u64)
     }
 
     // 0 < val <= i8::MAX
diff --git a/library/core/tests/num/int_log.rs b/library/core/tests/num/int_log.rs
index 1517e8a952f..3cd0073ddd8 100644
--- a/library/core/tests/num/int_log.rs
+++ b/library/core/tests/num/int_log.rs
@@ -96,6 +96,9 @@ fn checked_log10() {
     for i in 1..=u16::MAX {
         assert_eq!(i.checked_log10(), Some((i as f32).log10() as u32));
     }
+    for i in 1..=100_000u32 {
+        assert_eq!(i.checked_log10(), Some((i as f32).log10() as u32));
+    }
 }
 
 macro_rules! log10_loop {