diff options
| author | Simon Sapin <simon.sapin@exyr.org> | 2019-03-19 00:50:26 +0100 |
|---|---|---|
| committer | Simon Sapin <simon.sapin@exyr.org> | 2019-03-19 00:50:26 +0100 |
| commit | 0ad91f73d92c3b8d3978f8f54c04b8efe3d2e673 (patch) | |
| tree | 806084ebc5277243acbb738fd570642947445412 | |
| parent | 4a3241a815e951dfa87e9be7133e8265bc945e57 (diff) | |
| download | rust-0ad91f73d92c3b8d3978f8f54c04b8efe3d2e673.tar.gz rust-0ad91f73d92c3b8d3978f8f54c04b8efe3d2e673.zip | |
Simplify u8::to_ascii_{upp,low}ercase while keeping it fast
| -rw-r--r-- | src/libcore/benches/ascii.rs | 24 | ||||
| -rw-r--r-- | src/libcore/num/mod.rs | 46 |
2 files changed, 27 insertions, 43 deletions
diff --git a/src/libcore/benches/ascii.rs b/src/libcore/benches/ascii.rs index ce36027394a..89e67cca4b7 100644 --- a/src/libcore/benches/ascii.rs +++ b/src/libcore/benches/ascii.rs @@ -1,4 +1,26 @@ -// See comments in `u8::to_ascii_uppercase` in `src/libcore/num/mod.rs`. +// Lower-case ASCII 'a' is the first byte that has its highest bit set +// after wrap-adding 0x1F: +// +// b'a' + 0x1F == 0x80 == 0b1000_0000 +// b'z' + 0x1F == 0x98 == 0b10011000 +// +// Lower-case ASCII 'z' is the last byte that has its highest bit unset +// after wrap-adding 0x05: +// +// b'a' + 0x05 == 0x66 == 0b0110_0110 +// b'z' + 0x05 == 0x7F == 0b0111_1111 +// +// … except for 0xFB to 0xFF, but those are in the range of bytes +// that have the highest bit unset again after adding 0x1F. +// +// So `(byte + 0x1f) & !(byte + 5)` has its highest bit set +// iff `byte` is a lower-case ASCII letter. +// +// Lower-case ASCII letters all have the 0x20 bit set. +// (Two positions right of 0x80, the highest bit.) +// Unsetting that bit produces the same letter, in upper-case. 
+// +// Therefore: fn branchless_to_ascii_upper_case(byte: u8) -> u8 { byte & !( diff --git a/src/libcore/num/mod.rs b/src/libcore/num/mod.rs index 64469a4b7e4..3fcae6b94b0 100644 --- a/src/libcore/num/mod.rs +++ b/src/libcore/num/mod.rs @@ -3794,39 +3794,8 @@ impl u8 { #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] #[inline] pub fn to_ascii_uppercase(&self) -> u8 { - // See benchmarks in src/libcore/benches/ascii_case.rs - - // Lower-case ASCII 'a' is the first byte that has its highest bit set - // after wrap-adding 0x1F: - // - // b'a' + 0x1F == 0x80 == 0b1000_0000 - // b'z' + 0x1F == 0x98 == 0b10011000 - // - // Lower-case ASCII 'z' is the last byte that has its highest bit unset - // after wrap-adding 0x05: - // - // b'a' + 0x05 == 0x66 == 0b0110_0110 - // b'z' + 0x05 == 0x7F == 0b0111_1111 - // - // … except for 0xFB to 0xFF, but those are in the range of bytes - // that have the highest bit unset again after adding 0x1F. - // - // So `(byte + 0x1f) & !(byte + 5)` has its highest bit set - // iff `byte` is a lower-case ASCII letter. - // - // Lower-case ASCII letters all have the 0x20 bit set. - // (Two positions right of 0x80, the highest bit.) - // Unsetting that bit produces the same letter, in upper-case. - // - // Therefore: - *self & - !( - ( - self.wrapping_add(0x1f) & - !self.wrapping_add(0x05) & - 0x80 - ) >> 2 - ) + // Unset the fifth bit if this is a lowercase letter + *self & !((self.is_ascii_lowercase() as u8) << 5) } /// Makes a copy of the value in its ASCII lower case equivalent. @@ -3848,15 +3817,8 @@ impl u8 { #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] #[inline] pub fn to_ascii_lowercase(&self) -> u8 { - // See comments in to_ascii_uppercase above. 
- *self | - ( - ( - self.wrapping_add(0x3f) & - !self.wrapping_add(0x25) & - 0x80 - ) >> 2 - ) + // Set the fifth bit if this is an uppercase letter + *self | ((self.is_ascii_uppercase() as u8) << 5) } /// Checks that two values are an ASCII case-insensitive match. |
