about summary refs log tree commit diff
diff options
context:
space:
mode:
authorAndrea Canciani <ranma42@gmail.com>2016-05-04 16:08:21 +0200
committerAndrea Canciani <ranma42@gmail.com>2016-05-13 15:18:31 +0200
commit78124a2b32d61a3ab1cc5798828195970438262b (patch)
tree7a4b4a8f796de72bb3afe2c48a77d4d3d9adaef2
parenta581c82bdc2c13a398b091878d7353bc3d097d59 (diff)
downloadrust-78124a2b32d61a3ab1cc5798828195970438262b.tar.gz
rust-78124a2b32d61a3ab1cc5798828195970438262b.zip
Fix fast path of float parsing on x87
The fast path of the float parser relies on the rounding to happen
exactly and directly to the correct number of bits. On x87, instead,
double rounding would occour as the FPU stack defaults to 80 bits of
precision.

This can be fixed by setting the precision of the FPU stack before
performing the int to float conversion. This can be achieved by
changing the value of the x87 control word. This is a somewhat common
operation that is in fact performed whenever a float needs to be
truncated to an integer, but it is undesirable to add its overhead for
code that does not rely on x87 for computations (i.e. on non-x86
architectures, or x86 architectures which perform FPU computations on
using SSE).

Fixes `num::dec2flt::fast_path_correct` (on x87).
-rw-r--r--src/libcore/lib.rs2
-rw-r--r--src/libcore/num/dec2flt/algorithm.rs47
2 files changed, 42 insertions, 7 deletions
diff --git a/src/libcore/lib.rs b/src/libcore/lib.rs
index e1bbdf4a7ae..a054e41b208 100644
--- a/src/libcore/lib.rs
+++ b/src/libcore/lib.rs
@@ -61,7 +61,9 @@
 #![cfg_attr(not(stage0), deny(warnings))]
 
 #![feature(allow_internal_unstable)]
+#![feature(asm)]
 #![feature(associated_type_defaults)]
+#![feature(cfg_target_feature)]
 #![feature(concat_idents)]
 #![feature(const_fn)]
 #![feature(cfg_target_has_atomic)]
diff --git a/src/libcore/num/dec2flt/algorithm.rs b/src/libcore/num/dec2flt/algorithm.rs
index e33c2814bf2..d64ad449e9a 100644
--- a/src/libcore/num/dec2flt/algorithm.rs
+++ b/src/libcore/num/dec2flt/algorithm.rs
@@ -32,19 +32,49 @@ fn power_of_ten(e: i16) -> Fp {
     Fp { f: sig, e: exp }
 }
 
+#[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
+mod fpu_precision {
+    pub fn set_precision<T>() { }
+}
+
+#[cfg(all(target_arch="x86", not(target_feature="sse2")))]
+mod fpu_precision {
+    use mem::size_of;
+    use ops::Drop;
+
+    pub struct FPUControlWord(u16);
+
+    fn set_cw(cw: u16) {
+        unsafe { asm!("fldcw $0" :: "m" (cw)) :: "volatile" }
+    }
+
+    pub fn set_precision<T>() -> FPUControlWord {
+        let cw = 0u16;
+        let cw_precision = match size_of::<T>() {
+            4 => 0x0000, // 32 bits
+            8 => 0x0200, // 64 bits
+            _ => 0x0300, // default, 80 bits
+        };
+        unsafe { asm!("fnstcw $0" : "=*m" (&cw)) ::: "volatile" }
+        set_cw((cw & 0xFCFF) | cw_precision);
+        FPUControlWord(cw)
+    }
+
+    impl Drop for FPUControlWord {
+        fn drop(&mut self) {
+            set_cw(self.0)
+        }
+    }
+}
+
 /// The fast path of Bellerophon using machine-sized integers and floats.
 ///
 /// This is extracted into a separate function so that it can be attempted before constructing
 /// a bignum.
 ///
 /// The fast path crucially depends on arithmetic being correctly rounded, so on x86
-/// without SSE or SSE2 it will be **wrong** (as in, off by one ULP occasionally), because the x87
-/// FPU stack will round to 80 bit first before rounding to 64/32 bit. However, as such hardware
-/// is extremely rare nowadays and in fact all in-tree target triples assume an SSE2-capable
-/// microarchitecture, there is little incentive to deal with that. There's a test that will fail
-/// when SSE or SSE2 is disabled, so people building their own non-SSE copy will get a heads up.
-///
-/// FIXME: It would nevertheless be nice if we had a good way to detect and deal with x87.
+/// without SSE or SSE2 it requires the precision of the x87 FPU stack to be changed
+/// so that it directly rounds to 64/32 bit.
 pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
     let num_digits = integral.len() + fractional.len();
     // log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end,
@@ -60,6 +90,9 @@ pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Opt
     if f > T::max_sig() {
         return None;
     }
+
+    let _cw = fpu_precision::set_precision::<T>();
+
     // The case e < 0 cannot be folded into the other branch. Negative powers result in
     // a repeating fractional part in binary, which are rounded, which causes real
     // (and occasioally quite significant!) errors in the final result.