From c82da7a54b9efb1a0ccbe11de66c71f547bf7db9 Mon Sep 17 00:00:00 2001
From: Kang Seonghoon <public+git@mearie.org>
Date: Sun, 19 Apr 2015 14:19:54 +0900
Subject: core: added core::num::flt2dec for floating-point formatting.

This is a fork of the flt2dec portion of rust-strconv [1] with
a necessary relicensing (the original code was licensed CC0-1.0).
Each module is accompanied with large unit tests, integrated
in this commit as coretest::num::flt2dec. This module is added
in order to replace the existing core::fmt::float method.

The forked revision of rust-strconv is from 2015-04-20, with a commit ID
9adf6d3571c6764a6f240a740c823024f70dc1c7.

[1] https://github.com/lifthrasiir/rust-strconv/
---
 src/libcore/num/flt2dec/bignum.rs          | 355 ++++++++++++++
 src/libcore/num/flt2dec/decoder.rs         | 100 ++++
 src/libcore/num/flt2dec/estimator.rs       |  23 +
 src/libcore/num/flt2dec/mod.rs             | 662 +++++++++++++++++++++++++
 src/libcore/num/flt2dec/strategy/dragon.rs | 327 +++++++++++++
 src/libcore/num/flt2dec/strategy/grisu.rs  | 749 +++++++++++++++++++++++++++++
 src/libcore/num/mod.rs                     |   3 +
 7 files changed, 2219 insertions(+)
 create mode 100644 src/libcore/num/flt2dec/bignum.rs
 create mode 100644 src/libcore/num/flt2dec/decoder.rs
 create mode 100644 src/libcore/num/flt2dec/estimator.rs
 create mode 100644 src/libcore/num/flt2dec/mod.rs
 create mode 100644 src/libcore/num/flt2dec/strategy/dragon.rs
 create mode 100644 src/libcore/num/flt2dec/strategy/grisu.rs

(limited to 'src/libcore/num')
diff --git a/src/libcore/num/flt2dec/bignum.rs b/src/libcore/num/flt2dec/bignum.rs
new file mode 100644
index 00000000000..449d080dc5c
--- /dev/null
+++ b/src/libcore/num/flt2dec/bignum.rs
@@ -0,0 +1,355 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Custom arbitrary-precision number (bignum) implementation.
+//!
+//! This is designed to avoid the heap allocation at expense of stack memory.
+//! The most used bignum type, `Big32x36`, is limited by 32 × 36 = 1,152 bits
+//! and will take at most 152 bytes of stack memory. This is (barely) enough
+//! for handling all possible finite `f64` values.
+//!
+//! In principle it is possible to have multiple bignum types for different
+//! inputs, but we don't do so to avoid the code bloat. Each bignum is still
+//! tracked for the actual usages, so it normally doesn't matter.
+
+#![macro_use]
+
+use prelude::*;
+use mem;
+use intrinsics;
+
+/// Arithmetic operations required by bignums.
+pub trait FullOps {
+    /// Returns `(carry', v')` such that `carry' * 2^W + v' = self + other + carry`,
+    /// where `W` is the number of bits in `Self`.
+    fn full_add(self, other: Self, carry: bool) -> (bool /*carry*/, Self);
+
+    /// Returns `(carry', v')` such that `carry' * 2^W + v' = self * other + carry`,
+    /// where `W` is the number of bits in `Self`.
+    fn full_mul(self, other: Self, carry: Self) -> (Self /*carry*/, Self);
+
+    /// Returns `(carry', v')` such that `carry' * 2^W + v' = self * other + other2 + carry`,
+    /// where `W` is the number of bits in `Self`.
+    fn full_mul_add(self, other: Self, other2: Self, carry: Self) -> (Self /*carry*/, Self);
+
+    /// Returns `(quo, rem)` such that `borrow * 2^W + self = quo * other + rem`
+    /// and `0 <= rem < other`, where `W` is the number of bits in `Self`.
+    fn full_div_rem(self, other: Self, borrow: Self) -> (Self /*quotient*/, Self /*remainder*/);
+}
+
+macro_rules! impl_full_ops {
+    ($($ty:ty: add($addfn:path), mul/div($bigty:ident);)*) => (
+        $(
+            impl FullOps for $ty {
+                fn full_add(self, other: $ty, carry: bool) -> (bool, $ty) {
+                    // this cannot overflow, the output is between 0 and 2*2^nbits - 1
+                    // FIXME will LLVM optimize this into ADC or similar???
+                    let (v, carry1) = unsafe { $addfn(self, other) };
+                    let (v, carry2) = unsafe { $addfn(v, if carry {1} else {0}) };
+                    (carry1 || carry2, v)
+                }
+
+                fn full_mul(self, other: $ty, carry: $ty) -> ($ty, $ty) {
+                    // this cannot overflow, the output is between 0 and 2^nbits * (2^nbits - 1)
+                    let nbits = mem::size_of::<$ty>() * 8;
+                    let v = (self as $bigty) * (other as $bigty) + (carry as $bigty);
+                    ((v >> nbits) as $ty, v as $ty)
+                }
+
+                fn full_mul_add(self, other: $ty, other2: $ty, carry: $ty) -> ($ty, $ty) {
+                    // this cannot overflow, the output is between 0 and 2^(2*nbits) - 1
+                    let nbits = mem::size_of::<$ty>() * 8;
+                    let v = (self as $bigty) * (other as $bigty) + (other2 as $bigty) +
+                            (carry as $bigty);
+                    ((v >> nbits) as $ty, v as $ty)
+                }
+
+                fn full_div_rem(self, other: $ty, borrow: $ty) -> ($ty, $ty) {
+                    debug_assert!(borrow < other);
+                    // this cannot overflow, the dividend is between 0 and other * 2^nbits - 1
+                    let nbits = mem::size_of::<$ty>() * 8;
+                    let lhs = ((borrow as $bigty) << nbits) | (self as $bigty);
+                    let rhs = other as $bigty;
+                    ((lhs / rhs) as $ty, (lhs % rhs) as $ty)
+                }
+            }
+        )*
+    )
+}
+
+impl_full_ops! {
+    u8:  add(intrinsics::u8_add_with_overflow),  mul/div(u16);
+    u16: add(intrinsics::u16_add_with_overflow), mul/div(u32);
+    u32: add(intrinsics::u32_add_with_overflow), mul/div(u64);
+//  u64: add(intrinsics::u64_add_with_overflow), mul/div(u128); // damn!
+}
+
+macro_rules! define_bignum {
+    ($name:ident: type=$ty:ty, n=$n:expr) => (
+        /// Stack-allocated arbitrary-precision (up to certain limit) integer.
+        ///
+        /// This is backed by an fixed-size array of given type ("digit").
+        /// While the array is not very large (normally some hundred bytes),
+        /// copying it recklessly may result in the performance hit.
+        /// Thus this is intentionally not `Copy`.
+        ///
+        /// All operations available to bignums panic in the case of over/underflows.
+        /// The caller is responsible to use large enough bignum types.
+        pub struct $name {
+            /// One plus the offset to the maximum "digit" in the use.
+            /// This does not decrease, so be aware of the computation order.
+            /// `base[size..]` should be zero.
+            size: usize,
+            /// Digits. `[a, b, c, ...]` represents `a + b*n + c*n^2 + ...`.
+            base: [$ty; $n]
+        }
+
+        impl $name {
+            /// Makes a bignum from one digit.
+            pub fn from_small(v: $ty) -> $name {
+                let mut base = [0; $n];
+                base[0] = v;
+                $name { size: 1, base: base }
+            }
+
+            /// Makes a bignum from `u64` value.
+            pub fn from_u64(mut v: u64) -> $name {
+                use mem;
+
+                let mut base = [0; $n];
+                let mut sz = 0;
+                while v > 0 {
+                    base[sz] = v as $ty;
+                    v >>= mem::size_of::<$ty>() * 8;
+                    sz += 1;
+                }
+                $name { size: sz, base: base }
+            }
+
+            /// Returns true if the bignum is zero.
+            pub fn is_zero(&self) -> bool {
+                self.base[..self.size].iter().all(|&v| v == 0)
+            }
+
+            /// Adds `other` to itself and returns its own mutable reference.
+            pub fn add<'a>(&'a mut self, other: &$name) -> &'a mut $name {
+                use cmp;
+                use num::flt2dec::bignum::FullOps;
+
+                let mut sz = cmp::max(self.size, other.size);
+                let mut carry = false;
+                for (a, b) in self.base[..sz].iter_mut().zip(other.base[..sz].iter()) {
+                    let (c, v) = (*a).full_add(*b, carry);
+                    *a = v;
+                    carry = c;
+                }
+                if carry {
+                    self.base[sz] = 1;
+                    sz += 1;
+                }
+                self.size = sz;
+                self
+            }
+
+            /// Subtracts `other` from itself and returns its own mutable reference.
+            pub fn sub<'a>(&'a mut self, other: &$name) -> &'a mut $name {
+                use cmp;
+                use num::flt2dec::bignum::FullOps;
+
+                let sz = cmp::max(self.size, other.size);
+                let mut noborrow = true;
+                for (a, b) in self.base[..sz].iter_mut().zip(other.base[..sz].iter()) {
+                    let (c, v) = (*a).full_add(!*b, noborrow);
+                    *a = v;
+                    noborrow = c;
+                }
+                assert!(noborrow);
+                self.size = sz;
+                self
+            }
+
+            /// Multiplies itself by a digit-sized `other` and returns its own
+            /// mutable reference.
+            pub fn mul_small<'a>(&'a mut self, other: $ty) -> &'a mut $name {
+                use num::flt2dec::bignum::FullOps;
+
+                let mut sz = self.size;
+                let mut carry = 0;
+                for a in self.base[..sz].iter_mut() {
+                    let (c, v) = (*a).full_mul(other, carry);
+                    *a = v;
+                    carry = c;
+                }
+                if carry > 0 {
+                    self.base[sz] = carry;
+                    sz += 1;
+                }
+                self.size = sz;
+                self
+            }
+
+            /// Multiplies itself by `2^bits` and returns its own mutable reference.
+            pub fn mul_pow2<'a>(&'a mut self, bits: usize) -> &'a mut $name {
+                use mem;
+
+                let digitbits = mem::size_of::<$ty>() * 8;
+                let digits = bits / digitbits;
+                let bits = bits % digitbits;
+
+                assert!(digits < $n);
+                debug_assert!(self.base[$n-digits..].iter().all(|&v| v == 0));
+                debug_assert!(bits == 0 || (self.base[$n-digits-1] >> (digitbits - bits)) == 0);
+
+                // shift by `digits * digitbits` bits
+                for i in (0..self.size).rev() {
+                    self.base[i+digits] = self.base[i];
+                }
+                for i in 0..digits {
+                    self.base[i] = 0;
+                }
+
+                // shift by `nbits` bits
+                let mut sz = self.size + digits;
+                if bits > 0 {
+                    let last = sz;
+                    let overflow = self.base[last-1] >> (digitbits - bits);
+                    if overflow > 0 {
+                        self.base[last] = overflow;
+                        sz += 1;
+                    }
+                    for i in (digits+1..last).rev() {
+                        self.base[i] = (self.base[i] << bits) |
+                                       (self.base[i-1] >> (digitbits - bits));
+                    }
+                    self.base[digits] <<= bits;
+                    // self.base[..digits] is zero, no need to shift
+                }
+
+                self.size = sz;
+                self
+            }
+
+            /// Multiplies itself by a number described by `other[0] + other[1] * n +
+            /// other[2] * n^2 + ...` and returns its own mutable reference.
+            pub fn mul_digits<'a>(&'a mut self, other: &[$ty]) -> &'a mut $name {
+                // the internal routine. works best when aa.len() <= bb.len().
+                fn mul_inner(ret: &mut [$ty; $n], aa: &[$ty], bb: &[$ty]) -> usize {
+                    use num::flt2dec::bignum::FullOps;
+
+                    let mut retsz = 0;
+                    for (i, &a) in aa.iter().enumerate() {
+                        if a == 0 { continue; }
+                        let mut sz = bb.len();
+                        let mut carry = 0;
+                        for (j, &b) in bb.iter().enumerate() {
+                            let (c, v) = a.full_mul_add(b, ret[i + j], carry);
+                            ret[i + j] = v;
+                            carry = c;
+                        }
+                        if carry > 0 {
+                            ret[i + sz] = carry;
+                            sz += 1;
+                        }
+                        if retsz < i + sz {
+                            retsz = i + sz;
+                        }
+                    }
+                    retsz
+                }
+
+                let mut ret = [0; $n];
+                let retsz = if self.size < other.len() {
+                    mul_inner(&mut ret, &self.base[..self.size], other)
+                } else {
+                    mul_inner(&mut ret, other, &self.base[..self.size])
+                };
+                self.base = ret;
+                self.size = retsz;
+                self
+            }
+
+            /// Divides itself by a digit-sized `other` and returns its own
+            /// mutable reference *and* the remainder.
+            pub fn div_rem_small<'a>(&'a mut self, other: $ty) -> (&'a mut $name, $ty) {
+                use num::flt2dec::bignum::FullOps;
+
+                assert!(other > 0);
+
+                let sz = self.size;
+                let mut borrow = 0;
+                for a in self.base[..sz].iter_mut().rev() {
+                    let (q, r) = (*a).full_div_rem(other, borrow);
+                    *a = q;
+                    borrow = r;
+                }
+                (self, borrow)
+            }
+        }
+
+        impl ::cmp::PartialEq for $name {
+            fn eq(&self, other: &$name) -> bool { self.base[..] == other.base[..] }
+        }
+
+        impl ::cmp::Eq for $name {
+        }
+
+        impl ::cmp::PartialOrd for $name {
+            fn partial_cmp(&self, other: &$name) -> ::option::Option<::cmp::Ordering> {
+                ::option::Option::Some(self.cmp(other))
+            }
+        }
+
+        impl ::cmp::Ord for $name {
+            fn cmp(&self, other: &$name) -> ::cmp::Ordering {
+                use cmp::max;
+                use iter::order;
+
+                let sz = max(self.size, other.size);
+                let lhs = self.base[..sz].iter().cloned().rev();
+                let rhs = other.base[..sz].iter().cloned().rev();
+                order::cmp(lhs, rhs)
+            }
+        }
+
+        impl ::clone::Clone for $name {
+            fn clone(&self) -> $name {
+                $name { size: self.size, base: self.base }
+            }
+        }
+
+        impl ::fmt::Debug for $name {
+            fn fmt(&self, f: &mut ::fmt::Formatter) -> ::fmt::Result {
+                use mem;
+
+                let sz = if self.size < 1 {1} else {self.size};
+                let digitlen = mem::size_of::<$ty>() * 2;
+
+                try!(write!(f, "{:#x}", self.base[sz-1]));
+                for &v in self.base[..sz-1].iter().rev() {
+                    try!(write!(f, "_{:01$x}", v, digitlen));
+                }
+                ::result::Result::Ok(())
+            }
+        }
+    )
+}
+
+/// The digit type for `Big32x36`.
+pub type Digit32 = u32;
+
+define_bignum!(Big32x36: type=Digit32, n=36);
+
+// this one is used for testing only.
+#[doc(hidden)]
+pub mod tests {
+    use prelude::*;
+    define_bignum!(Big8x3: type=u8, n=3);
+}
+
diff --git a/src/libcore/num/flt2dec/decoder.rs b/src/libcore/num/flt2dec/decoder.rs
new file mode 100644
index 00000000000..4b7d00f80e1
--- /dev/null
+++ b/src/libcore/num/flt2dec/decoder.rs
@@ -0,0 +1,100 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Decodes a floating-point value into individual parts and error ranges.
+
+use prelude::*;
+
+use {f32, f64};
+use num::{Float, FpCategory};
+
+/// Decoded unsigned finite value, such that:
+///
+/// - The original value equals to `mant * 2^exp`.
+///
+/// - Any number from `(mant - minus) * 2^exp` to `(mant + plus) * 2^exp` will
+///   round to the original value. The range is inclusive only when
+///   `inclusive` is true.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub struct Decoded {
+    /// The scaled mantissa.
+    pub mant: u64,
+    /// The lower error range.
+    pub minus: u64,
+    /// The upper error range.
+    pub plus: u64,
+    /// The shared exponent in base 2.
+    pub exp: i16,
+    /// True when the error range is inclusive.
+    ///
+    /// In IEEE 754, this is true when the original mantissa was even.
+    pub inclusive: bool,
+}
+
+/// Decoded unsigned value.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub enum FullDecoded {
+    /// Not-a-number.
+    Nan,
+    /// Infinities, either positive or negative.
+    Infinite,
+    /// Zero, either positive or negative.
+    Zero,
+    /// Finite numbers with further decoded fields.
+    Finite(Decoded),
+}
+
+/// A floating point type which can be `decode`d.
+pub trait DecodableFloat: Float + Copy {
+    /// The minimum positive normalized value.
+    fn min_pos_norm_value() -> Self;
+}
+
+impl DecodableFloat for f32 {
+    fn min_pos_norm_value() -> Self { f32::MIN_POSITIVE }
+}
+
+impl DecodableFloat for f64 {
+    fn min_pos_norm_value() -> Self { f64::MIN_POSITIVE }
+}
+
+/// Returns a sign (true when negative) and `FullDecoded` value
+/// from given floating point number.
+pub fn decode<T: DecodableFloat>(v: T) -> (/*negative?*/ bool, FullDecoded) {
+    let (mant, exp, sign) = v.integer_decode();
+    let even = (mant & 1) == 0;
+    let decoded = match v.classify() {
+        FpCategory::Nan => FullDecoded::Nan,
+        FpCategory::Infinite => FullDecoded::Infinite,
+        FpCategory::Zero => FullDecoded::Zero,
+        FpCategory::Subnormal => {
+            // (mant - 2, exp) -- (mant, exp) -- (mant + 2, exp)
+            // Float::integer_decode always preserves the exponent,
+            // so the mantissa is scaled for subnormals.
+            FullDecoded::Finite(Decoded { mant: mant, minus: 1, plus: 1,
+                                          exp: exp, inclusive: even })
+        }
+        FpCategory::Normal => {
+            let minnorm = <T as DecodableFloat>::min_pos_norm_value().integer_decode();
+            if mant == minnorm.0 && exp == minnorm.1 {
+                // (maxmant, exp - 1) -- (minnormmant, exp) -- (minnormmant + 1, exp)
+                // where maxmant = minnormmant * 2 - 1
+                FullDecoded::Finite(Decoded { mant: mant << 1, minus: 1, plus: 2,
+                                              exp: exp - 1, inclusive: even })
+            } else {
+                // (mant - 1, exp) -- (mant, exp) -- (mant + 1, exp)
+                FullDecoded::Finite(Decoded { mant: mant << 1, minus: 1, plus: 1,
+                                              exp: exp - 1, inclusive: even })
+            }
+        }
+    };
+    (sign < 0, decoded)
+}
+
diff --git a/src/libcore/num/flt2dec/estimator.rs b/src/libcore/num/flt2dec/estimator.rs
new file mode 100644
index 00000000000..d3e06e406db
--- /dev/null
+++ b/src/libcore/num/flt2dec/estimator.rs
@@ -0,0 +1,23 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! The exponent estimator.
+
+/// Finds `k_0` such that `10^(k_0-1) < mant * 2^exp <= 10^(k_0+1)`.
+///
+/// This is used to approximate `k = ceil(log_10 (mant * 2^exp))`;
+/// the true `k` is either `k_0` or `k_0+1`.
+#[doc(hidden)]
+pub fn estimate_scaling_factor(mant: u64, exp: i16) -> i16 {
+    // 2^(nbits-1) < mant <= 2^nbits if mant > 0
+    let nbits = 64 - (mant - 1).leading_zeros() as i64;
+    (((nbits + exp as i64) * 1292913986) >> 32) as i16
+}
+
diff --git a/src/libcore/num/flt2dec/mod.rs b/src/libcore/num/flt2dec/mod.rs
new file mode 100644
index 00000000000..8b4b3a45e9b
--- /dev/null
+++ b/src/libcore/num/flt2dec/mod.rs
@@ -0,0 +1,662 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+/*!
+
+Floating-point number to decimal conversion routines.
+
+# Problem statement
+
+We are given the floating-point number `v = f * 2^e` with an integer `f`,
+and its bounds `minus` and `plus` such that any number between `v - minus` and
+`v + plus` will be rounded to `v`. For the simplicity we assume that
+this range is exclusive. Then we would like to get the unique decimal
+representation `V = 0.d[0..n-1] * 10^k` such that:
+
+- `d[0]` is non-zero.
+
+- It's correctly rounded when parsed back: `v - minus < V < v + plus`.
+  Furthermore it is shortest such one, i.e. there is no representation
+  with less than `n` digits that is correctly rounded.
+
+- It's closest to the original value: `abs(V - v) <= 10^(k-n) / 2`. Note that
+  there might be two representations satisfying this uniqueness requirement,
+  in which case some tie-breaking mechanism is used.
+
+We will call this mode of operation as to the *shortest* mode. This mode is used
+when there is no additional constraint, and can be thought as a "natural" mode
+as it matches the ordinary intuition (it at least prints `0.1f32` as "0.1").
+
+We have two more modes of operation closely related to each other. In these modes
+we are given either the number of significant digits `n` or the last-digit
+limitation `limit` (which determines the actual `n`), and we would like to get
+the representation `V = 0.d[0..n-1] * 10^k` such that:
+
+- `d[0]` is non-zero, unless `n` was zero in which case only `k` is returned.
+
+- It's closest to the original value: `abs(V - v) <= 10^(k-n) / 2`. Again,
+  there might be some tie-breaking mechanism.
+
+When `limit` is given but not `n`, we set `n` such that `k - n = limit`
+so that the last digit `d[n-1]` is scaled by `10^(k-n) = 10^limit`.
+If such `n` is negative, we clip it to zero so that we will only get `k`.
+We are also limited by the supplied buffer. This limitation is used to print
+the number up to given number of fractional digits without knowing
+the correct `k` beforehand.
+
+We will call the mode of operation requiring `n` as to the *exact* mode,
+and one requiring `limit` as to the *fixed* mode. The exact mode is a subset of
+the fixed mode: the sufficiently large last-digit limitation will eventually fill
+the supplied buffer and let the algorithm to return.
+
+# Implementation overview
+
+It is easy to get the floating point printing correct but slow (Russ Cox has
+[demonstrated](http://research.swtch.com/ftoa) how it's easy), or incorrect but
+fast (naïve division and modulo). But it is surprisingly hard to print
+floating point numbers correctly *and* efficiently.
+
+There are two classes of algorithms widely known to be correct.
+
+- The "Dragon" family of algorithm is first described by Guy L. Steele Jr. and
+  Jon L. White. They rely on the fixed-size big integer for their correctness.
+  A slight improvement was found later, which is posthumously described by
+  Robert G. Burger and R. Kent Dybvig. David Gay's `dtoa.c` routine is
+  a popular implementation of this strategy.
+
+- The "Grisu" family of algorithm is first described by Florian Loitsch.
+  They use very cheap integer-only procedure to determine the close-to-correct
+  representation which is at least guaranteed to be shortest. The variant,
+  Grisu3, actively detects if the resulting representation is incorrect.
+
+We implement both algorithms with necessary tweaks to suit our requirements.
+In particular, published literatures are short of the actual implementation
+difficulties like how to avoid arithmetic overflows. Each implementation,
+available in `strategy::dragon` and `strategy::grisu` respectively,
+extensively describes all necessary justifications and many proofs for them.
+(It is still difficult to follow though. You have been warned.)
+
+Both implementations expose two public functions:
+
+- `format_shortest(decoded, buf)`, which always needs at least
+  `MAX_SIG_DIGITS` digits of buffer. Implements the shortest mode.
+
+- `format_exact(decoded, buf, limit)`, which accepts as small as
+  one digit of buffer. Implements exact and fixed modes.
+
+They try to fill the `u8` buffer with digits and returns the number of digits
+written and the exponent `k`. They are total for all finite `f32` and `f64`
+inputs (Grisu internally falls back to Dragon if possible).
+
+The rendered digits are formatted into the actual string form with
+four functions:
+
+- `to_shortest_str` prints the shortest representation, which can be padded by
+  zeroes to make *at least* given number of fractional digits.
+
+- `to_shortest_exp_str` prints the shortest representation, which can be
+  padded by zeroes when its exponent is in the specified ranges,
+  or can be printed in the exponential form such as `1.23e45`.
+
+- `to_exact_exp_str` prints the exact representation with given number of
+  digits in the exponential form.
+
+- `to_exact_fixed_str` prints the fixed representation with *exactly*
+  given number of fractional digits.
+
+They all return a slice of preallocated `Part` array, which corresponds to
+the individual part of strings: a fixed string, a part of rendered digits,
+a number of zeroes or a small (`u16`) number. The caller is expected to
+provide an enough buffer and `Part` array, and to assemble the final
+string from parts itself.
+
+All algorithms and formatting functions are accompanied by extensive tests
+in `coretest::num::flt2dec` module. It also shows how to use individual
+functions.
+
+*/
+
+// while this is extensively documented, this is in principle private which is
+// only made public for testing. do not expose us.
+#![doc(hidden)]
+
+use prelude::*;
+use i16;
+use num::Float;
+use slice::bytes;
+pub use self::decoder::{decode, DecodableFloat, FullDecoded, Decoded};
+
+pub mod estimator;
+pub mod bignum;
+pub mod decoder;
+
+/// Digit-generation algorithms.
+pub mod strategy {
+    pub mod dragon;
+    pub mod grisu;
+}
+
+/// The minimum size of buffer necessary for the shortest mode.
+///
+/// It is a bit non-trivial to derive, but this is one plus the maximal number of
+/// significant decimal digits from formatting algorithms with the shortest result.
+/// The exact formula is `ceil(# bits in mantissa * log_10 2 + 1)`.
+pub const MAX_SIG_DIGITS: usize = 17;
+
+/// When `d[..n]` contains decimal digits, increase the last digit and propagate carry.
+/// Returns a next digit when it causes the length change.
+#[doc(hidden)]
+pub fn round_up(d: &mut [u8], n: usize) -> Option<u8> {
+    match d[..n].iter().rposition(|&c| c != b'9') {
+        Some(i) => { // d[i+1..n] is all nines
+            d[i] += 1;
+            for j in i+1..n { d[j] = b'0'; }
+            None
+        }
+        None if n > 0 => { // 999..999 rounds to 1000..000 with an increased exponent
+            d[0] = b'1';
+            for j in 1..n { d[j] = b'0'; }
+            Some(b'0')
+        }
+        None => { // an empty buffer rounds up (a bit strange but reasonable)
+            Some(b'1')
+        }
+    }
+}
+
+/// Formatted parts.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Part<'a> {
+    /// Given number of zero digits.
+    Zero(usize),
+    /// A literal number up to 5 digits.
+    Num(u16),
+    /// A verbatim copy of given bytes.
+    Copy(&'a [u8]),
+}
+
+impl<'a> Part<'a> {
+    /// Returns the exact byte length of given part.
+    pub fn len(&self) -> usize {
+        match *self {
+            Part::Zero(nzeroes) => nzeroes,
+            Part::Num(v) => if v < 1_000 { if v < 10 { 1 } else if v < 100 { 2 } else { 3 } }
+                            else { if v < 10_000 { 4 } else { 5 } },
+            Part::Copy(buf) => buf.len(),
+        }
+    }
+
+    /// Writes a part into the supplied buffer.
+    /// Returns the number of written bytes, or `None` if the buffer is not enough.
+    /// (It may still leave partially written bytes in the buffer; do not rely on that.)
+    pub fn write(&self, out: &mut [u8]) -> Option<usize> {
+        let len = self.len();
+        if out.len() >= len {
+            match *self {
+                Part::Zero(nzeroes) => {
+                    for c in &mut out[..nzeroes] { *c = b'0'; }
+                }
+                Part::Num(mut v) => {
+                    for c in out[..len].iter_mut().rev() {
+                        *c = b'0' + (v % 10) as u8;
+                        v /= 10;
+                    }
+                }
+                Part::Copy(buf) => {
+                    bytes::copy_memory(buf, out);
+                }
+            }
+            Some(len)
+        } else {
+            None
+        }
+    }
+}
+
+/// Formatted result containing one or more parts.
+/// This can be written to the byte buffer or converted to the allocated string.
+#[derive(Clone)]
+pub struct Formatted<'a> {
+    /// A byte slice representing a sign, either `""`, `"-"` or `"+"`.
+    pub sign: &'static [u8],
+    /// Formatted parts to be rendered after a sign and optional zero padding.
+    pub parts: &'a [Part<'a>],
+}
+
+impl<'a> Formatted<'a> {
+    /// Returns the exact byte length of combined formatted result.
+    pub fn len(&self) -> usize {
+        let mut len = self.sign.len();
+        for part in self.parts {
+            len += part.len();
+        }
+        len
+    }
+
+    /// Writes all formatted parts into the supplied buffer.
+    /// Returns the number of written bytes, or `None` if the buffer is not enough.
+    /// (It may still leave partially written bytes in the buffer; do not rely on that.)
+    pub fn write(&self, out: &mut [u8]) -> Option<usize> {
+        if out.len() < self.sign.len() { return None; }
+        bytes::copy_memory(self.sign, out);
+
+        let mut written = self.sign.len();
+        for part in self.parts {
+            match part.write(&mut out[written..]) {
+                Some(len) => { written += len; }
+                None => { return None; }
+            }
+        }
+        Some(written)
+    }
+}
+
+/// Formats given decimal digits `0.<...buf...> * 10^exp` into the decimal form
+/// with at least given number of fractional digits. The result is stored to
+/// the supplied parts array and a slice of written parts is returned.
+///
+/// `frac_digits` can be less than the number of actual fractional digits in `buf`;
+/// it will be ignored and full digits will be printed. It is only used to print
+/// additional zeroes after rendered digits. Thus `frac_digits` of 0 means that
+/// it will only print given digits and nothing else.
+fn digits_to_dec_str<'a>(buf: &'a [u8], exp: i16, frac_digits: usize,
+                         parts: &'a mut [Part<'a>]) -> &'a [Part<'a>] {
+    assert!(!buf.is_empty());
+    assert!(buf[0] > b'0');
+    assert!(parts.len() >= 4);
+
+    // if there is the restriction on the last digit position, `buf` is assumed to be
+    // left-padded with the virtual zeroes. the number of virtual zeroes, `nzeroes`,
+    // equals to `max(0, exp + frag_digits - buf.len())`, so that the position of
+    // the last digit `exp - buf.len() - nzeroes` is no more than `-frac_digits`:
+    //
+    //                       |<-virtual->|
+    //       |<---- buf ---->|  zeroes   |     exp
+    //    0. 1 2 3 4 5 6 7 8 9 _ _ _ _ _ _ x 10
+    //    |                  |           |
+    // 10^exp    10^(exp-buf.len())   10^(exp-buf.len()-nzeroes)
+    //
+    // `nzeroes` is individually calculated for each case in order to avoid overflow.
+
+    if exp <= 0 {
+        // the decimal point is before rendered digits: [0.][000...000][1234][____]
+        let minus_exp = -(exp as i32) as usize;
+        parts[0] = Part::Copy(b"0.");
+        parts[1] = Part::Zero(minus_exp);
+        parts[2] = Part::Copy(buf);
+        if frac_digits > buf.len() && frac_digits - buf.len() > minus_exp {
+            parts[3] = Part::Zero((frac_digits - buf.len()) - minus_exp);
+            &parts[..4]
+        } else {
+            &parts[..3]
+        }
+    } else {
+        let exp = exp as usize;
+        if exp < buf.len() {
+            // the decimal point is inside rendered digits: [12][.][34][____]
+            parts[0] = Part::Copy(&buf[..exp]);
+            parts[1] = Part::Copy(b".");
+            parts[2] = Part::Copy(&buf[exp..]);
+            if frac_digits > buf.len() - exp {
+                parts[3] = Part::Zero(frac_digits - (buf.len() - exp));
+                &parts[..4]
+            } else {
+                &parts[..3]
+            }
+        } else {
+            // the decimal point is after rendered digits: [1234][____0000] or [1234][__][.][__].
+            parts[0] = Part::Copy(buf);
+            parts[1] = Part::Zero(exp - buf.len());
+            if frac_digits > 0 {
+                parts[2] = Part::Copy(b".");
+                parts[3] = Part::Zero(frac_digits);
+                &parts[..4]
+            } else {
+                &parts[..2]
+            }
+        }
+    }
+}
+
+/// Formats given decimal digits `0.<...buf...> * 10^exp` into the exponential form
+/// with at least given number of significant digits. When `upper` is true,
+/// the exponent will be prefixed by `E`; otherwise that's `e`. The result is
+/// stored to the supplied parts array and a slice of written parts is returned.
+///
+/// `min_digits` can be less than the number of actual significant digits in `buf`;
+/// it will be ignored and full digits will be printed. It is only used to print
+/// additional zeroes after rendered digits. Thus `min_digits` of 0 means that
+/// it will only print given digits and nothing else.
+fn digits_to_exp_str<'a>(buf: &'a [u8], exp: i16, min_ndigits: usize, upper: bool,
+                         parts: &'a mut [Part<'a>]) -> &'a [Part<'a>] {
+    assert!(!buf.is_empty());
+    assert!(buf[0] > b'0');
+    assert!(parts.len() >= 6);
+
+    let mut n = 0;
+
+    parts[n] = Part::Copy(&buf[..1]);
+    n += 1;
+
+    if buf.len() > 1 || min_ndigits > 1 {
+        parts[n] = Part::Copy(b".");
+        parts[n + 1] = Part::Copy(&buf[1..]);
+        n += 2;
+        if min_ndigits > buf.len() {
+            parts[n] = Part::Zero(min_ndigits - buf.len());
+            n += 1;
+        }
+    }
+
+    // 0.1234 x 10^exp = 1.234 x 10^(exp-1)
+    let exp = exp as i32 - 1; // avoid underflow when exp is i16::MIN
+    if exp < 0 {
+        parts[n] = Part::Copy(if upper { b"E-" } else { b"e-" });
+        parts[n + 1] = Part::Num(-exp as u16);
+    } else {
+        parts[n] = Part::Copy(if upper { b"E" } else { b"e" });
+        parts[n + 1] = Part::Num(exp as u16);
+    }
+    &parts[..n + 2]
+}
+
+/// Sign formatting options.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Sign {
+    /// Prints `-` only for the negative non-zero values.
+    Minus,        // -inf -1  0  0  1  inf nan
+    /// Prints `-` only for any negative values (including the negative zero).
+    MinusRaw,     // -inf -1  0  0  1  inf nan
+    /// Prints `-` for the negative non-zero values, or `+` otherwise.
+    MinusPlus,    // -inf -1 +0 +0 +1 +inf nan
+    /// Prints `-` for any negative values (including the negative zero), or `+` otherwise.
+    MinusPlusRaw, // -inf -1 -0 +0 +1 +inf nan
+}
+
+/// Returns the static byte string corresponding to the sign to be formatted.
+/// It can be either `b""`, `b"+"` or `b"-"`.
+fn determine_sign(sign: Sign, decoded: &FullDecoded, negative: bool) -> &'static [u8] {
+    match (*decoded, sign) {
+        (FullDecoded::Nan, _) => b"",
+        (FullDecoded::Zero, Sign::Minus) => b"",
+        (FullDecoded::Zero, Sign::MinusRaw) => if negative { b"-" } else { b"" },
+        (FullDecoded::Zero, Sign::MinusPlus) => b"+",
+        (FullDecoded::Zero, Sign::MinusPlusRaw) => if negative { b"-" } else { b"+" },
+        (_, Sign::Minus) | (_, Sign::MinusRaw) => if negative { b"-" } else { b"" },
+        (_, Sign::MinusPlus) | (_, Sign::MinusPlusRaw) => if negative { b"-" } else { b"+" },
+    }
+}
+
+/// Formats given floating point number into the decimal form with at least
+/// given number of fractional digits. The result is stored to the supplied parts
+/// array while utilizing given byte buffer as a scratch. `upper` is only used to
+/// determine the case of non-finite values, i.e. `inf` and `nan`. The first part
+/// to be rendered is always a `Part::Sign` (which can be an empty string
+/// if no sign is rendered).
+///
+/// `format_shortest` should be the underlying digit-generation function.
+/// You probably would want `strategy::grisu::format_shortest` for this.
+///
+/// `frac_digits` can be less than the number of actual fractional digits in `v`;
+/// it will be ignored and full digits will be printed. It is only used to print
+/// additional zeroes after rendered digits. Thus `frac_digits` of 0 means that
+/// it will only print given digits and nothing else.
+///
+/// The byte buffer should be at least `MAX_SIG_DIGITS` bytes long.
+/// There should be at least 5 parts available, due to the worst case like
+/// `[+][0.][0000][45][0000]` with `frac_digits = 10`.
+pub fn to_shortest_str<'a, T, F>(mut format_shortest: F, v: T,
+                                 sign: Sign, frac_digits: usize, upper: bool,
+                                 buf: &'a mut [u8], parts: &'a mut [Part<'a>]) -> Formatted<'a>
+        where T: DecodableFloat, F: FnMut(&Decoded, &mut [u8]) -> (usize, i16) {
+    assert!(parts.len() >= 4);
+    assert!(buf.len() >= MAX_SIG_DIGITS);
+
+    let (negative, full_decoded) = decode(v);
+    let sign = determine_sign(sign, &full_decoded, negative);
+    match full_decoded {
+        FullDecoded::Nan => {
+            parts[0] = Part::Copy(if upper { b"NAN" } else { b"nan" });
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Infinite => {
+            parts[0] = Part::Copy(if upper { b"INF" } else { b"inf" });
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Zero => {
+            if frac_digits > 0 { // [0.][0000]
+                parts[0] = Part::Copy(b"0.");
+                parts[1] = Part::Zero(frac_digits);
+                Formatted { sign: sign, parts: &parts[..2] }
+            } else {
+                parts[0] = Part::Copy(b"0");
+                Formatted { sign: sign, parts: &parts[..1] }
+            }
+        }
+        FullDecoded::Finite(ref decoded) => {
+            let (len, exp) = format_shortest(decoded, buf);
+            Formatted { sign: sign,
+                        parts: digits_to_dec_str(&buf[..len], exp, frac_digits, parts) }
+        }
+    }
+}
+
+/// Formats given floating point number into the decimal form or
+/// the exponential form, depending on the resulting exponent. The result is
+/// stored to the supplied parts array while utilizing given byte buffer
+/// as a scratch. `upper` is used to determine the case of non-finite values
+/// (`inf` and `nan`) or the case of the exponent prefix (`e` or `E`).
+/// The first part to be rendered is always a `Part::Sign` (which can be
+/// an empty string if no sign is rendered).
+///
+/// `format_shortest` should be the underlying digit-generation function.
+/// You probably would want `strategy::grisu::format_shortest` for this.
+///
+/// The `dec_bounds` is a tuple `(lo, hi)` such that the number is formatted
+/// as decimal only when `10^lo <= V < 10^hi`. Note that this is the *apparant* `V`
+/// instead of the actual `v`! Thus any printed exponent in the exponential form
+/// cannot be in this range, avoiding any confusion.
+///
+/// The byte buffer should be at least `MAX_SIG_DIGITS` bytes long.
+/// There should be at least 7 parts available, due to the worst case like
+/// `[+][1][.][2345][e][-][67]`.
+pub fn to_shortest_exp_str<'a, T, F>(mut format_shortest: F, v: T,
+                                     sign: Sign, dec_bounds: (i16, i16), upper: bool,
+                                     buf: &'a mut [u8], parts: &'a mut [Part<'a>]) -> Formatted<'a>
+        where T: DecodableFloat, F: FnMut(&Decoded, &mut [u8]) -> (usize, i16) {
+    assert!(parts.len() >= 6);
+    assert!(buf.len() >= MAX_SIG_DIGITS);
+    assert!(dec_bounds.0 <= dec_bounds.1);
+
+    let (negative, full_decoded) = decode(v);
+    let sign = determine_sign(sign, &full_decoded, negative);
+    match full_decoded {
+        FullDecoded::Nan => {
+            parts[0] = Part::Copy(if upper { b"NAN" } else { b"nan" });
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Infinite => {
+            parts[0] = Part::Copy(if upper { b"INF" } else { b"inf" });
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Zero => {
+            parts[0] = if dec_bounds.0 <= 0 && 0 < dec_bounds.1 {
+                Part::Copy(b"0")
+            } else {
+                Part::Copy(if upper { b"0E0" } else { b"0e0" })
+            };
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Finite(ref decoded) => {
+            let (len, exp) = format_shortest(decoded, buf);
+            let vis_exp = exp as i32 - 1;
+            let parts = if dec_bounds.0 as i32 <= vis_exp && vis_exp < dec_bounds.1 as i32 {
+                digits_to_dec_str(&buf[..len], exp, 0, parts)
+            } else {
+                digits_to_exp_str(&buf[..len], exp, 0, upper, parts)
+            };
+            Formatted { sign: sign, parts: parts }
+        }
+    }
+}
+
+/// Returns rather crude approximation (upper bound) for the maximum buffer size
+/// calculated from the given decoded exponent.
+///
+/// The exact limit is:
+///
+/// - when `exp < 0`, the maximum length is `ceil(log_10 (5^-exp * (2^64 - 1)))`.
+/// - when `exp >= 0`, the maximum length is `ceil(log_10 (2^exp * (2^64 - 1)))`.
+///
+/// `ceil(log_10 (x^exp * (2^64 - 1)))` is less than `ceil(log_10 (2^64 - 1)) +
+/// ceil(exp * log_10 x)`, which is in turn less than `20 + (1 + exp * log_10 x)`.
+/// We use the facts that `log_10 2 < 5/16` and `log_10 5 < 12/16`, which is
+/// enough for our purposes.
+///
+/// Why do we need this? `format_exact` functions will fill the entire buffer
+/// unless limited by the last digit restriction, but it is possible that
+/// the number of digits requested is ridiculously large (say, 30,000 digits).
+/// The vast majority of buffer will be filled with zeroes, so we don't want to
+/// allocate all the buffer beforehand. Consequently, for any given arguments,
+/// 826 bytes of buffer should be sufficient for `f64`. Compare this with
+/// the actual number for the worst case: 770 bytes (when `exp = -1074`).
+fn estimate_max_buf_len(exp: i16) -> usize {
+    21 + ((if exp < 0 { -12 } else { 5 } * exp as i32) as usize >> 4)
+}
+
+/// Formats given floating point number into the exponential form with
+/// exactly given number of significant digits. The result is stored to
+/// the supplied parts array while utilizing given byte buffer as a scratch.
+/// `upper` is used to determine the case of the exponent prefix (`e` or `E`).
+/// The first part to be rendered is always a `Part::Sign` (which can be
+/// an empty string if no sign is rendered).
+///
+/// `format_exact` should be the underlying digit-generation function.
+/// You probably would want `strategy::grisu::format_exact` for this.
+///
+/// The byte buffer should be at least `ndigits` bytes long unless `ndigits` is
+/// so large that only the fixed number of digits will be ever written.
+/// (The tipping point for `f64` is about 800, so 1000 bytes should be enough.)
+/// There should be at least 7 parts available, due to the worst case like
+/// `[+][1][.][2345][e][-][67]`.
+pub fn to_exact_exp_str<'a, T, F>(mut format_exact: F, v: T,
+                                  sign: Sign, ndigits: usize, upper: bool,
+                                  buf: &'a mut [u8], parts: &'a mut [Part<'a>]) -> Formatted<'a>
+        where T: DecodableFloat, F: FnMut(&Decoded, &mut [u8], i16) -> (usize, i16) {
+    assert!(parts.len() >= 6);
+    assert!(ndigits > 0);
+
+    let (negative, full_decoded) = decode(v);
+    let sign = determine_sign(sign, &full_decoded, negative);
+    match full_decoded {
+        FullDecoded::Nan => {
+            parts[0] = Part::Copy(if upper { b"NAN" } else { b"nan" });
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Infinite => {
+            parts[0] = Part::Copy(if upper { b"INF" } else { b"inf" });
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Zero => {
+            if ndigits > 1 { // [0.][0000][e0]
+                parts[0] = Part::Copy(b"0.");
+                parts[1] = Part::Zero(ndigits - 1);
+                parts[2] = Part::Copy(if upper { b"E0" } else { b"e0" });
+                Formatted { sign: sign, parts: &parts[..3] }
+            } else {
+                parts[0] = Part::Copy(if upper { b"0E0" } else { b"0e0" });
+                Formatted { sign: sign, parts: &parts[..1] }
+            }
+        }
+        FullDecoded::Finite(ref decoded) => {
+            let maxlen = estimate_max_buf_len(decoded.exp);
+            assert!(buf.len() >= ndigits || buf.len() >= maxlen);
+
+            let trunc = if ndigits < maxlen { ndigits } else { maxlen };
+            let (len, exp) = format_exact(decoded, &mut buf[..trunc], i16::MIN);
+            Formatted { sign: sign,
+                        parts: digits_to_exp_str(&buf[..len], exp, ndigits, upper, parts) }
+        }
+    }
+}
+
+/// Formats given floating point number into the decimal form with exactly
+/// given number of fractional digits. The result is stored to the supplied parts
+/// array while utilizing given byte buffer as a scratch. `upper` is only used to
+/// determine the case of non-finite values, i.e. `inf` and `nan`. The first part
+/// to be rendered is always a `Part::Sign` (which can be an empty string
+/// if no sign is rendered).
+///
+/// `format_exact` should be the underlying digit-generation function.
+/// You probably would want `strategy::grisu::format_exact` for this.
+///
+/// The byte buffer should be enough for the output unless `frac_digits` is
+/// so large that only the fixed number of digits will be ever written.
+/// (The tipping point for `f64` is about 800, and 1000 bytes should be enough.)
+/// There should be at least 5 parts available, due to the worst case like
+/// `[+][0.][0000][45][0000]` with `frac_digits = 10`.
+pub fn to_exact_fixed_str<'a, T, F>(mut format_exact: F, v: T,
+                                    sign: Sign, frac_digits: usize, upper: bool,
+                                    buf: &'a mut [u8], parts: &'a mut [Part<'a>]) -> Formatted<'a>
+        where T: DecodableFloat, F: FnMut(&Decoded, &mut [u8], i16) -> (usize, i16) {
+    assert!(parts.len() >= 4);
+
+    let (negative, full_decoded) = decode(v);
+    let sign = determine_sign(sign, &full_decoded, negative);
+    match full_decoded {
+        FullDecoded::Nan => {
+            parts[0] = Part::Copy(if upper { b"NAN" } else { b"nan" });
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Infinite => {
+            parts[0] = Part::Copy(if upper { b"INF" } else { b"inf" });
+            Formatted { sign: sign, parts: &parts[..1] }
+        }
+        FullDecoded::Zero => {
+            if frac_digits > 0 { // [0.][0000]
+                parts[0] = Part::Copy(b"0.");
+                parts[1] = Part::Zero(frac_digits);
+                Formatted { sign: sign, parts: &parts[..2] }
+            } else {
+                parts[0] = Part::Copy(b"0");
+                Formatted { sign: sign, parts: &parts[..1] }
+            }
+        }
+        FullDecoded::Finite(ref decoded) => {
+            let maxlen = estimate_max_buf_len(decoded.exp);
+            assert!(buf.len() >= maxlen);
+
+            // it *is* possible that `frac_digits` is ridiculously large.
+            // `format_exact` will end rendering digits much earlier in this case,
+            // because we are strictly limited by `maxlen`.
+            let limit = if frac_digits < 0x8000 { -(frac_digits as i16) } else { i16::MIN };
+            let (len, exp) = format_exact(decoded, &mut buf[..maxlen], limit);
+            if exp <= limit {
+                // `format_exact` always returns at least one digit even though the restriction
+                // hasn't been met, so we catch this condition and treats as like zeroes.
+                // this does not include the case that the restriction has been met
+                // only after the final rounding-up; it's a regular case with `exp = limit + 1`.
+                debug_assert_eq!(len, 0);
+                if frac_digits > 0 { // [0.][0000]
+                    parts[0] = Part::Copy(b"0.");
+                    parts[1] = Part::Zero(frac_digits);
+                    Formatted { sign: sign, parts: &parts[..2] }
+                } else {
+                    parts[0] = Part::Copy(b"0");
+                    Formatted { sign: sign, parts: &parts[..1] }
+                }
+            } else {
+                Formatted { sign: sign,
+                            parts: digits_to_dec_str(&buf[..len], exp, frac_digits, parts) }
+            }
+        }
+    }
+}
+
diff --git a/src/libcore/num/flt2dec/strategy/dragon.rs b/src/libcore/num/flt2dec/strategy/dragon.rs
new file mode 100644
index 00000000000..bd735594e82
--- /dev/null
+++ b/src/libcore/num/flt2dec/strategy/dragon.rs
@@ -0,0 +1,327 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+/*!
+Almost direct (but slightly optimized) Rust translation of Figure 3 of [1].
+
+[1] Burger, R. G. and Dybvig, R. K. 1996. Printing floating-point numbers
+    quickly and accurately. SIGPLAN Not. 31, 5 (May. 1996), 108-116.
+*/
+
+use prelude::*;
+use num::Float;
+use cmp::Ordering;
+
+use num::flt2dec::{Decoded, MAX_SIG_DIGITS, round_up};
+use num::flt2dec::estimator::estimate_scaling_factor;
+use num::flt2dec::bignum::Digit32 as Digit;
+use num::flt2dec::bignum::Big32x36 as Big;
+
+// FIXME(#22540) const ref to static array seems to ICE
+static POW10: [Digit; 10] = [1, 10, 100, 1000, 10000, 100000,
+                             1000000, 10000000, 100000000, 1000000000];
+static TWOPOW10: [Digit; 10] = [2, 20, 200, 2000, 20000, 200000,
+                                2000000, 20000000, 200000000, 2000000000];
+
+// precalculated arrays of `Digit`s for 10^(2^n)
+static POW10TO16: [Digit; 2] = [0x6fc10000, 0x2386f2];
+static POW10TO32: [Digit; 4] = [0, 0x85acef81, 0x2d6d415b, 0x4ee];
+static POW10TO64: [Digit; 7] = [0, 0, 0xbf6a1f01, 0x6e38ed64, 0xdaa797ed, 0xe93ff9f4, 0x184f03];
+static POW10TO128: [Digit; 14] =
+    [0, 0, 0, 0, 0x2e953e01, 0x3df9909, 0xf1538fd, 0x2374e42f, 0xd3cff5ec, 0xc404dc08,
+     0xbccdb0da, 0xa6337f19, 0xe91f2603, 0x24e];
+static POW10TO256: [Digit; 27] =
+    [0, 0, 0, 0, 0, 0, 0, 0, 0x982e7c01, 0xbed3875b, 0xd8d99f72, 0x12152f87, 0x6bde50c6,
+     0xcf4a6e70, 0xd595d80f, 0x26b2716e, 0xadc666b0, 0x1d153624, 0x3c42d35a, 0x63ff540e,
+     0xcc5573c0, 0x65f9ef17, 0x55bc28f2, 0x80dcc7f7, 0xf46eeddc, 0x5fdcefce, 0x553f7];
+
+#[doc(hidden)]
+pub fn mul_pow10<'a>(x: &'a mut Big, n: usize) -> &'a mut Big {
+    debug_assert!(n < 512);
+    if n &   7 != 0 { x.mul_small(POW10[n & 7]); }
+    if n &   8 != 0 { x.mul_small(POW10[8]); }
+    if n &  16 != 0 { x.mul_digits(&POW10TO16); }
+    if n &  32 != 0 { x.mul_digits(&POW10TO32); }
+    if n &  64 != 0 { x.mul_digits(&POW10TO64); }
+    if n & 128 != 0 { x.mul_digits(&POW10TO128); }
+    if n & 256 != 0 { x.mul_digits(&POW10TO256); }
+    x
+}
+
+fn div_2pow10<'a>(x: &'a mut Big, mut n: usize) -> &'a mut Big {
+    let largest = POW10.len() - 1;
+    while n > largest {
+        x.div_rem_small(POW10[largest]);
+        n -= largest;
+    }
+    x.div_rem_small(TWOPOW10[n]);
+    x
+}
+
+// only usable when `x < 16 * scale`; `scaleN` should be `scale.mul_small(N)`
+fn div_rem_upto_16<'a>(x: &'a mut Big, scale: &Big,
+                       scale2: &Big, scale4: &Big, scale8: &Big) -> (u8, &'a mut Big) {
+    let mut d = 0;
+    if *x >= *scale8 { x.sub(scale8); d += 8; }
+    if *x >= *scale4 { x.sub(scale4); d += 4; }
+    if *x >= *scale2 { x.sub(scale2); d += 2; }
+    if *x >= *scale  { x.sub(scale);  d += 1; }
+    debug_assert!(*x < *scale);
+    (d, x)
+}
+
+/// The shortest mode implementation for Dragon.
+pub fn format_shortest(d: &Decoded, buf: &mut [u8]) -> (/*#digits*/ usize, /*exp*/ i16) {
+    // the number `v` to format is known to be:
+    // - equal to `mant * 2^exp`;
+    // - preceded by `(mant - 2 * minus) * 2^exp` in the original type; and
+    // - followed by `(mant + 2 * plus) * 2^exp` in the original type.
+    //
+    // obviously, `minus` and `plus` cannot be zero. (for infinities, we use out-of-range values.)
+    // also we assume that at least one digit is generated, i.e. `mant` cannot be zero too.
+    //
+    // this also means that any number between `low = (mant - minus) * 2^exp` and
+    // `high = (mant + plus) * 2^exp` will map to this exact floating point number,
+    // with bounds included when the original mantissa was even (i.e. `!mant_was_odd`).
+
+    assert!(d.mant > 0);
+    assert!(d.minus > 0);
+    assert!(d.plus > 0);
+    assert!(d.mant.checked_add(d.plus).is_some());
+    assert!(d.mant.checked_sub(d.minus).is_some());
+    assert!(buf.len() >= MAX_SIG_DIGITS);
+
+    // `a.cmp(&b) < rounding` is `if d.inclusive {a <= b} else {a < b}`
+    let rounding = if d.inclusive {Ordering::Greater} else {Ordering::Equal};
+
+    // estimate `k_0` from original inputs satisfying `10^(k_0-1) < high <= 10^(k_0+1)`.
+    // the tight bound `k` satisfying `10^(k-1) < high <= 10^k` is calculated later.
+    let mut k = estimate_scaling_factor(d.mant + d.plus, d.exp);
+
+    // convert `{mant, plus, minus} * 2^exp` into the fractional form so that:
+    // - `v = mant / scale`
+    // - `low = (mant - minus) / scale`
+    // - `high = (mant + plus) / scale`
+    let mut mant = Big::from_u64(d.mant);
+    let mut minus = Big::from_u64(d.minus);
+    let mut plus = Big::from_u64(d.plus);
+    let mut scale = Big::from_small(1);
+    if d.exp < 0 {
+        scale.mul_pow2(-d.exp as usize);
+    } else {
+        mant.mul_pow2(d.exp as usize);
+        minus.mul_pow2(d.exp as usize);
+        plus.mul_pow2(d.exp as usize);
+    }
+
+    // divide `mant` by `10^k`. now `scale / 10 < mant + plus <= scale * 10`.
+    if k >= 0 {
+        mul_pow10(&mut scale, k as usize);
+    } else {
+        mul_pow10(&mut mant, -k as usize);
+        mul_pow10(&mut minus, -k as usize);
+        mul_pow10(&mut plus, -k as usize);
+    }
+
+    // fixup when `mant + plus > scale` (or `>=`).
+    // we are not actually modifying `scale`, since we can skip the initial multiplication instead.
+    // now `scale < mant + plus <= scale * 10` and we are ready to generate digits.
+    //
+    // note that `d[0]` *can* be zero, when `scale - plus < mant < scale`.
+    // in this case rounding-up condition (`up` below) will be triggered immediately.
+    if scale.cmp(mant.clone().add(&plus)) < rounding {
+        // equivalent to scaling `scale` by 10
+        k += 1;
+    } else {
+        mant.mul_small(10);
+        minus.mul_small(10);
+        plus.mul_small(10);
+    }
+
+    // cache `(2, 4, 8) * scale` for digit generation.
+    let mut scale2 = scale.clone(); scale2.mul_pow2(1);
+    let mut scale4 = scale.clone(); scale4.mul_pow2(2);
+    let mut scale8 = scale.clone(); scale8.mul_pow2(3);
+
+    let mut down;
+    let mut up;
+    let mut i = 0;
+    loop {
+        // invariants, where `d[0..n-1]` are digits generated so far:
+        // - `v = mant / scale * 10^(k-n-1) + d[0..n-1] * 10^(k-n)`
+        // - `v - low = minus / scale * 10^(k-n-1)`
+        // - `high - v = plus / scale * 10^(k-n-1)`
+        // - `(mant + plus) / scale <= 10` (thus `mant / scale < 10`)
+        // where `d[i..j]` is a shorthand for `d[i] * 10^(j-i) + ... + d[j-1] * 10 + d[j]`.
+
+        // generate one digit: `d[n] = floor(mant / scale) < 10`.
+        let (d, _) = div_rem_upto_16(&mut mant, &scale, &scale2, &scale4, &scale8);
+        debug_assert!(d < 10);
+        buf[i] = b'0' + d;
+        i += 1;
+
+        // this is a simplified description of the modified Dragon algorithm.
+        // many intermediate derivations and completeness arguments are omitted for convenience.
+        //
+        // start with modified invariants, as we've updated `n`:
+        // - `v = mant / scale * 10^(k-n) + d[0..n-1] * 10^(k-n)`
+        // - `v - low = minus / scale * 10^(k-n)`
+        // - `high - v = plus / scale * 10^(k-n)`
+        //
+        // assume that `d[0..n-1]` is the shortest representation between `low` and `high`,
+        // i.e. `d[0..n-1]` satisfies both of the following but `d[0..n-2]` doesn't:
+        // - `low < d[0..n-1] * 10^(k-n) < high` (bijectivity: digits round to `v`); and
+        // - `abs(v / 10^(k-n) - d[0..n-1]) <= 1/2` (the last digit is correct).
+        //
+        // the second condition simplifies to `2 * mant <= scale`.
+        // solving invariants in terms of `mant`, `low` and `high` yields
+        // a simpler version of the first condition: `-plus < mant < minus`.
+        // since `-plus < 0 <= mant`, we have the correct shortest representation
+        // when `mant < minus` and `2 * mant <= scale`.
+        // (the former becomes `mant <= minus` when the original mantissa is even.)
+        //
+        // when the second doesn't hold (`2 * mant > scale`), we need to increase the last digit.
+        // this is enough for restoring that condition: we already know that
+        // the digit generation guarantees `0 <= v / 10^(k-n) - d[0..n-1] < 1`.
+        // in this case, the first condition becomes `-plus < mant - scale < minus`.
+        // since `mant < scale` after the generation, we have `scale < mant + plus`.
+        // (again, this becomes `scale <= mant + plus` when the original mantissa is even.)
+        //
+        // in short:
+        // - stop and round `down` (keep digits as is) when `mant < minus` (or `<=`).
+        // - stop and round `up` (increase the last digit) when `scale < mant + plus` (or `<=`).
+        // - keep generating otherwise.
+        down = mant.cmp(&minus) < rounding;
+        up = scale.cmp(mant.clone().add(&plus)) < rounding;
+        if down || up { break; } // we have the shortest representation, proceed to the rounding
+
+        // restore the invariants.
+        // this makes the algorithm always terminating: `minus` and `plus` always increases,
+        // but `mant` is clipped modulo `scale` and `scale` is fixed.
+        mant.mul_small(10);
+        minus.mul_small(10);
+        plus.mul_small(10);
+    }
+
+    // rounding up happens when
+    // i) only the rounding-up condition was triggered, or
+    // ii) both conditions were triggered and tie breaking prefers rounding up.
+    if up && (!down || *mant.mul_pow2(1) >= scale) {
+        // if rounding up changes the length, the exponent should also change.
+        // it seems that this condition is very hard to satisfy (possibly impossible),
+        // but we are just being safe and consistent here.
+        if let Some(c) = round_up(buf, i) {
+            buf[i] = c;
+            i += 1;
+            k += 1;
+        }
+    }
+
+    (i, k)
+}
+
+/// The exact and fixed mode implementation for Dragon.
+pub fn format_exact(d: &Decoded, buf: &mut [u8], limit: i16) -> (/*#digits*/ usize, /*exp*/ i16) {
+    assert!(d.mant > 0);
+    assert!(d.minus > 0);
+    assert!(d.plus > 0);
+    assert!(d.mant.checked_add(d.plus).is_some());
+    assert!(d.mant.checked_sub(d.minus).is_some());
+
+    // estimate `k_0` from original inputs satisfying `10^(k_0-1) < v <= 10^(k_0+1)`.
+    let mut k = estimate_scaling_factor(d.mant, d.exp);
+
+    // `v = mant / scale`.
+    let mut mant = Big::from_u64(d.mant);
+    let mut scale = Big::from_small(1);
+    if d.exp < 0 {
+        scale.mul_pow2(-d.exp as usize);
+    } else {
+        mant.mul_pow2(d.exp as usize);
+    }
+
+    // divide `mant` by `10^k`. now `scale / 10 < mant <= scale * 10`.
+    if k >= 0 {
+        mul_pow10(&mut scale, k as usize);
+    } else {
+        mul_pow10(&mut mant, -k as usize);
+    }
+
+    // fixup when `mant + plus >= scale`, where `plus / scale = 10^-buf.len() / 2`.
+    // in order to keep the fixed-size bignum, we actually use `mant + floor(plus) >= scale`.
+    // we are not actually modifying `scale`, since we can skip the initial multiplication instead.
+    // again with the shortest algorithm, `d[0]` can be zero but will be eventually rounded up.
+    if *div_2pow10(&mut scale.clone(), buf.len()).add(&mant) >= scale {
+        // equivalent to scaling `scale` by 10
+        k += 1;
+    } else {
+        mant.mul_small(10);
+    }
+
+    // if we are working with the last-digit limitation, we need to shorten the buffer
+    // before the actual rendering in order to avoid double rounding.
+    // note that we have to enlarge the buffer again when rounding up happens!
+    let mut len = if k < limit {
+        // oops, we cannot even produce *one* digit.
+        // this is possible when, say, we've got something like 9.5 and it's being rounded to 10.
+        // we return an empty buffer, with an exception of the later rounding-up case
+        // which occurs when `k == limit` and has to produce exactly one digit.
+        0
+    } else if ((k as i32 - limit as i32) as usize) < buf.len() {
+        (k - limit) as usize
+    } else {
+        buf.len()
+    };
+
+    if len > 0 {
+        // cache `(2, 4, 8) * scale` for digit generation.
+        // (this can be expensive, so do not calculate them when the buffer is empty.)
+        let mut scale2 = scale.clone(); scale2.mul_pow2(1);
+        let mut scale4 = scale.clone(); scale4.mul_pow2(2);
+        let mut scale8 = scale.clone(); scale8.mul_pow2(3);
+
+        for i in 0..len {
+            if mant.is_zero() { // following digits are all zeroes, we stop here
+                // do *not* try to perform rounding! rather, fill remaining digits.
+                for c in &mut buf[i..len] { *c = b'0'; }
+                return (len, k);
+            }
+
+            let mut d = 0;
+            if mant >= scale8 { mant.sub(&scale8); d += 8; }
+            if mant >= scale4 { mant.sub(&scale4); d += 4; }
+            if mant >= scale2 { mant.sub(&scale2); d += 2; }
+            if mant >= scale  { mant.sub(&scale);  d += 1; }
+            debug_assert!(mant < scale);
+            debug_assert!(d < 10);
+            buf[i] = b'0' + d;
+            mant.mul_small(10);
+        }
+    }
+
+    // rounding up if we stop in the middle of digits
+    if mant >= *scale.mul_small(5) {
+        // if rounding up changes the length, the exponent should also change.
+        // but we've been requested a fixed number of digits, so do not alter the buffer...
+        if let Some(c) = round_up(buf, len) {
+            // ...unless we've been requested the fixed precision instead.
+            // we also need to check that, if the original buffer was empty,
+            // the additional digit can only be added when `k == limit` (edge case).
+            k += 1;
+            if k > limit && len < buf.len() {
+                buf[len] = c;
+                len += 1;
+            }
+        }
+    }
+
+    (len, k)
+}
+
diff --git a/src/libcore/num/flt2dec/strategy/grisu.rs b/src/libcore/num/flt2dec/strategy/grisu.rs
new file mode 100644
index 00000000000..220811e9985
--- /dev/null
+++ b/src/libcore/num/flt2dec/strategy/grisu.rs
@@ -0,0 +1,749 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+/*!
+Rust adaptation of Grisu3 algorithm described in [1]. It uses about
+1KB of precomputed table, and in turn, it's very quick for most inputs.
+
+[1] Florian Loitsch. 2010. Printing floating-point numbers quickly and
+    accurately with integers. SIGPLAN Not. 45, 6 (June 2010), 233-243.
+*/
+
+use prelude::*;
+use num::Float;
+
+use num::flt2dec::{Decoded, MAX_SIG_DIGITS, round_up};
+
+/// A custom 64-bit floating point type, representing `f * 2^e`.
+#[derive(Copy, Clone, Debug)]
+#[doc(hidden)]
+pub struct Fp {
+    /// The integer mantissa.
+    pub f: u64,
+    /// The exponent in base 2.
+    pub e: i16,
+}
+
+impl Fp {
+    /// Returns a correctly rounded product of itself and `other`.
+    fn mul(&self, other: &Fp) -> Fp {
+        const MASK: u64 = 0xffffffff;
+        let a = self.f >> 32;
+        let b = self.f & MASK;
+        let c = other.f >> 32;
+        let d = other.f & MASK;
+        let ac = a * c;
+        let bc = b * c;
+        let ad = a * d;
+        let bd = b * d;
+        let tmp = (bd >> 32) + (ad & MASK) + (bc & MASK) + (1 << 31) /* round */;
+        let f = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32);
+        let e = self.e + other.e + 64;
+        Fp { f: f, e: e }
+    }
+
+    /// Normalizes itself so that the resulting mantissa is at least `2^63`.
+    fn normalize(&self) -> Fp {
+        let mut f = self.f;
+        let mut e = self.e;
+        if f >> (64 - 32) == 0 { f <<= 32; e -= 32; }
+        if f >> (64 - 16) == 0 { f <<= 16; e -= 16; }
+        if f >> (64 -  8) == 0 { f <<=  8; e -=  8; }
+        if f >> (64 -  4) == 0 { f <<=  4; e -=  4; }
+        if f >> (64 -  2) == 0 { f <<=  2; e -=  2; }
+        if f >> (64 -  1) == 0 { f <<=  1; e -=  1; }
+        debug_assert!(f >= (1 >> 63));
+        Fp { f: f, e: e }
+    }
+
+    /// Normalizes itself to have the shared exponent.
+    /// It can only decrease the exponent (and thus increase the mantissa).
+    fn normalize_to(&self, e: i16) -> Fp {
+        let edelta = self.e - e;
+        assert!(edelta >= 0);
+        let edelta = edelta as usize;
+        assert_eq!(self.f << edelta >> edelta, self.f);
+        Fp { f: self.f << edelta, e: e }
+    }
+}
+
+// see the comments in `format_shortest_opt` for the rationale.
+#[doc(hidden)] pub const ALPHA: i16 = -60;
+#[doc(hidden)] pub const GAMMA: i16 = -32;
+
+/*
+# the following Python code generates this table:
+for i in xrange(-308, 333, 8):
+    if i >= 0: f = 10**i; e = 0
+    else: f = 2**(80-4*i) // 10**-i; e = 4 * i - 80
+    l = f.bit_length()
+    f = ((f << 64 >> (l-1)) + 1) >> 1; e += l - 64
+    print '    (%#018x, %5d, %4d),' % (f, e, i)
+*/
+// FIXME(#22540) const ref to static array seems to ICE
+#[doc(hidden)]
+pub static CACHED_POW10: [(u64, i16, i16); 81] = [ // (f, e, k)
+    (0xe61acf033d1a45df, -1087, -308),
+    (0xab70fe17c79ac6ca, -1060, -300),
+    (0xff77b1fcbebcdc4f, -1034, -292),
+    (0xbe5691ef416bd60c, -1007, -284),
+    (0x8dd01fad907ffc3c,  -980, -276),
+    (0xd3515c2831559a83,  -954, -268),
+    (0x9d71ac8fada6c9b5,  -927, -260),
+    (0xea9c227723ee8bcb,  -901, -252),
+    (0xaecc49914078536d,  -874, -244),
+    (0x823c12795db6ce57,  -847, -236),
+    (0xc21094364dfb5637,  -821, -228),
+    (0x9096ea6f3848984f,  -794, -220),
+    (0xd77485cb25823ac7,  -768, -212),
+    (0xa086cfcd97bf97f4,  -741, -204),
+    (0xef340a98172aace5,  -715, -196),
+    (0xb23867fb2a35b28e,  -688, -188),
+    (0x84c8d4dfd2c63f3b,  -661, -180),
+    (0xc5dd44271ad3cdba,  -635, -172),
+    (0x936b9fcebb25c996,  -608, -164),
+    (0xdbac6c247d62a584,  -582, -156),
+    (0xa3ab66580d5fdaf6,  -555, -148),
+    (0xf3e2f893dec3f126,  -529, -140),
+    (0xb5b5ada8aaff80b8,  -502, -132),
+    (0x87625f056c7c4a8b,  -475, -124),
+    (0xc9bcff6034c13053,  -449, -116),
+    (0x964e858c91ba2655,  -422, -108),
+    (0xdff9772470297ebd,  -396, -100),
+    (0xa6dfbd9fb8e5b88f,  -369,  -92),
+    (0xf8a95fcf88747d94,  -343,  -84),
+    (0xb94470938fa89bcf,  -316,  -76),
+    (0x8a08f0f8bf0f156b,  -289,  -68),
+    (0xcdb02555653131b6,  -263,  -60),
+    (0x993fe2c6d07b7fac,  -236,  -52),
+    (0xe45c10c42a2b3b06,  -210,  -44),
+    (0xaa242499697392d3,  -183,  -36),
+    (0xfd87b5f28300ca0e,  -157,  -28),
+    (0xbce5086492111aeb,  -130,  -20),
+    (0x8cbccc096f5088cc,  -103,  -12),
+    (0xd1b71758e219652c,   -77,   -4),
+    (0x9c40000000000000,   -50,    4),
+    (0xe8d4a51000000000,   -24,   12),
+    (0xad78ebc5ac620000,     3,   20),
+    (0x813f3978f8940984,    30,   28),
+    (0xc097ce7bc90715b3,    56,   36),
+    (0x8f7e32ce7bea5c70,    83,   44),
+    (0xd5d238a4abe98068,   109,   52),
+    (0x9f4f2726179a2245,   136,   60),
+    (0xed63a231d4c4fb27,   162,   68),
+    (0xb0de65388cc8ada8,   189,   76),
+    (0x83c7088e1aab65db,   216,   84),
+    (0xc45d1df942711d9a,   242,   92),
+    (0x924d692ca61be758,   269,  100),
+    (0xda01ee641a708dea,   295,  108),
+    (0xa26da3999aef774a,   322,  116),
+    (0xf209787bb47d6b85,   348,  124),
+    (0xb454e4a179dd1877,   375,  132),
+    (0x865b86925b9bc5c2,   402,  140),
+    (0xc83553c5c8965d3d,   428,  148),
+    (0x952ab45cfa97a0b3,   455,  156),
+    (0xde469fbd99a05fe3,   481,  164),
+    (0xa59bc234db398c25,   508,  172),
+    (0xf6c69a72a3989f5c,   534,  180),
+    (0xb7dcbf5354e9bece,   561,  188),
+    (0x88fcf317f22241e2,   588,  196),
+    (0xcc20ce9bd35c78a5,   614,  204),
+    (0x98165af37b2153df,   641,  212),
+    (0xe2a0b5dc971f303a,   667,  220),
+    (0xa8d9d1535ce3b396,   694,  228),
+    (0xfb9b7cd9a4a7443c,   720,  236),
+    (0xbb764c4ca7a44410,   747,  244),
+    (0x8bab8eefb6409c1a,   774,  252),
+    (0xd01fef10a657842c,   800,  260),
+    (0x9b10a4e5e9913129,   827,  268),
+    (0xe7109bfba19c0c9d,   853,  276),
+    (0xac2820d9623bf429,   880,  284),
+    (0x80444b5e7aa7cf85,   907,  292),
+    (0xbf21e44003acdd2d,   933,  300),
+    (0x8e679c2f5e44ff8f,   960,  308),
+    (0xd433179d9c8cb841,   986,  316),
+    (0x9e19db92b4e31ba9,  1013,  324),
+    (0xeb96bf6ebadf77d9,  1039,  332),
+];
+
+#[doc(hidden)] pub const CACHED_POW10_FIRST_E: i16 = -1087;
+#[doc(hidden)] pub const CACHED_POW10_LAST_E: i16 = 1039;
+
+#[doc(hidden)]
+pub fn cached_power(alpha: i16, gamma: i16) -> (i16, Fp) {
+    let offset = CACHED_POW10_FIRST_E as i32;
+    let range = (CACHED_POW10.len() as i32) - 1;
+    let domain = (CACHED_POW10_LAST_E - CACHED_POW10_FIRST_E) as i32;
+    let idx = ((gamma as i32) - offset) * range / domain;
+    let (f, e, k) = CACHED_POW10[idx as usize];
+    debug_assert!(alpha <= e && e <= gamma);
+    (k, Fp { f: f, e: e })
+}
+
+/// Given `x > 0`, returns `(k, 10^k)` such that `10^k <= x < 10^(k+1)`.
+#[doc(hidden)]
+pub fn max_pow10_no_more_than(x: u32) -> (u8, u32) {
+    debug_assert!(x > 0);
+
+    const X9: u32 = 10_0000_0000;
+    const X8: u32 =  1_0000_0000;
+    const X7: u32 =    1000_0000;
+    const X6: u32 =     100_0000;
+    const X5: u32 =      10_0000;
+    const X4: u32 =       1_0000;
+    const X3: u32 =         1000;
+    const X2: u32 =          100;
+    const X1: u32 =           10;
+
+    if x < X4 {
+        if x < X2 { if x < X1 {(0,  1)} else {(1, X1)} }
+        else      { if x < X3 {(2, X2)} else {(3, X3)} }
+    } else {
+        if x < X6      { if x < X5 {(4, X4)} else {(5, X5)} }
+        else if x < X8 { if x < X7 {(6, X6)} else {(7, X7)} }
+        else           { if x < X9 {(8, X8)} else {(9, X9)} }
+    }
+}
+
+/// The shortest mode implementation for Grisu.
+///
+/// It returns `None` when it would return an inexact representation otherwise.
+pub fn format_shortest_opt(d: &Decoded,
+                           buf: &mut [u8]) -> Option<(/*#digits*/ usize, /*exp*/ i16)> {
+    assert!(d.mant > 0);
+    assert!(d.minus > 0);
+    assert!(d.plus > 0);
+    assert!(d.mant.checked_add(d.plus).is_some());
+    assert!(d.mant.checked_sub(d.minus).is_some());
+    assert!(buf.len() >= MAX_SIG_DIGITS);
+    assert!(d.mant + d.plus < (1 << 61)); // we need at least three bits of additional precision
+
+    // start with the normalized values with the shared exponent
+    let plus = Fp { f: d.mant + d.plus, e: d.exp }.normalize();
+    let minus = Fp { f: d.mant - d.minus, e: d.exp }.normalize_to(plus.e);
+    let v = Fp { f: d.mant, e: d.exp }.normalize_to(plus.e);
+
+    // find any `cached = 10^minusk` such that `ALPHA <= minusk + plus.e + 64 <= GAMMA`.
+    // since `plus` is normalized, this means `2^(62 + ALPHA) <= plus * cached < 2^(64 + GAMMA)`;
+    // given our choices of `ALPHA` and `GAMMA`, this puts `plus * cached` into `[4, 2^32)`.
+    //
+    // it is obviously desirable to maximize `GAMMA - ALPHA`,
+    // so that we don't need many cached powers of 10, but there are some considerations:
+    //
+    // 1. we want to keep `floor(plus * cached)` within `u32` since it needs a costly division.
+    //    (this is not really avoidable, remainder is required for accuracy estimation.)
+    // 2. the remainder of `floor(plus * cached)` repeatedly gets multiplied by 10,
+    //    and it should not overflow.
+    //
+    // the first gives `64 + GAMMA <= 32`, while the second gives `10 * 2^-ALPHA <= 2^64`;
+    // -60 and -32 is the maximal range with this constraint, and V8 also uses them.
+    let (minusk, cached) = cached_power(ALPHA - plus.e - 64, GAMMA - plus.e - 64);
+
+    // scale fps. this gives the maximal error of 1 ulp (proved from Theorem 5.1).
+    let plus = plus.mul(&cached);
+    let minus = minus.mul(&cached);
+    let v = v.mul(&cached);
+    debug_assert_eq!(plus.e, minus.e);
+    debug_assert_eq!(plus.e, v.e);
+
+    //         +- actual range of minus
+    //   | <---|---------------------- unsafe region --------------------------> |
+    //   |     |                                                                 |
+    //   |  |<--->|  | <--------------- safe region ---------------> |           |
+    //   |  |     |  |                                               |           |
+    //   |1 ulp|1 ulp|                 |1 ulp|1 ulp|                 |1 ulp|1 ulp|
+    //   |<--->|<--->|                 |<--->|<--->|                 |<--->|<--->|
+    //   |-----|-----|-------...-------|-----|-----|-------...-------|-----|-----|
+    //   |   minus   |                 |     v     |                 |   plus    |
+    // minus1     minus0           v - 1 ulp   v + 1 ulp           plus0       plus1
+    //
+    // above `minus`, `v` and `plus` are *quantized* approximations (error < 1 ulp).
+    // as we don't know the error is positive or negative, we use two approximations spaced equally
+    // and have the maximal error of 2 ulps.
+    //
+    // the "unsafe region" is a liberal interval which we initially generate.
+    // the "safe region" is a conservative interval which we only accept.
+    // we start with the correct repr within the unsafe region, and try to find the closest repr
+    // to `v` which is also within the safe region. if we can't, we give up.
+    let plus1 = plus.f + 1;
+//  let plus0 = plus.f - 1; // only for explanation
+//  let minus0 = minus.f + 1; // only for explanation
+    let minus1 = minus.f - 1;
+    let e = -plus.e as usize; // shared exponent
+
+    // divide `plus1` into integral and fractional parts.
+    // integral parts are guaranteed to fit in u32, since cached power guarantees `plus < 2^32`
+    // and normalized `plus.f` is always less than `2^64 - 2^4` due to the precision requirement.
+    let plus1int = (plus1 >> e) as u32;
+    let plus1frac = plus1 & ((1 << e) - 1);
+
+    // calculate the largest `10^max_kappa` no more than `plus1` (thus `plus1 < 10^(max_kappa+1)`).
+    // this is an upper bound of `kappa` below.
+    let (max_kappa, max_ten_kappa) = max_pow10_no_more_than(plus1int);
+
+    let mut i = 0;
+    let exp = max_kappa as i16 - minusk + 1;
+
+    // Theorem 6.2: if `k` is the greatest integer s.t. `0 <= y mod 10^k <= y - x`,
+    //              then `V = floor(y / 10^k) * 10^k` is in `[x, y]` and one of the shortest
+    //              representations (with the minimal number of significant digits) in that range.
+    //
+    // find the digit length `kappa` between `(minus1, plus1)` as per Theorem 6.2.
+    // Theorem 6.2 can be adopted to exclude `x` by requiring `y mod 10^k < y - x` instead.
+    // (e.g. `x` = 32000, `y` = 32777; `kappa` = 2 since `y mod 10^3 = 777 < y - x = 777`.)
+    // the algorithm relies on the later verification phase to exclude `y`.
+    let delta1 = plus1 - minus1;
+//  let delta1int = (delta1 >> e) as usize; // only for explanation
+    let delta1frac = delta1 & ((1 << e) - 1);
+
+    // render integral parts, while checking for the accuracy at each step.
+    let mut kappa = max_kappa as i16;
+    let mut ten_kappa = max_ten_kappa; // 10^kappa
+    let mut remainder = plus1int; // digits yet to be rendered
+    loop { // we always have at least one digit to render, as `plus1 >= 10^kappa`
+        // invariants:
+        // - `delta1int <= remainder < 10^(kappa+1)`
+        // - `plus1int = d[0..n-1] * 10^(kappa+1) + remainder`
+        //   (it follows that `remainder = plus1int % 10^(kappa+1)`)
+
+        // divide `remainder` by `10^kappa`. both are scaled by `2^-e`.
+        let q = remainder / ten_kappa;
+        let r = remainder % ten_kappa;
+        debug_assert!(q < 10);
+        buf[i] = b'0' + q as u8;
+        i += 1;
+
+        let plus1rem = ((r as u64) << e) + plus1frac; // == (plus1 % 10^kappa) * 2^e
+        if plus1rem < delta1 {
+            // `plus1 % 10^kappa < delta1 = plus1 - minus1`; we've found the correct `kappa`.
+            let ten_kappa = (ten_kappa as u64) << e; // scale 10^kappa back to the shared exponent
+            return round_and_weed(&mut buf[..i], exp, plus1rem, delta1, plus1 - v.f, ten_kappa, 1);
+        }
+
+        // break the loop when we have rendered all integral digits.
+        // the exact number of digits is `max_kappa + 1` as `plus1 < 10^(max_kappa+1)`.
+        if i > max_kappa as usize {
+            debug_assert_eq!(ten_kappa, 1);
+            debug_assert_eq!(kappa, 0);
+            break;
+        }
+
+        // restore invariants
+        kappa -= 1;
+        ten_kappa /= 10;
+        remainder = r;
+    }
+
+    // render fractional parts, while checking for the accuracy at each step.
+    // this time we rely on repeated multiplications, as division will lose the precision.
+    let mut remainder = plus1frac;
+    let mut threshold = delta1frac;
+    let mut ulp = 1;
+    loop { // the next digit should be significant as we've tested that before breaking out
+        // invariants, where `m = max_kappa + 1` (# of digits in the integral part):
+        // - `remainder < 2^e`
+        // - `plus1frac * 10^(n-m) = d[m..n-1] * 2^e + remainder`
+
+        remainder *= 10; // won't overflow, `2^e * 10 < 2^64`
+        threshold *= 10;
+        ulp *= 10;
+
+        // divide `remainder` by `10^kappa`.
+        // both are scaled by `2^e / 10^kappa`, so the latter is implicit here.
+        let q = remainder >> e;
+        let r = remainder & ((1 << e) - 1);
+        debug_assert!(q < 10);
+        buf[i] = b'0' + q as u8;
+        i += 1;
+
+        if r < threshold {
+            let ten_kappa = 1 << e; // implicit divisor
+            return round_and_weed(&mut buf[..i], exp, r, threshold,
+                                  (plus1 - v.f) * ulp, ten_kappa, ulp);
+        }
+
+        // restore invariants
+        kappa -= 1;
+        remainder = r;
+    }
+
+    // we've generated all significant digits of `plus1`, but not sure if it's the optimal one.
+    // for example, if `minus1` is 3.14153... and `plus1` is 3.14158..., there are 5 different
+    // shortest representation from 3.14154 to 3.14158 but we only have the greatest one.
+    // we have to successively decrease the last digit and check if this is the optimal repr.
+    // there are at most 9 candidates (..1 to ..9), so this is fairly quick. ("rounding" phase)
+    //
+    // the function checks if this "optimal" repr is actually within the ulp ranges,
+    // and also, it is possible that the "second-to-optimal" repr can actually be optimal
+    // due to the rounding error. in either cases this returns `None`. ("weeding" phase)
+    //
+    // all arguments here are scaled by the common (but implicit) value `k`, so that:
+    // - `remainder = (plus1 % 10^kappa) * k`
+    // - `threshold = (plus1 - minus1) * k` (and also, `remainder < threshold`)
+    // - `plus1v = (plus1 - v) * k` (and also, `threshold > plus1v` from prior invariants)
+    // - `ten_kappa = 10^kappa * k`
+    // - `ulp = 2^-e * k`
+    fn round_and_weed(buf: &mut [u8], exp: i16, remainder: u64, threshold: u64, plus1v: u64,
+                      ten_kappa: u64, ulp: u64) -> Option<(usize, i16)> {
+        assert!(!buf.is_empty());
+
+        // produce two approximations to `v` (actually `plus1 - v`) within 1.5 ulps.
+        // the resulting representation should be the closest representation to both.
+        //
+        // here `plus1 - v` is used since calculations are done with respect to `plus1`
+        // in order to avoid overflow/underflow (hence the seemingly swapped names).
+        let plus1v_down = plus1v + ulp; // plus1 - (v - 1 ulp)
+        let plus1v_up = plus1v - ulp; // plus1 - (v + 1 ulp)
+
+        // decrease the last digit and stop at the closest representation to `v + 1 ulp`.
+        let mut plus1w = remainder; // plus1w(n) = plus1 - w(n)
+        {
+            let last = buf.last_mut().unwrap();
+
+            // we work with the approximated digits `w(n)`, which is initially equal to `plus1 -
+            // plus1 % 10^kappa`. after running the loop body `n` times, `w(n) = plus1 -
+            // plus1 % 10^kappa - n * 10^kappa`. we set `plus1w(n) = plus1 - w(n) =
+            // plus1 % 10^kappa + n * 10^kappa` (thus `remainder = plus1w(0)`) to simplify checks.
+            // note that `plus1w(n)` is always increasing.
+            //
+            // we have three conditions to terminate. any of them will make the loop unable to
+            // proceed, but we then have at least one valid representation known to be closest to
+            // `v + 1 ulp` anyway. we will denote them as TC1 through TC3 for brevity.
+            //
+            // TC1: `w(n) <= v + 1 ulp`, i.e. this is the last repr that can be the closest one.
+            // this is equivalent to `plus1 - w(n) = plus1w(n) >= plus1 - (v + 1 ulp) = plus1v_up`.
+            // combined with TC2 (which checks if `w(n+1)` is valid), this prevents the possible
+            // overflow on the calculation of `plus1w(n)`.
+            //
+            // TC2: `w(n+1) < minus1`, i.e. the next repr definitely does not round to `v`.
+            // this is equivalent to `plus1 - w(n) + 10^kappa = plus1w(n) + 10^kappa >
+            // plus1 - minus1 = threshold`. the left hand side can overflow, but we know
+            // `threshold > plus1v`, so if TC1 is false, `threshold - plus1w(n) >
+            // threshold - (plus1v - 1 ulp) > 1 ulp` and we can safely test if
+            // `threshold - plus1w(n) < 10^kappa` instead.
+            //
+            // TC3: `abs(w(n) - (v + 1 ulp)) <= abs(w(n+1) - (v + 1 ulp))`, i.e. the next repr is
+            // no closer to `v + 1 ulp` than the current repr. given `z(n) = plus1v_up - plus1w(n)`,
+            // this becomes `abs(z(n)) <= abs(z(n+1))`. again assuming that TC1 is false, we have
+            // `z(n) > 0`. we have two cases to consider:
+            //
+            // - when `z(n+1) >= 0`: TC3 becomes `z(n) <= z(n+1)`. as `plus1w(n)` is increasing,
+            //   `z(n)` should be decreasing and this is clearly false.
+            // - when `z(n+1) < 0`:
+            //   - TC3a: the precondition is `plus1v_up < plus1w(n) + 10^kappa`. assuming TC2 is
+            //     false, `threshold >= plus1w(n) + 10^kappa` so it cannot overflow.
+            //   - TC3b: TC3 becomes `z(n) <= -z(n+1)`, i.e. `plus1v_up - plus1w(n) >=
+            //     plus1w(n+1) - plus1v_up = plus1w(n) + 10^kappa - plus1v_up`. the negated TC1
+            //     gives `plus1v_up > plus1w(n)`, so it cannot overflow or underflow when
+            //     combined with TC3a.
+            //
+            // consequently, we should stop when `TC1 || TC2 || (TC3a && TC3b)`. the following is
+            // equal to its inverse, `!TC1 && !TC2 && (!TC3a || !TC3b)`.
+            while plus1w < plus1v_up &&
+                  threshold - plus1w >= ten_kappa &&
+                  (plus1w + ten_kappa < plus1v_up ||
+                   plus1v_up - plus1w >= plus1w + ten_kappa - plus1v_up) {
+                *last -= 1;
+                debug_assert!(*last > b'0'); // the shortest repr cannot end with `0`
+                plus1w += ten_kappa;
+            }
+        }
+
+        // check if this representation is also the closest representation to `v - 1 ulp`.
+        //
+        // this is simply same to the terminating conditions for `v + 1 ulp`, with all `plus1v_up`
+        // replaced by `plus1v_down` instead. overflow analysis equally holds.
+        if plus1w < plus1v_down &&
+           threshold - plus1w >= ten_kappa &&
+           (plus1w + ten_kappa < plus1v_down ||
+            plus1v_down - plus1w >= plus1w + ten_kappa - plus1v_down) {
+            return None;
+        }
+
+        // now we have the closest representation to `v` between `plus1` and `minus1`.
+        // this is too liberal, though, so we reject any `w(n)` not between `plus0` and `minus0`,
+        // i.e. `plus1 - plus1w(n) <= minus0` or `plus1 - plus1w(n) >= plus0`. we utilize the facts
+        // that `threshold = plus1 - minus1` and `plus1 - plus0 = minus0 - minus1 = 2 ulp`.
+        if 2 * ulp <= plus1w && plus1w <= threshold - 4 * ulp {
+            Some((buf.len(), exp))
+        } else {
+            None
+        }
+    }
+}
+
+/// The shortest mode implementation for Grisu with Dragon fallback.
+///
+/// This should be used for most cases.
+pub fn format_shortest(d: &Decoded, buf: &mut [u8]) -> (/*#digits*/ usize, /*exp*/ i16) {
+    use num::flt2dec::strategy::dragon::format_shortest as fallback;
+    match format_shortest_opt(d, buf) {
+        Some(ret) => ret,
+        None => fallback(d, buf),
+    }
+}
+
+/// The exact and fixed mode implementation for Grisu.
+///
+/// It returns `None` when it would return an inexact representation otherwise.
+pub fn format_exact_opt(d: &Decoded, buf: &mut [u8], limit: i16)
+                                -> Option<(/*#digits*/ usize, /*exp*/ i16)> {
+    assert!(d.mant > 0);
+    assert!(d.mant < (1 << 61)); // we need at least three bits of additional precision
+    assert!(!buf.is_empty());
+
+    // normalize and scale `v`.
+    let v = Fp { f: d.mant, e: d.exp }.normalize();
+    let (minusk, cached) = cached_power(ALPHA - v.e - 64, GAMMA - v.e - 64);
+    let v = v.mul(&cached);
+
+    // divide `v` into integral and fractional parts.
+    let e = -v.e as usize;
+    let vint = (v.f >> e) as u32;
+    let vfrac = v.f & ((1 << e) - 1);
+
+    // both old `v` and new `v` (scaled by `10^-k`) has an error of < 1 ulp (Theorem 5.1).
+    // as we don't know the error is positive or negative, we use two approximations
+    // spaced equally and have the maximal error of 2 ulps (same to the shortest case).
+    //
+    // the goal is to find the exactly rounded series of digits that are common to
+    // both `v - 1 ulp` and `v + 1 ulp`, so that we are maximally confident.
+    // if this is not possible, we don't know which one is the correct output for `v`,
+    // so we give up and fall back.
+    //
+    // `err` is defined as `1 ulp * 2^e` here (same to the ulp in `vfrac`),
+    // and we will scale it whenever `v` gets scaled.
+    let mut err = 1;
+
+    // calculate the largest `10^max_kappa` no more than `v` (thus `v < 10^(max_kappa+1)`).
+    // this is an upper bound of `kappa` below.
+    let (max_kappa, max_ten_kappa) = max_pow10_no_more_than(vint);
+
+    let mut i = 0;
+    let exp = max_kappa as i16 - minusk + 1;
+
+    // if we are working with the last-digit limitation, we need to shorten the buffer
+    // before the actual rendering in order to avoid double rounding.
+    // note that we have to enlarge the buffer again when rounding up happens!
+    let len = if exp <= limit {
+        // oops, we cannot even produce *one* digit.
+        // this is possible when, say, we've got something like 9.5 and it's being rounded to 10.
+        //
+        // in principle we can immediately call `possibly_round` with an empty buffer,
+        // but scaling `max_ten_kappa << e` by 10 can result in overflow.
+        // thus we are being sloppy here and widen the error range by a factor of 10.
+        // this will increase the false negative rate, but only very, *very* slightly;
+        // it can only matter noticably when the mantissa is bigger than 60 bits.
+        return possibly_round(buf, 0, exp, limit, v.f / 10, (max_ten_kappa as u64) << e, err << e);
+    } else if ((exp as i32 - limit as i32) as usize) < buf.len() {
+        (exp - limit) as usize
+    } else {
+        buf.len()
+    };
+    debug_assert!(len > 0);
+
+    // render integral parts.
+    // the error is entirely fractional, so we don't need to check it in this part.
+    let mut kappa = max_kappa as i16;
+    let mut ten_kappa = max_ten_kappa; // 10^kappa
+    let mut remainder = vint; // digits yet to be rendered
+    loop { // we always have at least one digit to render
+        // invariants:
+        // - `remainder < 10^(kappa+1)`
+        // - `vint = d[0..n-1] * 10^(kappa+1) + remainder`
+        //   (it follows that `remainder = vint % 10^(kappa+1)`)
+
+        // divide `remainder` by `10^kappa`. both are scaled by `2^-e`.
+        let q = remainder / ten_kappa;
+        let r = remainder % ten_kappa;
+        debug_assert!(q < 10);
+        buf[i] = b'0' + q as u8;
+        i += 1;
+
+        // is the buffer full? run the rounding pass with the remainder.
+        if i == len {
+            let vrem = ((r as u64) << e) + vfrac; // == (v % 10^kappa) * 2^e
+            return possibly_round(buf, len, exp, limit, vrem, (ten_kappa as u64) << e, err << e);
+        }
+
+        // break the loop when we have rendered all integral digits.
+        // the exact number of digits is `max_kappa + 1` as `plus1 < 10^(max_kappa+1)`.
+        if i > max_kappa as usize {
+            debug_assert_eq!(ten_kappa, 1);
+            debug_assert_eq!(kappa, 0);
+            break;
+        }
+
+        // restore invariants
+        kappa -= 1;
+        ten_kappa /= 10;
+        remainder = r;
+    }
+
+    // render fractional parts.
+    //
+    // in principle we can continue to the last available digit and check for the accuracy.
+    // unfortunately we are working with the finite-sized integers, so we need some criterion
+    // to detect the overflow. V8 uses `remainder > err`, which becomes false when
+    // the first `i` significant digits of `v - 1 ulp` and `v` differ. however this rejects
+    // too many otherwise valid input.
+    //
+    // since the later phase has a correct overflow detection, we instead use tighter criterion:
+    // we continue til `err` exceeds `10^kappa / 2`, so that the range between `v - 1 ulp` and
+    // `v + 1 ulp` definitely contains two or more rounded representations. this is same to
+    // the first two comparisons from `possibly_round`, for the reference.
+    let mut remainder = vfrac;
+    let maxerr = 1 << (e - 1);
+    while err < maxerr {
+        // invariants, where `m = max_kappa + 1` (# of digits in the integral part):
+        // - `remainder < 2^e`
+        // - `vfrac * 10^(n-m) = d[m..n-1] * 2^e + remainder`
+        // - `err = 10^(n-m)`
+
+        remainder *= 10; // won't overflow, `2^e * 10 < 2^64`
+        err *= 10; // won't overflow, `err * 10 < 2^e * 5 < 2^64`
+
+        // divide `remainder` by `10^kappa`.
+        // both are scaled by `2^e / 10^kappa`, so the latter is implicit here.
+        let q = remainder >> e;
+        let r = remainder & ((1 << e) - 1);
+        debug_assert!(q < 10);
+        buf[i] = b'0' + q as u8;
+        i += 1;
+
+        // is the buffer full? run the rounding pass with the remainder.
+        if i == len {
+            return possibly_round(buf, len, exp, limit, r, 1 << e, err);
+        }
+
+        // restore invariants
+        remainder = r;
+    }
+
+    // further calculation is useless (`possibly_round` definitely fails), so we give up.
+    return None;
+
+    // we've generated all requested digits of `v`, which should be also same to corresponding
+    // digits of `v - 1 ulp`. now we check if there is a unique representation shared by
+    // both `v - 1 ulp` and `v + 1 ulp`; this can be either same to generated digits, or
+    // to the rounded-up version of those digits. if the range contains multiple representations
+    // of the same length, we cannot be sure and should return `None` instead.
+    //
+    // all arguments here are scaled by the common (but implicit) value `k`, so that:
+    // - `remainder = (v % 10^kappa) * k`
+    // - `ten_kappa = 10^kappa * k`
+    // - `ulp = 2^-e * k`
+    fn possibly_round(buf: &mut [u8], mut len: usize, mut exp: i16, limit: i16,
+                      remainder: u64, ten_kappa: u64, ulp: u64) -> Option<(usize, i16)> {
+        debug_assert!(remainder < ten_kappa);
+
+        //           10^kappa
+        //    :   :   :<->:   :
+        //    :   :   :   :   :
+        //    :|1 ulp|1 ulp|  :
+        //    :|<--->|<--->|  :
+        // ----|-----|-----|----
+        //     |     v     |
+        // v - 1 ulp   v + 1 ulp
+        //
+        // (for the reference, the dotted line indicates the exact value for
+        // possible representations in given number of digits.)
+        //
+        // error is too large that there are at least three possible representations
+        // between `v - 1 ulp` and `v + 1 ulp`. we cannot determine which one is correct.
+        if ulp >= ten_kappa { return None; }
+
+        //    10^kappa
+        //   :<------->:
+        //   :         :
+        //   : |1 ulp|1 ulp|
+        //   : |<--->|<--->|
+        // ----|-----|-----|----
+        //     |     v     |
+        // v - 1 ulp   v + 1 ulp
+        //
+        // in fact, 1/2 ulp is enough to introduce two possible representations.
+        // (remember that we need a unique representation for both `v - 1 ulp` and `v + 1 ulp`.)
+        // this won't overflow, as `ulp < ten_kappa` from the first check.
+        if ten_kappa - ulp <= ulp { return None; }
+
+        //     remainder
+        //       :<->|                           :
+        //       :   |                           :
+        //       :<--------- 10^kappa ---------->:
+        //     | :   |                           :
+        //     |1 ulp|1 ulp|                     :
+        //     |<--->|<--->|                     :
+        // ----|-----|-----|------------------------
+        //     |     v     |
+        // v - 1 ulp   v + 1 ulp
+        //
+        // if `v + 1 ulp` is closer to the rounded-down representation (which is already in `buf`),
+        // then we can safely return. note that `v - 1 ulp` *can* be less than the current
+        // representation, but as `1 ulp < 10^kappa / 2`, this condition is enough:
+        // the distance between `v - 1 ulp` and the current representation
+        // cannot exceed `10^kappa / 2`.
+        //
+        // the condition equals to `remainder + ulp < 10^kappa / 2`.
+        // since this can easily overflow, first check if `remainder < 10^kappa / 2`.
+        // we've already verified that `ulp < 10^kappa / 2`, so as long as
+        // `10^kappa` did not overflow after all, the second check is fine.
+        if ten_kappa - remainder > remainder && ten_kappa - 2 * remainder >= 2 * ulp {
+            return Some((len, exp));
+        }
+
+        //   :<------- remainder ------>|   :
+        //   :                          |   :
+        //   :<--------- 10^kappa --------->:
+        //   :                    |     |   : |
+        //   :                    |1 ulp|1 ulp|
+        //   :                    |<--->|<--->|
+        // -----------------------|-----|-----|-----
+        //                        |     v     |
+        //                    v - 1 ulp   v + 1 ulp
+        //
+        // on the other hands, if `v - 1 ulp` is closer to the rounded-up representation,
+        // we should round up and return. for the same reason we don't need to check `v + 1 ulp`.
+        //
+        // the condition equals to `remainder - ulp >= 10^kappa / 2`.
+        // again we first check if `remainder > ulp` (note that this is not `remainder >= ulp`,
+        // as `10^kappa` is never zero). also note that `remainder - ulp <= 10^kappa`,
+        // so the second check does not overflow.
+        if remainder > ulp && ten_kappa - (remainder - ulp) <= remainder - ulp {
+            if let Some(c) = round_up(buf, len) {
+                // only add an additional digit when we've been requested the fixed precision.
+                // we also need to check that, if the original buffer was empty,
+                // the additional digit can only be added when `exp == limit` (edge case).
+                exp += 1;
+                if exp > limit && len < buf.len() {
+                    buf[len] = c;
+                    len += 1;
+                }
+            }
+            return Some((len, exp));
+        }
+
+        // otherwise we are doomed (i.e. some values between `v - 1 ulp` and `v + 1 ulp` are
+        // rounding down and others are rounding up) and give up.
+        None
+    }
+}
+
+/// The exact and fixed mode implementation for Grisu with Dragon fallback.
+///
+/// This should be used for most cases.
+pub fn format_exact(d: &Decoded, buf: &mut [u8], limit: i16) -> (/*#digits*/ usize, /*exp*/ i16) {
+    use num::flt2dec::strategy::dragon::format_exact as fallback;
+    match format_exact_opt(d, buf, limit) {
+        Some(ret) => ret,
+        None => fallback(d, buf, limit),
+    }
+}
+
diff --git a/src/libcore/num/mod.rs b/src/libcore/num/mod.rs
index 71c2b38287e..011830ddb78 100644
--- a/src/libcore/num/mod.rs
+++ b/src/libcore/num/mod.rs
@@ -44,6 +44,9 @@ pub struct Wrapping<T>(#[stable(feature = "rust1", since = "1.0.0")] pub T);
 #[unstable(feature = "core", reason = "may be removed or relocated")]
 pub mod wrapping;
 
+#[unstable(feature = "core", reason = "internal routines only exposed for testing")]
+pub mod flt2dec;
+
 /// Types that have a "zero" value.
 ///
 /// This trait is intended for use in conjunction with `Add`, as an identity:
-- 
cgit 1.4.1-3-g733a5