From 351feaff5375becdf437fcd412973bf025ca5d8d Mon Sep 17 00:00:00 2001
From: Dmitry Panov <dop@itoolabs.com>
Date: Mon, 27 Apr 2020 22:36:20 +0100
Subject: [PATCH] Ported V8's fast-dtoa. Most conversions should now run at
 comparable speed to strconv.FormatFloat().

---
 date.go                           |  19 -
 ftoa/LICENSE_LUCENE               |  21 +
 ftoa/common.go                    |  22 +-
 ftoa/ftoa.go                      | 698 +++++++++++++++++++++++
 ftoa/ftobasestr.go                |   5 +-
 ftoa/ftostr.go                    | 896 ++++--------------------------
 ftoa/ftostr_test.go               |  23 +-
 ftoa/internal/fast/LICENSE_V8     |  26 +
 ftoa/internal/fast/cachedpower.go | 120 ++++
 ftoa/internal/fast/common.go      |  18 +
 ftoa/internal/fast/diyfp.go       | 152 +++++
 ftoa/internal/fast/dtoa.go        | 624 +++++++++++++++++++++
 12 files changed, 1791 insertions(+), 833 deletions(-)
 create mode 100644 ftoa/LICENSE_LUCENE
 create mode 100644 ftoa/ftoa.go
 create mode 100644 ftoa/internal/fast/LICENSE_V8
 create mode 100644 ftoa/internal/fast/cachedpower.go
 create mode 100644 ftoa/internal/fast/common.go
 create mode 100644 ftoa/internal/fast/diyfp.go
 create mode 100644 ftoa/internal/fast/dtoa.go

diff --git a/date.go b/date.go
index 1df98cee..e8ae5808 100644
--- a/date.go
+++ b/date.go
@@ -113,25 +113,6 @@ func (d *dateObject) export() interface{} {
 	return nil
 }
 
-func (d *dateObject) setTime(year, m, day, hour, min, sec, nsec int64) Value {
-	t, ok := mkTime(year, m, day, hour, min, sec, nsec, time.Local)
-	if ok {
-		return d.setTimeMs(timeToMsec(t))
-	}
-	d.unset()
-	return _NaN
-}
-
-func (d *dateObject) setTimeUTC(year, m, day, hour, min, sec, nsec int64) Value {
-	t, ok := mkTime(year, m, day, hour, min, sec, nsec, time.UTC)
-	if ok {
-		t = t.In(time.Local)
-		return d.setTimeMs(timeToMsec(t))
-	}
-	d.unset()
-	return _NaN
-}
-
 func (d *dateObject) setTimeMs(ms int64) Value {
 	if ms >= 0 && ms <= maxTime || ms < 0 && ms >= -maxTime {
 		d.msec = ms
diff --git a/ftoa/LICENSE_LUCENE b/ftoa/LICENSE_LUCENE
new file mode 100644
index 00000000..c8da489c
--- /dev/null
+++ b/ftoa/LICENSE_LUCENE
@@ -0,0 +1,21 @@
+Copyright (C) 1998, 1999 by Lucent Technologies
+All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and
+its documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appear in all
+copies and that both that the copyright notice and this
+permission notice and warranty disclaimer appear in supporting
+documentation, and that the name of Lucent or any of its entities
+not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
+IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
+SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
+THIS SOFTWARE.
diff --git a/ftoa/common.go b/ftoa/common.go
index ea632df4..207fb5fa 100644
--- a/ftoa/common.go
+++ b/ftoa/common.go
@@ -1,12 +1,16 @@
 /*
 Package ftoa provides ECMAScript-compliant floating point number conversion to string.
+
 It contains code ported from Rhino (https://github.com/mozilla/rhino/blob/master/src/org/mozilla/javascript/DToA.java)
+as well as from the original code by David M. Gay.
+
+See LICENSE_LUCENE for the original copyright message and disclaimer.
+
 */
 package ftoa
 
 import (
 	"math"
-	"math/big"
 )
 
 const (
@@ -97,7 +101,7 @@ func stuffBits(bits []byte, offset int, val uint32) {
 	bits[offset+3] = byte(val)
 }
 
-func d2b(d float64, bi *big.Int) (e, bits int) {
+func d2b(d float64, b []byte) (e, bits int, dblBits []byte) {
 	dBits := math.Float64bits(d)
 	d0 := uint32(dBits >> 32)
 	d1 := uint32(dBits)
@@ -106,33 +110,32 @@ func d2b(d float64, bi *big.Int) (e, bits int) {
 	d0 &= 0x7fffffff /* clear sign bit, which we ignore */
 
 	var de, k, i int
-	var dbl_bits []byte
 	if de = int(d0 >> exp_shift); de != 0 {
 		z |= exp_msk1
 	}
 
 	y := d1
 	if y != 0 {
-		dbl_bits = make([]byte, 8)
+		dblBits = b[:8]
 		k = lo0bits(y)
 		y >>= k
 		if k != 0 {
-			stuffBits(dbl_bits, 4, y|z<<(32-k))
+			stuffBits(dblBits, 4, y|z<<(32-k))
 			z >>= k
 		} else {
-			stuffBits(dbl_bits, 4, y)
+			stuffBits(dblBits, 4, y)
 		}
-		stuffBits(dbl_bits, 0, z)
+		stuffBits(dblBits, 0, z)
 		if z != 0 {
 			i = 2
 		} else {
 			i = 1
 		}
 	} else {
-		dbl_bits = make([]byte, 4)
+		dblBits = b[:4]
 		k = lo0bits(z)
 		z >>= k
-		stuffBits(dbl_bits, 0, z)
+		stuffBits(dblBits, 0, z)
 		k += 32
 		i = 1
 	}
@@ -144,6 +147,5 @@ func d2b(d float64, bi *big.Int) (e, bits int) {
 		e = de - bias - (p - 1) + 1 + k
 		bits = 32*i - hi0bits(z)
 	}
-	bi.SetBytes(dbl_bits)
 	return
 }
diff --git a/ftoa/ftoa.go b/ftoa/ftoa.go
new file mode 100644
index 00000000..59b516d7
--- /dev/null
+++ b/ftoa/ftoa.go
@@ -0,0 +1,698 @@
+package ftoa
+
+import (
+	"math"
+	"math/big"
+)
+
+const (
+	exp_11     = 0x3ff00000
+	frac_mask1 = 0xfffff
+	bletch     = 0x10
+	quick_max  = 14
+	int_max    = 14
+)
+
+var (
+	tens = [...]float64{
+		1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
+		1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+		1e20, 1e21, 1e22,
+	}
+
+	bigtens = [...]float64{1e16, 1e32, 1e64, 1e128, 1e256}
+
+	big5  = big.NewInt(5)
+	big10 = big.NewInt(10)
+
+	p05       = []*big.Int{big5, big.NewInt(25), big.NewInt(125)}
+	pow5Cache [7]*big.Int
+
+	dtoaModes = []int{
+		ModeStandard:            0,
+		ModeStandardExponential: 0,
+		ModeFixed:               3,
+		ModeExponential:         2,
+		ModePrecision:           2,
+	}
+)
+
+/*
+d must be > 0 and must not be Inf
+
+mode:
+		0 ==> shortest string that yields d when read in
+			and rounded to nearest.
+		1 ==> like 0, but with Steele & White stopping rule;
+			e.g. with IEEE P754 arithmetic , mode 0 gives
+			1e23 whereas mode 1 gives 9.999999999999999e22.
+		2 ==> max(1,ndigits) significant digits.  This gives a
+			return value similar to that of ecvt, except
+			that trailing zeros are suppressed.
+		3 ==> through ndigits past the decimal point.  This
+			gives a return value similar to that from fcvt,
+			except that trailing zeros are suppressed, and
+			ndigits can be negative.
+		4,5 ==> similar to 2 and 3, respectively, but (in
+			round-nearest mode) with the tests of mode 0 to
+			possibly return a shorter string that rounds to d.
+			With IEEE arithmetic and compilation with
+			-DHonor_FLT_ROUNDS, modes 4 and 5 behave the same
+			as modes 2 and 3 when FLT_ROUNDS != 1.
+		6-9 ==> Debugging modes similar to mode - 4:  don't try
+			fast floating-point estimate (if applicable).
+
+		Values of mode other than 0-9 are treated as mode 0.
+*/
+func ftoa(d float64, mode int, biasUp bool, ndigits int, buf []byte) ([]byte, int) {
+	startPos := len(buf)
+	dblBits := make([]byte, 0, 8)
+	be, bbits, dblBits := d2b(d, dblBits)
+
+	dBits := math.Float64bits(d)
+	word0 := uint32(dBits >> 32)
+	word1 := uint32(dBits)
+
+	i := int((word0 >> exp_shift1) & (exp_mask >> exp_shift1))
+	var d2 float64
+	var denorm bool
+	if i != 0 {
+		d2 = setWord0(d, (word0&frac_mask1)|exp_11)
+		i -= bias
+		denorm = false
+	} else {
+		/* d is denormalized */
+		i = bbits + be + (bias + (p - 1) - 1)
+		var x uint64
+		if i > 32 {
+			x = uint64(word0)<<(64-i) | uint64(word1)>>(i-32)
+		} else {
+			x = uint64(word1) << (32 - i)
+		}
+		d2 = setWord0(float64(x), uint32((x>>32)-31*exp_mask))
+		i -= (bias + (p - 1) - 1) + 1
+		denorm = true
+	}
+	/* At this point d = f*2^i, where 1 <= f < 2.  d2 is an approximation of f. */
+	ds := (d2-1.5)*0.289529654602168 + 0.1760912590558 + float64(i)*0.301029995663981
+	k := int(ds)
+	if ds < 0.0 && ds != float64(k) {
+		k-- /* want k = floor(ds) */
+	}
+	k_check := true
+	if k >= 0 && k < len(tens) {
+		if d < tens[k] {
+			k--
+		}
+		k_check = false
+	}
+	/* At this point floor(log10(d)) <= k <= floor(log10(d))+1.
+	   If k_check is zero, we're guaranteed that k = floor(log10(d)). */
+	j := bbits - i - 1
+	var b2, s2, b5, s5 int
+	/* At this point d = b/2^j, where b is an odd integer. */
+	if j >= 0 {
+		b2 = 0
+		s2 = j
+	} else {
+		b2 = -j
+		s2 = 0
+	}
+	if k >= 0 {
+		b5 = 0
+		s5 = k
+		s2 += k
+	} else {
+		b2 -= k
+		b5 = -k
+		s5 = 0
+	}
+	/* At this point d/10^k = (b * 2^b2 * 5^b5) / (2^s2 * 5^s5), where b is an odd integer,
+	   b2 >= 0, b5 >= 0, s2 >= 0, and s5 >= 0. */
+	if mode < 0 || mode > 9 {
+		mode = 0
+	}
+	try_quick := true
+	if mode > 5 {
+		mode -= 4
+		try_quick = false
+	}
+	leftright := true
+	var ilim, ilim1 int
+	switch mode {
+	case 0, 1:
+		ilim, ilim1 = -1, -1
+		ndigits = 0
+	case 2:
+		leftright = false
+		fallthrough
+	case 4:
+		if ndigits <= 0 {
+			ndigits = 1
+		}
+		ilim, ilim1 = ndigits, ndigits
+	case 3:
+		leftright = false
+		fallthrough
+	case 5:
+		i = ndigits + k + 1
+		ilim = i
+		ilim1 = i - 1
+	}
+	/* ilim is the maximum number of significant digits we want, based on k and ndigits. */
+	/* ilim1 is the maximum number of significant digits we want, based on k and ndigits,
+	   when it turns out that k was computed too high by one. */
+	fast_failed := false
+	if ilim >= 0 && ilim <= quick_max && try_quick {
+
+		/* Try to get by with floating-point arithmetic. */
+
+		i = 0
+		d2 = d
+		k0 := k
+		ilim0 := ilim
+		ieps := 2 /* conservative */
+		/* Divide d by 10^k, keeping track of the roundoff error and avoiding overflows. */
+		if k > 0 {
+			ds = tens[k&0xf]
+			j = k >> 4
+			if (j & bletch) != 0 {
+				/* prevent overflows */
+				j &= bletch - 1
+				d /= bigtens[len(bigtens)-1]
+				ieps++
+			}
+			for ; j != 0; i++ {
+				if (j & 1) != 0 {
+					ieps++
+					ds *= bigtens[i]
+				}
+				j >>= 1
+			}
+			d /= ds
+		} else if j1 := -k; j1 != 0 {
+			d *= tens[j1&0xf]
+			for j = j1 >> 4; j != 0; i++ {
+				if (j & 1) != 0 {
+					ieps++
+					d *= bigtens[i]
+				}
+				j >>= 1
+			}
+		}
+		/* Check that k was computed correctly. */
+		if k_check && d < 1.0 && ilim > 0 {
+			if ilim1 <= 0 {
+				fast_failed = true
+			} else {
+				ilim = ilim1
+				k--
+				d *= 10.
+				ieps++
+			}
+		}
+		/* eps bounds the cumulative error. */
+		eps := float64(ieps)*d + 7.0
+		eps = setWord0(eps, _word0(eps)-(p-1)*exp_msk1)
+		if ilim == 0 {
+			d -= 5.0
+			if d > eps {
+				buf = append(buf, '1')
+				k++
+				return buf, k + 1
+			}
+			if d < -eps {
+				buf = append(buf, '0')
+				return buf, 1
+			}
+			fast_failed = true
+		}
+		if !fast_failed {
+			fast_failed = true
+			if leftright {
+				/* Use Steele & White method of only
+				 * generating digits needed.
+				 */
+				eps = 0.5/tens[ilim-1] - eps
+				for i = 0; ; {
+					l := int64(d)
+					d -= float64(l)
+					buf = append(buf, byte('0'+l))
+					if d < eps {
+						return buf, k + 1
+					}
+					if 1.0-d < eps {
+						buf, k = bumpUp(buf, k)
+						return buf, k + 1
+					}
+					i++
+					if i >= ilim {
+						break
+					}
+					eps *= 10.0
+					d *= 10.0
+				}
+			} else {
+				/* Generate ilim digits, then fix them up. */
+				eps *= tens[ilim-1]
+				for i = 1; ; i++ {
+					l := int64(d)
+					d -= float64(l)
+					buf = append(buf, byte('0'+l))
+					if i == ilim {
+						if d > 0.5+eps {
+							buf, k = bumpUp(buf, k)
+							return buf, k + 1
+						} else if d < 0.5-eps {
+							buf = stripTrailingZeroes(buf, startPos)
+							return buf, k + 1
+						}
+						break
+					}
+					d *= 10.0
+				}
+			}
+		}
+		if fast_failed {
+			buf = buf[:startPos]
+			d = d2
+			k = k0
+			ilim = ilim0
+		}
+	}
+
+	/* Do we have a "small" integer? */
+	if be >= 0 && k <= int_max {
+		/* Yes. */
+		ds = tens[k]
+		if ndigits < 0 && ilim <= 0 {
+			if ilim < 0 || d < 5*ds || (!biasUp && d == 5*ds) {
+				buf = buf[:startPos]
+				buf = append(buf, '0')
+				return buf, 1
+			}
+			buf = append(buf, '1')
+			k++
+			return buf, k + 1
+		}
+		for i = 1; ; i++ {
+			l := int64(d / ds)
+			d -= float64(l) * ds
+			buf = append(buf, byte('0'+l))
+			if i == ilim {
+				d += d
+				if (d > ds) || (d == ds && (((l & 1) != 0) || biasUp)) {
+					buf, k = bumpUp(buf, k)
+				}
+				break
+			}
+			d *= 10.0
+			if d == 0 {
+				break
+			}
+		}
+		return buf, k + 1
+	}
+
+	m2 := b2
+	m5 := b5
+	var mhi, mlo *big.Int
+	if leftright {
+		if mode < 2 {
+			if denorm {
+				i = be + (bias + (p - 1) - 1 + 1)
+			} else {
+				i = 1 + p - bbits
+			}
+			/* i is 1 plus the number of trailing zero bits in d's significand. Thus,
+			   (2^m2 * 5^m5) / (2^(s2+i) * 5^s5) = (1/2 lsb of d)/10^k. */
+		} else {
+			j = ilim - 1
+			if m5 >= j {
+				m5 -= j
+			} else {
+				j -= m5
+				s5 += j
+				b5 += j
+				m5 = 0
+			}
+			i = ilim
+			if i < 0 {
+				m2 -= i
+				i = 0
+			}
+			/* (2^m2 * 5^m5) / (2^(s2+i) * 5^s5) = (1/2 * 10^(1-ilim))/10^k. */
+		}
+		b2 += i
+		s2 += i
+		mhi = big.NewInt(1)
+		/* (mhi * 2^m2 * 5^m5) / (2^s2 * 5^s5) = one-half of last printed (when mode >= 2) or
+		   input (when mode < 2) significant digit, divided by 10^k. */
+	}
+
+	/* We still have d/10^k = (b * 2^b2 * 5^b5) / (2^s2 * 5^s5).  Reduce common factors in
+	   b2, m2, and s2 without changing the equalities. */
+	if m2 > 0 && s2 > 0 {
+		if m2 < s2 {
+			i = m2
+		} else {
+			i = s2
+		}
+		b2 -= i
+		m2 -= i
+		s2 -= i
+	}
+
+	b := new(big.Int).SetBytes(dblBits)
+	/* Fold b5 into b and m5 into mhi. */
+	if b5 > 0 {
+		if leftright {
+			if m5 > 0 {
+				pow5mult(mhi, m5)
+				b.Mul(mhi, b)
+			}
+			j = b5 - m5
+			if j != 0 {
+				pow5mult(b, j)
+			}
+		} else {
+			pow5mult(b, b5)
+		}
+	}
+	/* Now we have d/10^k = (b * 2^b2) / (2^s2 * 5^s5) and
+	   (mhi * 2^m2) / (2^s2 * 5^s5) = one-half of last printed or input significant digit, divided by 10^k. */
+
+	S := big.NewInt(1)
+	if s5 > 0 {
+		pow5mult(S, s5)
+	}
+	/* Now we have d/10^k = (b * 2^b2) / (S * 2^s2) and
+	   (mhi * 2^m2) / (S * 2^s2) = one-half of last printed or input significant digit, divided by 10^k. */
+
+	/* Check for special case that d is a normalized power of 2. */
+	spec_case := false
+	if mode < 2 {
+		if (_word1(d) == 0) && ((_word0(d) & bndry_mask) == 0) &&
+			((_word0(d) & (exp_mask & (exp_mask << 1))) != 0) {
+			/* The special case.  Here we want to be within a quarter of the last input
+			   significant digit instead of one half of it when the decimal output string's value is less than d.  */
+			b2 += log2P
+			s2 += log2P
+			spec_case = true
+		}
+	}
+
+	/* Arrange for convenient computation of quotients:
+	 * shift left if necessary so divisor has 4 leading 0 bits.
+	 *
+	 * Perhaps we should just compute leading 28 bits of S once
+	 * and for all and pass them and a shift to quorem, so it
+	 * can do shifts and ors to compute the numerator for q.
+	 */
+	var zz int
+	if s5 != 0 {
+		S_bytes := S.Bytes()
+		var S_hiWord uint32
+		for idx := 0; idx < 4; idx++ {
+			S_hiWord = S_hiWord << 8
+			if idx < len(S_bytes) {
+				S_hiWord |= uint32(S_bytes[idx])
+			}
+		}
+		zz = 32 - hi0bits(S_hiWord)
+	} else {
+		zz = 1
+	}
+	i = (zz + s2) & 0x1f
+	if i != 0 {
+		i = 32 - i
+	}
+	/* i is the number of leading zero bits in the most significant word of S*2^s2. */
+	if i > 4 {
+		i -= 4
+		b2 += i
+		m2 += i
+		s2 += i
+	} else if i < 4 {
+		i += 28
+		b2 += i
+		m2 += i
+		s2 += i
+	}
+	/* Now S*2^s2 has exactly four leading zero bits in its most significant word. */
+	if b2 > 0 {
+		b = b.Lsh(b, uint(b2))
+	}
+	if s2 > 0 {
+		S.Lsh(S, uint(s2))
+	}
+	/* Now we have d/10^k = b/S and
+	   (mhi * 2^m2) / S = maximum acceptable error, divided by 10^k. */
+	if k_check {
+		if b.Cmp(S) < 0 {
+			k--
+			b.Mul(b, big10) /* we botched the k estimate */
+			if leftright {
+				mhi.Mul(mhi, big10)
+			}
+			ilim = ilim1
+		}
+	}
+	/* At this point 1 <= d/10^k = b/S < 10. */
+
+	if ilim <= 0 && mode > 2 {
+		/* We're doing fixed-mode output and d is less than the minimum nonzero output in this mode.
+		   Output either zero or the minimum nonzero output depending on which is closer to d. */
+		if ilim >= 0 {
+			i = b.Cmp(S.Mul(S, big5))
+		}
+		if ilim < 0 || i < 0 || i == 0 && !biasUp {
+			/* Always emit at least one digit.  If the number appears to be zero
+			   using the current mode, then emit one '0' digit and set decpt to 1. */
+			buf = buf[:startPos]
+			buf = append(buf, '0')
+			return buf, 1
+		}
+		buf = append(buf, '1')
+		k++
+		return buf, k + 1
+	}
+
+	var dig byte
+	if leftright {
+		if m2 > 0 {
+			mhi.Lsh(mhi, uint(m2))
+		}
+
+		/* Compute mlo -- check for special case
+		 * that d is a normalized power of 2.
+		 */
+
+		mlo = mhi
+		if spec_case {
+			mhi = mlo
+			mhi = new(big.Int).Lsh(mhi, log2P)
+		}
+		/* mlo/S = maximum acceptable error, divided by 10^k, if the output is less than d. */
+		/* mhi/S = maximum acceptable error, divided by 10^k, if the output is greater than d. */
+		var z, delta big.Int
+		for i = 1; ; i++ {
+			z.DivMod(b, S, b)
+			dig = byte(z.Int64() + '0')
+			/* Do we yet have the shortest decimal string
+			 * that will round to d?
+			 */
+			j = b.Cmp(mlo)
+			/* j is b/S compared with mlo/S. */
+			delta.Sub(S, mhi)
+			var j1 int
+			if delta.Sign() <= 0 {
+				j1 = 1
+			} else {
+				j1 = b.Cmp(&delta)
+			}
+			/* j1 is b/S compared with 1 - mhi/S. */
+			if (j1 == 0) && (mode == 0) && ((_word1(d) & 1) == 0) {
+				if dig == '9' {
+					var flag bool
+					buf = append(buf, '9')
+					if buf, flag = roundOff(buf, startPos); flag {
+						k++
+						buf = append(buf, '1')
+					}
+					return buf, k + 1
+				}
+				if j > 0 {
+					dig++
+				}
+				buf = append(buf, dig)
+				return buf, k + 1
+			}
+			if (j < 0) || ((j == 0) && (mode == 0) && ((_word1(d) & 1) == 0)) {
+				if j1 > 0 {
+					/* Either dig or dig+1 would work here as the least significant decimal digit.
+					   Use whichever would produce a decimal value closer to d. */
+					b.Lsh(b, 1)
+					j1 = b.Cmp(S)
+					if (j1 > 0) || (j1 == 0 && (((dig & 1) == 1) || biasUp)) {
+						dig++
+						if dig == '9' {
+							buf = append(buf, '9')
+							buf, flag := roundOff(buf, startPos)
+							if flag {
+								k++
+								buf = append(buf, '1')
+							}
+							return buf, k + 1
+						}
+					}
+				}
+				buf = append(buf, dig)
+				return buf, k + 1
+			}
+			if j1 > 0 {
+				if dig == '9' { /* possible if i == 1 */
+					buf = append(buf, '9')
+					buf, flag := roundOff(buf, startPos)
+					if flag {
+						k++
+						buf = append(buf, '1')
+					}
+					return buf, k + 1
+				}
+				buf = append(buf, dig+1)
+				return buf, k + 1
+			}
+			buf = append(buf, dig)
+			if i == ilim {
+				break
+			}
+			b.Mul(b, big10)
+			if mlo == mhi {
+				mhi.Mul(mhi, big10)
+			} else {
+				mlo.Mul(mlo, big10)
+				mhi.Mul(mhi, big10)
+			}
+		}
+	} else {
+		var z big.Int
+		for i = 1; ; i++ {
+			z.DivMod(b, S, b)
+			dig = byte(z.Int64() + '0')
+			buf = append(buf, dig)
+			if i >= ilim {
+				break
+			}
+
+			b.Mul(b, big10)
+		}
+	}
+	/* Round off last digit */
+
+	b.Lsh(b, 1)
+	j = b.Cmp(S)
+	if (j > 0) || (j == 0 && (((dig & 1) == 1) || biasUp)) {
+		var flag bool
+		buf, flag = roundOff(buf, startPos)
+		if flag {
+			k++
+			buf = append(buf, '1')
+			return buf, k + 1
+		}
+	} else {
+		buf = stripTrailingZeroes(buf, startPos)
+	}
+
+	return buf, k + 1
+}
+
+func bumpUp(buf []byte, k int) ([]byte, int) {
+	var lastCh byte
+	stop := 0
+	if len(buf) > 0 && buf[0] == '-' {
+		stop = 1
+	}
+	for {
+		lastCh = buf[len(buf)-1]
+		buf = buf[:len(buf)-1]
+		if lastCh != '9' {
+			break
+		}
+		if len(buf) == stop {
+			k++
+			lastCh = '0'
+			break
+		}
+	}
+	buf = append(buf, lastCh+1)
+	return buf, k
+}
+
+func setWord0(d float64, w uint32) float64 {
+	dBits := math.Float64bits(d)
+	return math.Float64frombits(uint64(w)<<32 | dBits&0xffffffff)
+}
+
+func _word0(d float64) uint32 {
+	dBits := math.Float64bits(d)
+	return uint32(dBits >> 32)
+}
+
+func _word1(d float64) uint32 {
+	dBits := math.Float64bits(d)
+	return uint32(dBits)
+}
+
+func stripTrailingZeroes(buf []byte, startPos int) []byte {
+	bl := len(buf) - 1
+	for bl >= startPos && buf[bl] == '0' {
+		bl--
+	}
+	return buf[:bl+1]
+}
+
+/* Set b = b * 5^k.  k must be nonnegative. */
+func pow5mult(b *big.Int, k int) *big.Int {
+	if k < (1 << (len(pow5Cache) + 2)) {
+		i := k & 3
+		if i != 0 {
+			b.Mul(b, p05[i-1])
+		}
+		k >>= 2
+		i = 0
+		for {
+			if k&1 != 0 {
+				b.Mul(b, pow5Cache[i])
+			}
+			k >>= 1
+			if k == 0 {
+				break
+			}
+			i++
+		}
+		return b
+	}
+	return b.Mul(b, new(big.Int).Exp(big5, big.NewInt(int64(k)), nil))
+}
+
+func roundOff(buf []byte, startPos int) ([]byte, bool) {
+	i := len(buf)
+	for i != startPos {
+		i--
+		if buf[i] != '9' {
+			buf[i]++
+			return buf[:i+1], false
+		}
+	}
+	return buf[:startPos], true
+}
+
+func init() {
+	p := big.NewInt(625)
+	pow5Cache[0] = p
+	for i := 1; i < len(pow5Cache); i++ {
+		p = new(big.Int).Mul(p, p)
+		pow5Cache[i] = p
+	}
+}
diff --git a/ftoa/ftobasestr.go b/ftoa/ftobasestr.go
index 424a5788..9dc9b2d0 100644
--- a/ftoa/ftobasestr.go
+++ b/ftoa/ftobasestr.go
@@ -64,8 +64,8 @@ func FToBaseStr(num float64, radix int) string {
 		word0 := uint32(dBits >> 32)
 		word1 := uint32(dBits)
 
-		b := new(big.Int)
-		e, _ := d2b(df, b)
+		dblBits := make([]byte, 0, 8)
+		e, _, dblBits := d2b(df, dblBits)
 		//            JS_ASSERT(e < 0);
 		/* At this point df = b * 2^e.  e must be less than zero because 0 < df < 1. */
 
@@ -88,6 +88,7 @@ func FToBaseStr(num float64, radix int) string {
 			mhi = big.NewInt(1 << log2P)
 		}
 
+		b := new(big.Int).SetBytes(dblBits)
 		b.Lsh(b, uint(e+s2))
 		s := big.NewInt(1)
 		s.Lsh(s, uint(s2))
diff --git a/ftoa/ftostr.go b/ftoa/ftostr.go
index e0d7a097..a9d2d240 100644
--- a/ftoa/ftostr.go
+++ b/ftoa/ftostr.go
@@ -2,654 +2,26 @@ package ftoa
 
 import (
 	"math"
-	"math/big"
 	"strconv"
-)
 
-const (
-	exp_11     = 0x3ff00000
-	frac_mask1 = 0xfffff
-	bletch     = 0x10
-	quick_max  = 14
-	int_max    = 14
+	"github.com/dop251/goja/ftoa/internal/fast"
 )
 
 type FToStrMode int
 
 const (
-	ModeStandard            FToStrMode = iota /* Either fixed or exponential format; round-trip */
-	ModeStandardExponential                   /* Always exponential format; round-trip */
-	ModeFixed                                 /* Round to <precision> digits after the decimal point; exponential if number is large */
-	ModeExponential                           /* Always exponential format; <precision> significant digits */
-	ModePrecision                             /* Either fixed or exponential format; <precision> significant digits */
+	// Either fixed or exponential format; round-trip
+	ModeStandard FToStrMode = iota
+	// Always exponential format; round-trip
+	ModeStandardExponential
+	// Round to <precision> digits after the decimal point; exponential if number is large
+	ModeFixed
+	// Always exponential format; <precision> significant digits
+	ModeExponential
+	// Either fixed or exponential format; <precision> significant digits
+	ModePrecision
 )
 
-var (
-	tens = [...]float64{
-		1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
-		1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
-		1e20, 1e21, 1e22,
-	}
-
-	bigtens = [...]float64{1e16, 1e32, 1e64, 1e128, 1e256}
-
-	big5  = big.NewInt(5)
-	big10 = big.NewInt(10)
-
-	p05       = []*big.Int{big5, big.NewInt(25), big.NewInt(125)}
-	pow5Cache [7]*big.Int
-
-	dtoaModes = []int{
-		ModeStandard:            0,
-		ModeStandardExponential: 0,
-		ModeFixed:               3,
-		ModeExponential:         2,
-		ModePrecision:           2,
-	}
-)
-
-/*
-mode:
-		0 ==> shortest string that yields d when read in
-			and rounded to nearest.
-		1 ==> like 0, but with Steele & White stopping rule;
-			e.g. with IEEE P754 arithmetic , mode 0 gives
-			1e23 whereas mode 1 gives 9.999999999999999e22.
-		2 ==> max(1,ndigits) significant digits.  This gives a
-			return value similar to that of ecvt, except
-			that trailing zeros are suppressed.
-		3 ==> through ndigits past the decimal point.  This
-			gives a return value similar to that from fcvt,
-			except that trailing zeros are suppressed, and
-			ndigits can be negative.
-		4,5 ==> similar to 2 and 3, respectively, but (in
-			round-nearest mode) with the tests of mode 0 to
-			possibly return a shorter string that rounds to d.
-			With IEEE arithmetic and compilation with
-			-DHonor_FLT_ROUNDS, modes 4 and 5 behave the same
-			as modes 2 and 3 when FLT_ROUNDS != 1.
-		6-9 ==> Debugging modes similar to mode - 4:  don't try
-			fast floating-point estimate (if applicable).
-
-		Values of mode other than 0-9 are treated as mode 0.
-*/
-func ftoa(d float64, mode int, biasUp bool, ndigits int, buf []byte) ([]byte, int) {
-	var sign bool
-	if math.IsNaN(d) {
-		buf = append(buf, "NaN"...)
-		return buf, 9999
-	}
-	if math.Signbit(d) {
-		sign = true
-		d = math.Copysign(d, 1.0)
-	} else {
-		sign = false
-	}
-	if math.IsInf(d, 0) {
-		if sign {
-			buf = append(buf, '-')
-		}
-		buf = append(buf, "Infinity"...)
-		return buf, 9999
-	}
-	dBits := math.Float64bits(d)
-	word0 := uint32(dBits >> 32)
-	word1 := uint32(dBits)
-	if d == 0 {
-		// no_digits
-		buf = append(buf, '0')
-		return buf, 1
-	}
-	if sign {
-		buf = append(buf, '-')
-	}
-	b := new(big.Int)
-	be, bbits := d2b(d, b)
-	i := int((word0 >> exp_shift1) & (exp_mask >> exp_shift1))
-	var d2 float64
-	var denorm bool
-	if i != 0 {
-		d2 = setWord0(d, (word0&frac_mask1)|exp_11)
-		i -= bias
-		denorm = false
-	} else {
-		/* d is denormalized */
-		i = bbits + be + (bias + (p - 1) - 1)
-		var x uint64
-		if i > 32 {
-			x = uint64(word0)<<(64-i) | uint64(word1)>>(i-32)
-		} else {
-			x = uint64(word1) << (32 - i)
-		}
-		d2 = setWord0(float64(x), uint32((x>>32)-31*exp_mask))
-		i -= (bias + (p - 1) - 1) + 1
-		denorm = true
-	}
-	/* At this point d = f*2^i, where 1 <= f < 2.  d2 is an approximation of f. */
-	ds := (d2-1.5)*0.289529654602168 + 0.1760912590558 + float64(i)*0.301029995663981
-	k := int(ds)
-	if ds < 0.0 && ds != float64(k) {
-		k-- /* want k = floor(ds) */
-	}
-	k_check := true
-	if k >= 0 && k < len(tens) {
-		if d < tens[k] {
-			k--
-		}
-		k_check = false
-	}
-	/* At this point floor(log10(d)) <= k <= floor(log10(d))+1.
-	   If k_check is zero, we're guaranteed that k = floor(log10(d)). */
-	j := bbits - i - 1
-	var b2, s2, b5, s5 int
-	/* At this point d = b/2^j, where b is an odd integer. */
-	if j >= 0 {
-		b2 = 0
-		s2 = j
-	} else {
-		b2 = -j
-		s2 = 0
-	}
-	if k >= 0 {
-		b5 = 0
-		s5 = k
-		s2 += k
-	} else {
-		b2 -= k
-		b5 = -k
-		s5 = 0
-	}
-	/* At this point d/10^k = (b * 2^b2 * 5^b5) / (2^s2 * 5^s5), where b is an odd integer,
-	   b2 >= 0, b5 >= 0, s2 >= 0, and s5 >= 0. */
-	if mode < 0 || mode > 9 {
-		mode = 0
-	}
-	try_quick := true
-	if mode > 5 {
-		mode -= 4
-		try_quick = false
-	}
-	leftright := true
-	var ilim, ilim1 int
-	switch mode {
-	case 0, 1:
-		ilim, ilim1 = -1, -1
-		ndigits = 0
-	case 2:
-		leftright = false
-		fallthrough
-	case 4:
-		if ndigits <= 0 {
-			ndigits = 1
-		}
-		ilim, ilim1 = ndigits, ndigits
-	case 3:
-		leftright = false
-		fallthrough
-	case 5:
-		i = ndigits + k + 1
-		ilim = i
-		ilim1 = i - 1
-	}
-	/* ilim is the maximum number of significant digits we want, based on k and ndigits. */
-	/* ilim1 is the maximum number of significant digits we want, based on k and ndigits,
-	   when it turns out that k was computed too high by one. */
-	fast_failed := false
-	if ilim >= 0 && ilim <= quick_max && try_quick {
-
-		/* Try to get by with floating-point arithmetic. */
-
-		i = 0
-		d2 = d
-		k0 := k
-		ilim0 := ilim
-		ieps := 2 /* conservative */
-		/* Divide d by 10^k, keeping track of the roundoff error and avoiding overflows. */
-		if k > 0 {
-			ds = tens[k&0xf]
-			j = k >> 4
-			if (j & bletch) != 0 {
-				/* prevent overflows */
-				j &= bletch - 1
-				d /= bigtens[len(bigtens)-1]
-				ieps++
-			}
-			for ; j != 0; i++ {
-				if (j & 1) != 0 {
-					ieps++
-					ds *= bigtens[i]
-				}
-				j >>= 1
-			}
-			d /= ds
-		} else if j1 := -k; j1 != 0 {
-			d *= tens[j1&0xf]
-			for j = j1 >> 4; j != 0; i++ {
-				if (j & 1) != 0 {
-					ieps++
-					d *= bigtens[i]
-				}
-				j >>= 1
-			}
-		}
-		/* Check that k was computed correctly. */
-		if k_check && d < 1.0 && ilim > 0 {
-			if ilim1 <= 0 {
-				fast_failed = true
-			} else {
-				ilim = ilim1
-				k--
-				d *= 10.
-				ieps++
-			}
-		}
-		/* eps bounds the cumulative error. */
-		eps := float64(ieps)*d + 7.0
-		eps = setWord0(eps, _word0(eps)-(p-1)*exp_msk1)
-		if ilim == 0 {
-			d -= 5.0
-			if d > eps {
-				buf = append(buf, '1')
-				k++
-				return buf, int(k + 1)
-			}
-			if d < -eps {
-				buf = append(buf, '0')
-				return buf, 1
-			}
-			fast_failed = true
-		}
-		if !fast_failed {
-			fast_failed = true
-			if leftright {
-				/* Use Steele & White method of only
-				 * generating digits needed.
-				 */
-				eps = 0.5/tens[ilim-1] - eps
-				for i = 0; ; {
-					l := int64(d)
-					d -= float64(l)
-					buf = append(buf, byte('0'+l))
-					if d < eps {
-						return buf, k + 1
-					}
-					if 1.0-d < eps {
-						buf, k = bumpUp(buf, k)
-						return buf, k + 1
-					}
-					i++
-					if i >= ilim {
-						break
-					}
-					eps *= 10.0
-					d *= 10.0
-				}
-			} else {
-				/* Generate ilim digits, then fix them up. */
-				eps *= tens[ilim-1]
-				for i = 1; ; i++ {
-					l := int64(d)
-					d -= float64(l)
-					buf = append(buf, byte('0'+l))
-					if i == ilim {
-						if d > 0.5+eps {
-							buf, k = bumpUp(buf, k)
-							return buf, k + 1
-						} else if d < 0.5-eps {
-							buf = stripTrailingZeroes(buf)
-							return buf, k + 1
-						}
-						break
-					}
-					d *= 10.0
-				}
-			}
-		}
-		if fast_failed {
-			if sign {
-				buf = buf[:1]
-			} else {
-				buf = buf[:0]
-			}
-			d = d2
-			k = k0
-			ilim = ilim0
-		}
-	}
-
-	/* Do we have a "small" integer? */
-	if be >= 0 && k <= int_max {
-		/* Yes. */
-		ds = tens[k]
-		if ndigits < 0 && ilim <= 0 {
-			if ilim < 0 || d < 5*ds || (!biasUp && d == 5*ds) {
-				if sign {
-					buf = buf[:1]
-				} else {
-					buf = buf[:0]
-				}
-				buf = append(buf, '0')
-				return buf, 1
-			}
-			buf = append(buf, '1')
-			k++
-			return buf, k + 1
-		}
-		for i = 1; ; i++ {
-			l := int64(d / ds)
-			d -= float64(l) * ds
-			buf = append(buf, byte('0'+l))
-			if i == ilim {
-				d += d
-				if (d > ds) || (d == ds && (((l & 1) != 0) || biasUp)) {
-					buf, k = bumpUp(buf, k)
-				}
-				break
-			}
-			d *= 10.0
-			if d == 0 {
-				break
-			}
-		}
-		return buf, k + 1
-	}
-
-	m2 := b2
-	m5 := b5
-	var mhi, mlo *big.Int
-	if leftright {
-		if mode < 2 {
-			if denorm {
-				i = be + (bias + (p - 1) - 1 + 1)
-			} else {
-				i = 1 + p - bbits
-			}
-			/* i is 1 plus the number of trailing zero bits in d's significand. Thus,
-			   (2^m2 * 5^m5) / (2^(s2+i) * 5^s5) = (1/2 lsb of d)/10^k. */
-		} else {
-			j = ilim - 1
-			if m5 >= j {
-				m5 -= j
-			} else {
-				j -= m5
-				s5 += j
-				b5 += j
-				m5 = 0
-			}
-			i = ilim
-			if i < 0 {
-				m2 -= i
-				i = 0
-			}
-			/* (2^m2 * 5^m5) / (2^(s2+i) * 5^s5) = (1/2 * 10^(1-ilim))/10^k. */
-		}
-		b2 += i
-		s2 += i
-		mhi = big.NewInt(1)
-		/* (mhi * 2^m2 * 5^m5) / (2^s2 * 5^s5) = one-half of last printed (when mode >= 2) or
-		   input (when mode < 2) significant digit, divided by 10^k. */
-	}
-
-	/* We still have d/10^k = (b * 2^b2 * 5^b5) / (2^s2 * 5^s5).  Reduce common factors in
-	   b2, m2, and s2 without changing the equalities. */
-	if m2 > 0 && s2 > 0 {
-		if m2 < s2 {
-			i = m2
-		} else {
-			i = s2
-		}
-		b2 -= i
-		m2 -= i
-		s2 -= i
-	}
-
-	/* Fold b5 into b and m5 into mhi. */
-	if b5 > 0 {
-		if leftright {
-			if m5 > 0 {
-				pow5mult(mhi, m5)
-				b.Mul(mhi, b)
-			}
-			j = b5 - m5
-			if j != 0 {
-				pow5mult(b, j)
-			}
-		} else {
-			pow5mult(b, b5)
-		}
-	}
-	/* Now we have d/10^k = (b * 2^b2) / (2^s2 * 5^s5) and
-	   (mhi * 2^m2) / (2^s2 * 5^s5) = one-half of last printed or input significant digit, divided by 10^k. */
-
-	S := big.NewInt(1)
-	if s5 > 0 {
-		pow5mult(S, s5)
-	}
-	/* Now we have d/10^k = (b * 2^b2) / (S * 2^s2) and
-	   (mhi * 2^m2) / (S * 2^s2) = one-half of last printed or input significant digit, divided by 10^k. */
-
-	/* Check for special case that d is a normalized power of 2. */
-	spec_case := false
-	if mode < 2 {
-		if (_word1(d) == 0) && ((_word0(d) & bndry_mask) == 0) &&
-			((_word0(d) & (exp_mask & (exp_mask << 1))) != 0) {
-			/* The special case.  Here we want to be within a quarter of the last input
-			   significant digit instead of one half of it when the decimal output string's value is less than d.  */
-			b2 += log2P
-			s2 += log2P
-			spec_case = true
-		}
-	}
-
-	/* Arrange for convenient computation of quotients:
-	 * shift left if necessary so divisor has 4 leading 0 bits.
-	 *
-	 * Perhaps we should just compute leading 28 bits of S once
-	 * and for all and pass them and a shift to quorem, so it
-	 * can do shifts and ors to compute the numerator for q.
-	 */
-	S_bytes := S.Bytes()
-	var S_hiWord uint32
-	for idx := 0; idx < 4; idx++ {
-		S_hiWord = S_hiWord << 8
-		if idx < len(S_bytes) {
-			S_hiWord |= uint32(S_bytes[idx])
-		}
-	}
-	var zz int
-	if s5 != 0 {
-		zz = 32 - hi0bits(S_hiWord)
-	} else {
-		zz = 1
-	}
-	i = (zz + s2) & 0x1f
-	if i != 0 {
-		i = 32 - i
-	}
-	/* i is the number of leading zero bits in the most significant word of S*2^s2. */
-	if i > 4 {
-		i -= 4
-		b2 += i
-		m2 += i
-		s2 += i
-	} else if i < 4 {
-		i += 28
-		b2 += i
-		m2 += i
-		s2 += i
-	}
-	/* Now S*2^s2 has exactly four leading zero bits in its most significant word. */
-	if b2 > 0 {
-		b = b.Lsh(b, uint(b2))
-	}
-	if s2 > 0 {
-		S.Lsh(S, uint(s2))
-	}
-	/* Now we have d/10^k = b/S and
-	   (mhi * 2^m2) / S = maximum acceptable error, divided by 10^k. */
-	if k_check {
-		if b.Cmp(S) < 0 {
-			k--
-			b.Mul(b, big10) /* we botched the k estimate */
-			if leftright {
-				mhi.Mul(mhi, big10)
-			}
-			ilim = ilim1
-		}
-	}
-	/* At this point 1 <= d/10^k = b/S < 10. */
-
-	if ilim <= 0 && mode > 2 {
-		/* We're doing fixed-mode output and d is less than the minimum nonzero output in this mode.
-		   Output either zero or the minimum nonzero output depending on which is closer to d. */
-		if ilim >= 0 {
-			i = b.Cmp(S.Mul(S, big5))
-		}
-		if ilim < 0 || i < 0 || i == 0 && !biasUp {
-			/* Always emit at least one digit.  If the number appears to be zero
-			   using the current mode, then emit one '0' digit and set decpt to 1. */
-			if sign {
-				buf = buf[:1]
-			} else {
-				buf = buf[:0]
-			}
-			buf = append(buf, '0')
-			return buf, 1
-		}
-		buf = append(buf, '1')
-		k++
-		return buf, int(k + 1)
-	}
-
-	var dig byte
-	if leftright {
-		if m2 > 0 {
-			mhi.Lsh(mhi, uint(m2))
-		}
-
-		/* Compute mlo -- check for special case
-		 * that d is a normalized power of 2.
-		 */
-
-		mlo = mhi
-		if spec_case {
-			mhi = mlo
-			mhi = new(big.Int).Lsh(mhi, log2P)
-		}
-		/* mlo/S = maximum acceptable error, divided by 10^k, if the output is less than d. */
-		/* mhi/S = maximum acceptable error, divided by 10^k, if the output is greater than d. */
-		var z, delta big.Int
-		for i = 1; ; i++ {
-			z.DivMod(b, S, b)
-			dig = byte(z.Int64() + '0')
-			/* Do we yet have the shortest decimal string
-			 * that will round to d?
-			 */
-			j = b.Cmp(mlo)
-			/* j is b/S compared with mlo/S. */
-			delta.Sub(S, mhi)
-			var j1 int
-			if delta.Sign() <= 0 {
-				j1 = 1
-			} else {
-				j1 = b.Cmp(&delta)
-			}
-			/* j1 is b/S compared with 1 - mhi/S. */
-			if (j1 == 0) && (mode == 0) && ((_word1(d) & 1) == 0) {
-				if dig == '9' {
-					var flag bool
-					buf = append(buf, '9')
-					if buf, flag = roundOff(buf); flag {
-						k++
-						buf = append(buf, '1')
-					}
-					return buf, k + 1
-				}
-				if j > 0 {
-					dig++
-				}
-				buf = append(buf, dig)
-				return buf, k + 1
-			}
-			if (j < 0) || ((j == 0) && (mode == 0) && ((_word1(d) & 1) == 0)) {
-				if j1 > 0 {
-					/* Either dig or dig+1 would work here as the least significant decimal digit.
-					   Use whichever would produce a decimal value closer to d. */
-					b.Lsh(b, 1)
-					j1 = b.Cmp(S)
-					if (j1 > 0) || (j1 == 0 && (((dig & 1) == 1) || biasUp)) {
-						dig++
-						if dig == '9' {
-							buf = append(buf, '9')
-							buf, flag := roundOff(buf)
-							if flag {
-								k++
-								buf = append(buf, '1')
-							}
-							return buf, k + 1
-						}
-					}
-				}
-				buf = append(buf, dig)
-				return buf, k + 1
-			}
-			if j1 > 0 {
-				if dig == '9' { /* possible if i == 1 */
-					buf = append(buf, '9')
-					buf, flag := roundOff(buf)
-					if flag {
-						k++
-						buf = append(buf, '1')
-					}
-					return buf, k + 1
-				}
-				buf = append(buf, dig+1)
-				return buf, k + 1
-			}
-			buf = append(buf, dig)
-			if i == ilim {
-				break
-			}
-			b.Mul(b, big10)
-			if mlo == mhi {
-				mhi.Mul(mhi, big10)
-			} else {
-				mlo.Mul(mlo, big10)
-				mhi.Mul(mhi, big10)
-			}
-		}
-	} else {
-		var z big.Int
-		for i = 1; ; i++ {
-			z.DivMod(b, S, b)
-			dig = byte(z.Int64() + '0')
-			buf = append(buf, dig)
-			if i >= ilim {
-				break
-			}
-
-			b.Mul(b, big10)
-		}
-	}
-	/* Round off last digit */
-
-	b.Lsh(b, 1)
-	j = b.Cmp(S)
-	if (j > 0) || (j == 0 && (((dig & 1) == 1) || biasUp)) {
-		var flag bool
-		buf, flag = roundOff(buf)
-		if flag {
-			k++
-			buf = append(buf, '1')
-			return buf, k + 1
-		}
-	} else {
-		buf = stripTrailingZeroes(buf)
-	}
-
-	return buf, k + 1
-}
-
 func insert(b []byte, p int, c byte) []byte {
 	b = append(b, 0)
 	copy(b[p+1:], b[p:])
@@ -668,186 +40,108 @@ func expand(b []byte, delta int) []byte {
 }
 
 func FToStr(d float64, mode FToStrMode, precision int, buffer []byte) []byte {
+	if math.IsNaN(d) {
+		buffer = append(buffer, "NaN"...)
+		return buffer
+	}
+	if math.IsInf(d, 0) {
+		if math.Signbit(d) {
+			buffer = append(buffer, '-')
+		}
+		buffer = append(buffer, "Infinity"...)
+		return buffer
+	}
+
 	if mode == ModeFixed && (d >= 1e21 || d <= -1e21) {
 		mode = ModeStandard
 	}
 
-	buffer, decPt := ftoa(d, dtoaModes[mode], mode >= ModeFixed, precision, buffer)
-	if decPt != 9999 {
-		exponentialNotation := false
-		minNDigits := 0 /* Minimum number of significand digits required by mode and precision */
-		sign := false
-		nDigits := len(buffer)
-		if len(buffer) > 0 && buffer[0] == '-' {
-			sign = true
-			nDigits--
-		}
+	var decPt int
+	var ok bool
+	startPos := len(buffer)
 
-		switch mode {
-		case ModeStandard:
-			if decPt < -5 || decPt > 21 {
-				exponentialNotation = true
-			} else {
-				minNDigits = decPt
-			}
-		case ModeFixed:
-			if precision >= 0 {
-				minNDigits = decPt + precision
-			} else {
-				minNDigits = decPt
-			}
-		case ModeExponential:
-			//                    JS_ASSERT(precision > 0);
-			minNDigits = precision
-			fallthrough
-		case ModeStandardExponential:
-			exponentialNotation = true
-		case ModePrecision:
-			//                    JS_ASSERT(precision > 0);
-			minNDigits = precision
-			if decPt < -5 || decPt > precision {
-				exponentialNotation = true
-			}
+	if d != 0 { // also matches -0
+		if d < 0 {
+			buffer = append(buffer, '-')
+			d = -d
+			startPos++
 		}
-
-		for nDigits < minNDigits {
-			buffer = append(buffer, '0')
-			nDigits++
-		}
-
-		if exponentialNotation {
-			/* Insert a decimal point if more than one significand digit */
-			if nDigits != 1 {
-				p := 1
-				if sign {
-					p = 2
-				}
-				buffer = insert(buffer, p, '.')
-			}
-			buffer = append(buffer, 'e')
-			if decPt-1 >= 0 {
-				buffer = append(buffer, '+')
-			}
-			buffer = strconv.AppendInt(buffer, int64(decPt-1), 10)
-		} else if decPt != nDigits {
-			/* Some kind of a fraction in fixed notation */
-			//                JS_ASSERT(decPt <= nDigits);
-			if decPt > 0 {
-				/* dd...dd . dd...dd */
-				if sign {
-					buffer = insert(buffer, decPt+1, '.')
-				} else {
-					buffer = insert(buffer, decPt, '.')
-				}
-			} else {
-				/* 0 . 00...00dd...dd */
-				o := 0
-				if sign {
-					o = 1
-				}
-				buffer = expand(buffer, 2-decPt)
-				copy(buffer[o+2-decPt:], buffer[o:])
-				buffer[o] = '0'
-				buffer[o+1] = '.'
-				for i := o + 2; i < o+2-decPt; i++ {
-					buffer[i] = '0'
-				}
-			}
+		switch mode {
+		case ModeStandard, ModeStandardExponential:
+			buffer, decPt, ok = fast.Dtoa(d, fast.ModeShortest, 0, buffer)
+		case ModeExponential, ModePrecision:
+			buffer, decPt, ok = fast.Dtoa(d, fast.ModePrecision, precision, buffer)
 		}
+	} else {
+		buffer = append(buffer, '0')
+		decPt, ok = 1, true
 	}
-	return buffer
-}
-
-func bumpUp(buf []byte, k int) ([]byte, int) {
-	var lastCh byte
-	stop := 0
-	if len(buf) > 0 && buf[0] == '-' {
-		stop = 1
+	if !ok {
+		buffer, decPt = ftoa(d, dtoaModes[mode], mode >= ModeFixed, precision, buffer)
 	}
-	for {
-		lastCh = buf[len(buf)-1]
-		buf = buf[:len(buf)-1]
-		if lastCh != '9' {
-			break
+	exponentialNotation := false
+	minNDigits := 0 /* Minimum number of significand digits required by mode and precision */
+	nDigits := len(buffer) - startPos
+
+	switch mode {
+	case ModeStandard:
+		if decPt < -5 || decPt > 21 {
+			exponentialNotation = true
+		} else {
+			minNDigits = decPt
 		}
-		if len(buf) == stop {
-			k++
-			lastCh = '0'
-			break
+	case ModeFixed:
+		if precision >= 0 {
+			minNDigits = decPt + precision
+		} else {
+			minNDigits = decPt
+		}
+	case ModeExponential:
+		//                    JS_ASSERT(precision > 0);
+		minNDigits = precision
+		fallthrough
+	case ModeStandardExponential:
+		exponentialNotation = true
+	case ModePrecision:
+		//                    JS_ASSERT(precision > 0);
+		minNDigits = precision
+		if decPt < -5 || decPt > precision {
+			exponentialNotation = true
 		}
 	}
-	buf = append(buf, lastCh+1)
-	return buf, k
-}
 
-func setWord0(d float64, w uint32) float64 {
-	dBits := math.Float64bits(d)
-	return math.Float64frombits(uint64(w)<<32 | dBits&0xffffffff)
-}
-
-func _word0(d float64) uint32 {
-	dBits := math.Float64bits(d)
-	return uint32(dBits >> 32)
-}
-
-func _word1(d float64) uint32 {
-	dBits := math.Float64bits(d)
-	return uint32(dBits)
-}
-
-func stripTrailingZeroes(buf []byte) []byte {
-	bl := len(buf) - 1
-	for bl >= 0 && buf[bl] == '0' {
-		bl--
+	for nDigits < minNDigits {
+		buffer = append(buffer, '0')
+		nDigits++
 	}
-	return buf[:bl+1]
-}
 
-/* Set b = b * 5^k.  k must be nonnegative. */
-func pow5mult(b *big.Int, k int) *big.Int {
-	if k < (1 << (len(pow5Cache) + 2)) {
-		i := k & 3
-		if i != 0 {
-			b.Mul(b, p05[i-1])
+	if exponentialNotation {
+		/* Insert a decimal point if more than one significand digit */
+		if nDigits != 1 {
+			buffer = insert(buffer, startPos+1, '.')
 		}
-		k >>= 2
-		i = 0
-		for {
-			if k&1 != 0 {
-				b.Mul(b, pow5Cache[i])
-			}
-			k >>= 1
-			if k == 0 {
-				break
-			}
-			i++
+		buffer = append(buffer, 'e')
+		if decPt-1 >= 0 {
+			buffer = append(buffer, '+')
 		}
-		return b
-	}
-	return b.Mul(b, new(big.Int).Exp(big5, big.NewInt(int64(k)), nil))
-}
-
-func roundOff(buf []byte) ([]byte, bool) {
-	i := len(buf)
-	stop := 0
-	if i > 0 && buf[0] == '-' {
-		stop = 1
-	}
-	for i != stop {
-		i--
-		if buf[i] != '9' {
-			buf[i]++
-			return buf[:i+1], false
+		buffer = strconv.AppendInt(buffer, int64(decPt-1), 10)
+	} else if decPt != nDigits {
+		/* Some kind of a fraction in fixed notation */
+		//                JS_ASSERT(decPt <= nDigits);
+		if decPt > 0 {
+			/* dd...dd . dd...dd */
+			buffer = insert(buffer, startPos+decPt, '.')
+		} else {
+			/* 0 . 00...00dd...dd */
+			buffer = expand(buffer, 2-decPt)
+			copy(buffer[startPos+2-decPt:], buffer[startPos:])
+			buffer[startPos] = '0'
+			buffer[startPos+1] = '.'
+			for i := startPos + 2; i < startPos+2-decPt; i++ {
+				buffer[i] = '0'
+			}
 		}
 	}
-	return buf[:stop], true
-}
 
-func init() {
-	p := big.NewInt(625)
-	pow5Cache[0] = p
-	for i := 1; i < len(pow5Cache); i++ {
-		p = new(big.Int).Mul(p, p)
-		pow5Cache[i] = p
-	}
+	return buffer
 }
diff --git a/ftoa/ftostr_test.go b/ftoa/ftostr_test.go
index e645b598..db2441f8 100644
--- a/ftoa/ftostr_test.go
+++ b/ftoa/ftostr_test.go
@@ -33,16 +33,37 @@ func TestDtostr(t *testing.T) {
 	testFToStr(885, ModeExponential, 2, "8.9e+2", t)
 	testFToStr(25, ModeExponential, 1, "3e+1", t)
 	testFToStr(1e-6, ModeFixed, 7, "0.0000010", t)
+	testFToStr(math.Pi, ModeStandardExponential, 0, "3.141592653589793e+0", t)
 	testFToStr(math.Inf(1), ModeStandard, 0, "Infinity", t)
 	testFToStr(math.NaN(), ModeStandard, 0, "NaN", t)
 	testFToStr(math.SmallestNonzeroFloat64, ModeExponential, 40, "4.940656458412465441765687928682213723651e-324", t)
+	testFToStr(3.5844466002796428e+298, ModeStandard, 0, "3.5844466002796428e+298", t)
+	testFToStr(math.Float64frombits(0x0010000000000000), ModeStandard, 0, "2.2250738585072014e-308", t) // smallest normal
+	testFToStr(math.Float64frombits(0x000FFFFFFFFFFFFF), ModeStandard, 0, "2.225073858507201e-308", t)  // largest denormal
+	testFToStr(4294967272.0, ModePrecision, 14, "4294967272.0000", t)
 }
 
 func BenchmarkDtostrSmall(b *testing.B) {
 	var buf [128]byte
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
-		FToStr(math.Pi, ModeExponential, 0, buf[:0])
+		FToStr(math.Pi, ModeStandardExponential, 0, buf[:0])
+	}
+}
+
+func BenchmarkDtostrShort(b *testing.B) {
+	var buf [128]byte
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		FToStr(3.1415, ModeStandard, 0, buf[:0])
+	}
+}
+
+func BenchmarkDtostrFixed(b *testing.B) {
+	var buf [128]byte
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		FToStr(math.Pi, ModeFixed, 4, buf[:0])
 	}
 }
 
diff --git a/ftoa/internal/fast/LICENSE_V8 b/ftoa/internal/fast/LICENSE_V8
new file mode 100644
index 00000000..bbad2662
--- /dev/null
+++ b/ftoa/internal/fast/LICENSE_V8
@@ -0,0 +1,26 @@
+Copyright 2014, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the name of Google Inc. nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ftoa/internal/fast/cachedpower.go b/ftoa/internal/fast/cachedpower.go
new file mode 100644
index 00000000..4f7e49fc
--- /dev/null
+++ b/ftoa/internal/fast/cachedpower.go
@@ -0,0 +1,120 @@
+package fast
+
+import "math"
+
+const (
+	kCachedPowersOffset      = 348                 // -1 * the first decimal_exponent.
+	kD_1_LOG2_10             = 0.30102999566398114 //  1 / lg(10)
+	kDecimalExponentDistance = 8
+)
+
+type cachedPower struct {
+	significand      uint64
+	binary_exponent  int16
+	decimal_exponent int16
+}
+
+var (
+	cachedPowers = [...]cachedPower{
+		{0xFA8FD5A0081C0288, -1220, -348},
+		{0xBAAEE17FA23EBF76, -1193, -340},
+		{0x8B16FB203055AC76, -1166, -332},
+		{0xCF42894A5DCE35EA, -1140, -324},
+		{0x9A6BB0AA55653B2D, -1113, -316},
+		{0xE61ACF033D1A45DF, -1087, -308},
+		{0xAB70FE17C79AC6CA, -1060, -300},
+		{0xFF77B1FCBEBCDC4F, -1034, -292},
+		{0xBE5691EF416BD60C, -1007, -284},
+		{0x8DD01FAD907FFC3C, -980, -276},
+		{0xD3515C2831559A83, -954, -268},
+		{0x9D71AC8FADA6C9B5, -927, -260},
+		{0xEA9C227723EE8BCB, -901, -252},
+		{0xAECC49914078536D, -874, -244},
+		{0x823C12795DB6CE57, -847, -236},
+		{0xC21094364DFB5637, -821, -228},
+		{0x9096EA6F3848984F, -794, -220},
+		{0xD77485CB25823AC7, -768, -212},
+		{0xA086CFCD97BF97F4, -741, -204},
+		{0xEF340A98172AACE5, -715, -196},
+		{0xB23867FB2A35B28E, -688, -188},
+		{0x84C8D4DFD2C63F3B, -661, -180},
+		{0xC5DD44271AD3CDBA, -635, -172},
+		{0x936B9FCEBB25C996, -608, -164},
+		{0xDBAC6C247D62A584, -582, -156},
+		{0xA3AB66580D5FDAF6, -555, -148},
+		{0xF3E2F893DEC3F126, -529, -140},
+		{0xB5B5ADA8AAFF80B8, -502, -132},
+		{0x87625F056C7C4A8B, -475, -124},
+		{0xC9BCFF6034C13053, -449, -116},
+		{0x964E858C91BA2655, -422, -108},
+		{0xDFF9772470297EBD, -396, -100},
+		{0xA6DFBD9FB8E5B88F, -369, -92},
+		{0xF8A95FCF88747D94, -343, -84},
+		{0xB94470938FA89BCF, -316, -76},
+		{0x8A08F0F8BF0F156B, -289, -68},
+		{0xCDB02555653131B6, -263, -60},
+		{0x993FE2C6D07B7FAC, -236, -52},
+		{0xE45C10C42A2B3B06, -210, -44},
+		{0xAA242499697392D3, -183, -36},
+		{0xFD87B5F28300CA0E, -157, -28},
+		{0xBCE5086492111AEB, -130, -20},
+		{0x8CBCCC096F5088CC, -103, -12},
+		{0xD1B71758E219652C, -77, -4},
+		{0x9C40000000000000, -50, 4},
+		{0xE8D4A51000000000, -24, 12},
+		{0xAD78EBC5AC620000, 3, 20},
+		{0x813F3978F8940984, 30, 28},
+		{0xC097CE7BC90715B3, 56, 36},
+		{0x8F7E32CE7BEA5C70, 83, 44},
+		{0xD5D238A4ABE98068, 109, 52},
+		{0x9F4F2726179A2245, 136, 60},
+		{0xED63A231D4C4FB27, 162, 68},
+		{0xB0DE65388CC8ADA8, 189, 76},
+		{0x83C7088E1AAB65DB, 216, 84},
+		{0xC45D1DF942711D9A, 242, 92},
+		{0x924D692CA61BE758, 269, 100},
+		{0xDA01EE641A708DEA, 295, 108},
+		{0xA26DA3999AEF774A, 322, 116},
+		{0xF209787BB47D6B85, 348, 124},
+		{0xB454E4A179DD1877, 375, 132},
+		{0x865B86925B9BC5C2, 402, 140},
+		{0xC83553C5C8965D3D, 428, 148},
+		{0x952AB45CFA97A0B3, 455, 156},
+		{0xDE469FBD99A05FE3, 481, 164},
+		{0xA59BC234DB398C25, 508, 172},
+		{0xF6C69A72A3989F5C, 534, 180},
+		{0xB7DCBF5354E9BECE, 561, 188},
+		{0x88FCF317F22241E2, 588, 196},
+		{0xCC20CE9BD35C78A5, 614, 204},
+		{0x98165AF37B2153DF, 641, 212},
+		{0xE2A0B5DC971F303A, 667, 220},
+		{0xA8D9D1535CE3B396, 694, 228},
+		{0xFB9B7CD9A4A7443C, 720, 236},
+		{0xBB764C4CA7A44410, 747, 244},
+		{0x8BAB8EEFB6409C1A, 774, 252},
+		{0xD01FEF10A657842C, 800, 260},
+		{0x9B10A4E5E9913129, 827, 268},
+		{0xE7109BFBA19C0C9D, 853, 276},
+		{0xAC2820D9623BF429, 880, 284},
+		{0x80444B5E7AA7CF85, 907, 292},
+		{0xBF21E44003ACDD2D, 933, 300},
+		{0x8E679C2F5E44FF8F, 960, 308},
+		{0xD433179D9C8CB841, 986, 316},
+		{0x9E19DB92B4E31BA9, 1013, 324},
+		{0xEB96BF6EBADF77D9, 1039, 332},
+		{0xAF87023B9BF0EE6B, 1066, 340},
+	}
+)
+
+func getCachedPowerForBinaryExponentRange(min_exponent, max_exponent int) (power diyfp, decimal_exponent int) {
+	kQ := diyFpKSignificandSize
+	k := int(math.Ceil(float64(min_exponent+kQ-1) * kD_1_LOG2_10))
+	index := (kCachedPowersOffset+k-1)/kDecimalExponentDistance + 1
+	cached_power := cachedPowers[index]
+	_DCHECK(min_exponent <= int(cached_power.binary_exponent))
+	_DCHECK(int(cached_power.binary_exponent) <= max_exponent)
+	decimal_exponent = int(cached_power.decimal_exponent)
+	power = diyfp{f: cached_power.significand, e: int(cached_power.binary_exponent)}
+
+	return
+}
diff --git a/ftoa/internal/fast/common.go b/ftoa/internal/fast/common.go
new file mode 100644
index 00000000..6ffaaf92
--- /dev/null
+++ b/ftoa/internal/fast/common.go
@@ -0,0 +1,18 @@
+/*
+Package fast contains code ported from V8 (https://github.com/v8/v8/blob/master/src/numbers/fast-dtoa.cc)
+
+See LICENSE_V8 for the original copyright message and disclaimer.
+*/
+package fast
+
+import "errors"
+
+var (
+	dcheckFailure = errors.New("DCHECK assertion failed")
+)
+
+func _DCHECK(f bool) {
+	if !f {
+		panic(dcheckFailure)
+	}
+}
diff --git a/ftoa/internal/fast/diyfp.go b/ftoa/internal/fast/diyfp.go
new file mode 100644
index 00000000..727a7472
--- /dev/null
+++ b/ftoa/internal/fast/diyfp.go
@@ -0,0 +1,152 @@
+package fast
+
+import "math"
+
+const (
+	diyFpKSignificandSize        = 64
+	kSignificandSize             = 53
+	kUint64MSB            uint64 = 1 << 63
+
+	kSignificandMask = 0x000FFFFFFFFFFFFF
+	kHiddenBit       = 0x0010000000000000
+	kExponentMask    = 0x7FF0000000000000
+
+	kPhysicalSignificandSize = 52 // Excludes the hidden bit.
+	kExponentBias            = 0x3FF + kPhysicalSignificandSize
+	kDenormalExponent        = -kExponentBias + 1
+)
+
+type double float64
+
+type diyfp struct {
+	f uint64
+	e int
+}
+
+// f =- o.
+// The exponents of both numbers must be the same and the significand of this
+// must be bigger than the significand of other.
+// The result will not be normalized.
+func (f *diyfp) subtract(o diyfp) {
+	_DCHECK(f.e == o.e)
+	_DCHECK(f.f >= o.f)
+	f.f -= o.f
+}
+
+// Returns f - o
+// The exponents of both numbers must be the same and this must be bigger
+// than other. The result will not be normalized.
+func (f diyfp) minus(o diyfp) diyfp {
+	res := f
+	res.subtract(o)
+	return res
+}
+
+// f *= o
+func (f *diyfp) mul(o diyfp) {
+	// Simply "emulates" a 128 bit multiplication.
+	// However: the resulting number only contains 64 bits. The least
+	// significant 64 bits are only used for rounding the most significant 64
+	// bits.
+	const kM32 uint64 = 0xFFFFFFFF
+	a := f.f >> 32
+	b := f.f & kM32
+	c := o.f >> 32
+	d := o.f & kM32
+	ac := a * c
+	bc := b * c
+	ad := a * d
+	bd := b * d
+	tmp := (bd >> 32) + (ad & kM32) + (bc & kM32)
+	// By adding 1U << 31 to tmp we round the final result.
+	// Halfway cases will be round up.
+	tmp += 1 << 31
+	result_f := ac + (ad >> 32) + (bc >> 32) + (tmp >> 32)
+	f.e += o.e + 64
+	f.f = result_f
+}
+
+// Returns f * o
+func (f diyfp) times(o diyfp) diyfp {
+	res := f
+	res.mul(o)
+	return res
+}
+
+func (f *diyfp) _normalize() {
+	f_, e := f.f, f.e
+	// This method is mainly called for normalizing boundaries. In general
+	// boundaries need to be shifted by 10 bits. We thus optimize for this case.
+	const k10MSBits uint64 = 0x3FF << 54
+	for f_&k10MSBits == 0 {
+		f_ <<= 10
+		e -= 10
+	}
+	for f_&kUint64MSB == 0 {
+		f_ <<= 1
+		e--
+	}
+	f.f, f.e = f_, e
+}
+
+func normalizeDiyfp(f diyfp) diyfp {
+	res := f
+	res._normalize()
+	return res
+}
+
+// f must be strictly greater than 0.
+func (d double) toNormalizedDiyfp() diyfp {
+	f, e := d.sigExp()
+
+	// The current float could be a denormal.
+	for (f & kHiddenBit) == 0 {
+		f <<= 1
+		e--
+	}
+	// Do the final shifts in one go.
+	f <<= diyFpKSignificandSize - kSignificandSize
+	e -= diyFpKSignificandSize - kSignificandSize
+	return diyfp{f, e}
+}
+
+// Returns the two boundaries of this.
+// The bigger boundary (m_plus) is normalized. The lower boundary has the same
+// exponent as m_plus.
+// Precondition: the value encoded by this Double must be greater than 0.
+func (d double) normalizedBoundaries() (m_minus, m_plus diyfp) {
+	v := d.toDiyFp()
+	significand_is_zero := v.f == kHiddenBit
+	m_plus = normalizeDiyfp(diyfp{f: (v.f << 1) + 1, e: v.e - 1})
+	if significand_is_zero && v.e != kDenormalExponent {
+		// The boundary is closer. Think of v = 1000e10 and v- = 9999e9.
+		// Then the boundary (== (v - v-)/2) is not just at a distance of 1e9 but
+		// at a distance of 1e8.
+		// The only exception is for the smallest normal: the largest denormal is
+		// at the same distance as its successor.
+		// Note: denormals have the same exponent as the smallest normals.
+		m_minus = diyfp{f: (v.f << 2) - 1, e: v.e - 2}
+	} else {
+		m_minus = diyfp{f: (v.f << 1) - 1, e: v.e - 1}
+	}
+	m_minus.f <<= m_minus.e - m_plus.e
+	m_minus.e = m_plus.e
+	return
+}
+
+func (d double) toDiyFp() diyfp {
+	f, e := d.sigExp()
+	return diyfp{f: f, e: e}
+}
+
+func (d double) sigExp() (significand uint64, exponent int) {
+	d64 := math.Float64bits(float64(d))
+	significand = d64 & kSignificandMask
+	if d64&kExponentMask != 0 { // not denormal
+		significand += kHiddenBit
+		exponent = int((d64&kExponentMask)>>kPhysicalSignificandSize) - kExponentBias
+	} else {
+		exponent = kDenormalExponent
+	}
+	return
+}
diff --git a/ftoa/internal/fast/dtoa.go b/ftoa/internal/fast/dtoa.go
new file mode 100644
index 00000000..a82e31e9
--- /dev/null
+++ b/ftoa/internal/fast/dtoa.go
@@ -0,0 +1,624 @@
+package fast
+
+import (
+	"fmt"
+	"strconv"
+)
+
+const (
+	kMinimalTargetExponent = -60
+	kMaximalTargetExponent = -32
+
+	kTen4 = 10000
+	kTen5 = 100000
+	kTen6 = 1000000
+	kTen7 = 10000000
+	kTen8 = 100000000
+	kTen9 = 1000000000
+)
+
+type Mode int
+
+const (
+	ModeShortest Mode = iota
+	ModePrecision
+)
+
+// Adjusts the last digit of the generated number, and screens out generated
+// solutions that may be inaccurate. A solution may be inaccurate if it is
+// outside the safe interval, or if we cannot prove that it is closer to the
+// input than a neighboring representation of the same length.
+//
+// Input: * buffer containing the digits of too_high / 10^kappa
+//        * distance_too_high_w == (too_high - w).f() * unit
+//        * unsafe_interval == (too_high - too_low).f() * unit
+//        * rest = (too_high - buffer * 10^kappa).f() * unit
+//        * ten_kappa = 10^kappa * unit
+//        * unit = the common multiplier
+// Output: returns true if the buffer is guaranteed to contain the closest
+//    representable number to the input.
+//  Modifies the generated digits in the buffer to approach (round towards) w.
+func roundWeed(buffer []byte, distance_too_high_w, unsafe_interval, rest, ten_kappa, unit uint64) bool {
+	small_distance := distance_too_high_w - unit
+	big_distance := distance_too_high_w + unit
+
+	// Let w_low  = too_high - big_distance, and
+	//     w_high = too_high - small_distance.
+	// Note: w_low < w < w_high
+	//
+	// The real w (* unit) must lie somewhere inside the interval
+	// ]w_low; w_high[ (often written as "(w_low; w_high)")
+
+	// Basically the buffer currently contains a number in the unsafe interval
+	// ]too_low; too_high[ with too_low < w < too_high
+	//
+	//  too_high - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+	//                     ^v 1 unit            ^      ^                 ^      ^
+	//  boundary_high ---------------------     .      .                 .      .
+	//                     ^v 1 unit            .      .                 .      .
+	//   - - - - - - - - - - - - - - - - - - -  +  - - + - - - - - -     .      .
+	//                                          .      .         ^       .      .
+	//                                          .  big_distance  .       .      .
+	//                                          .      .         .       .    rest
+	//                              small_distance     .         .       .      .
+	//                                          v      .         .       .      .
+	//  w_high - - - - - - - - - - - - - - - - - -     .         .       .      .
+	//                     ^v 1 unit                   .         .       .      .
+	//  w ----------------------------------------     .         .       .      .
+	//                     ^v 1 unit                   v         .       .      .
+	//  w_low  - - - - - - - - - - - - - - - - - - - - -         .       .      .
+	//                                                           .       .      v
+	//  buffer --------------------------------------------------+-------+--------
+	//                                                           .       .
+	//                                                  safe_interval    .
+	//                                                           v       .
+	//   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -     .
+	//                     ^v 1 unit                                     .
+	//  boundary_low -------------------------                     unsafe_interval
+	//                     ^v 1 unit                                     v
+	//  too_low  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+	//
+	//
+	// Note that the value of buffer could lie anywhere inside the range too_low
+	// to too_high.
+	//
+	// boundary_low, boundary_high and w are approximations of the real boundaries
+	// and v (the input number). They are guaranteed to be precise up to one unit.
+	// In fact the error is guaranteed to be strictly less than one unit.
+	//
+	// Anything that lies outside the unsafe interval is guaranteed not to round
+	// to v when read again.
+	// Anything that lies inside the safe interval is guaranteed to round to v
+	// when read again.
+	// If the number inside the buffer lies inside the unsafe interval but not
+	// inside the safe interval then we simply do not know and bail out (returning
+	// false).
+	//
+	// Similarly we have to take into account the imprecision of 'w' when finding
+	// the closest representation of 'w'. If we have two potential
+	// representations, and one is closer to both w_low and w_high, then we know
+	// it is closer to the actual value v.
+	//
+	// By generating the digits of too_high we got the largest (closest to
+	// too_high) buffer that is still in the unsafe interval. In the case where
+	// w_high < buffer < too_high we try to decrement the buffer.
+	// This way the buffer approaches (rounds towards) w.
+	// There are 3 conditions that stop the decrementation process:
+	//   1) the buffer is already below w_high
+	//   2) decrementing the buffer would make it leave the unsafe interval
+	//   3) decrementing the buffer would yield a number below w_high and farther
+	//      away than the current number. In other words:
+	//              (buffer{-1} < w_high) && w_high - buffer{-1} > buffer - w_high
+	// Instead of using the buffer directly we use its distance to too_high.
+	// Conceptually rest ~= too_high - buffer
+	// We need to do the following tests in this order to avoid over- and
+	// underflows.
+	_DCHECK(rest <= unsafe_interval)
+	for rest < small_distance && // Negated condition 1
+		unsafe_interval-rest >= ten_kappa && // Negated condition 2
+		(rest+ten_kappa < small_distance || // buffer{-1} > w_high
+			small_distance-rest >= rest+ten_kappa-small_distance) {
+		buffer[len(buffer)-1]--
+		rest += ten_kappa
+	}
+
+	// We have approached w+ as much as possible. We now test if approaching w-
+	// would require changing the buffer. If yes, then we have two possible
+	// representations close to w, but we cannot decide which one is closer.
+	if rest < big_distance && unsafe_interval-rest >= ten_kappa &&
+		(rest+ten_kappa < big_distance ||
+			big_distance-rest > rest+ten_kappa-big_distance) {
+		return false
+	}
+
+	// Weeding test.
+	//   The safe interval is [too_low + 2 ulp; too_high - 2 ulp]
+	//   Since too_low = too_high - unsafe_interval this is equivalent to
+	//      [too_high - unsafe_interval + 4 ulp; too_high - 2 ulp]
+	//   Conceptually we have: rest ~= too_high - buffer
+	return (2*unit <= rest) && (rest <= unsafe_interval-4*unit)
+}
+
+// Rounds the buffer upwards if the result is closer to v by possibly adding
+// 1 to the buffer. If the precision of the calculation is not sufficient to
+// round correctly, return false.
+// The rounding might shift the whole buffer in which case the kappa is
+// adjusted. For example "99", kappa = 3 might become "10", kappa = 4.
+//
+// If 2*rest > ten_kappa then the buffer needs to be round up.
+// rest can have an error of +/- 1 unit. This function accounts for the
+// imprecision and returns false, if the rounding direction cannot be
+// unambiguously determined.
+//
+// Precondition: rest < ten_kappa.
+func roundWeedCounted(buffer []byte, rest, ten_kappa, unit uint64, kappa *int) bool {
+	_DCHECK(rest < ten_kappa)
+	// The following tests are done in a specific order to avoid overflows. They
+	// will work correctly with any uint64 values of rest < ten_kappa and unit.
+	//
+	// If the unit is too big, then we don't know which way to round. For example
+	// a unit of 50 means that the real number lies within rest +/- 50. If
+	// 10^kappa == 40 then there is no way to tell which way to round.
+	if unit >= ten_kappa {
+		return false
+	}
+	// Even if unit is just half the size of 10^kappa we are already completely
+	// lost. (And after the previous test we know that the expression will not
+	// over/underflow.)
+	if ten_kappa-unit <= unit {
+		return false
+	}
+	// If 2 * (rest + unit) <= 10^kappa we can safely round down.
+	if (ten_kappa-rest > rest) && (ten_kappa-2*rest >= 2*unit) {
+		return true
+	}
+
+	// If 2 * (rest - unit) >= 10^kappa, then we can safely round up.
+	if (rest > unit) && (ten_kappa-(rest-unit) <= (rest - unit)) {
+		// Increment the last digit recursively until we find a non '9' digit.
+		buffer[len(buffer)-1]++
+		for i := len(buffer) - 1; i > 0; i-- {
+			if buffer[i] != '0'+10 {
+				break
+			}
+			buffer[i] = '0'
+			buffer[i-1]++
+		}
+		// If the first digit is now '0'+ 10 we had a buffer with all '9's. With the
+		// exception of the first digit all digits are now '0'. Simply switch the
+		// first digit to '1' and adjust the kappa. Example: "99" becomes "10" and
+		// the power (the kappa) is increased.
+		if buffer[0] == '0'+10 {
+			buffer[0] = '1'
+			*kappa += 1
+		}
+		return true
+	}
+	return false
+}
+
+// Returns the biggest power of ten that is less than or equal than the given
+// number. We furthermore receive the maximum number of bits 'number' has.
+// If number_bits == 0 then 0^-1 is returned
+// The number of bits must be <= 32.
+// Precondition: number < (1 << (number_bits + 1)).
+func biggestPowerTen(number uint32, number_bits int) (power uint32, exponent int) {
+	switch number_bits {
+	case 32, 31, 30:
+		if kTen9 <= number {
+			power = kTen9
+			exponent = 9
+			break
+		}
+		fallthrough
+	case 29, 28, 27:
+		if kTen8 <= number {
+			power = kTen8
+			exponent = 8
+			break
+		}
+		fallthrough
+	case 26, 25, 24:
+		if kTen7 <= number {
+			power = kTen7
+			exponent = 7
+			break
+		}
+		fallthrough
+	case 23, 22, 21, 20:
+		if kTen6 <= number {
+			power = kTen6
+			exponent = 6
+			break
+		}
+		fallthrough
+	case 19, 18, 17:
+		if kTen5 <= number {
+			power = kTen5
+			exponent = 5
+			break
+		}
+		fallthrough
+	case 16, 15, 14:
+		if kTen4 <= number {
+			power = kTen4
+			exponent = 4
+			break
+		}
+		fallthrough
+	case 13, 12, 11, 10:
+		if 1000 <= number {
+			power = 1000
+			exponent = 3
+			break
+		}
+		fallthrough
+	case 9, 8, 7:
+		if 100 <= number {
+			power = 100
+			exponent = 2
+			break
+		}
+		fallthrough
+	case 6, 5, 4:
+		if 10 <= number {
+			power = 10
+			exponent = 1
+			break
+		}
+		fallthrough
+	case 3, 2, 1:
+		if 1 <= number {
+			power = 1
+			exponent = 0
+			break
+		}
+		fallthrough
+	case 0:
+		power = 0
+		exponent = -1
+	}
+	return
+}
+
+// Generates the digits of input number w.
+// w is a floating-point number (DiyFp), consisting of a significand and an
+// exponent. Its exponent is bounded by kMinimalTargetExponent and
+// kMaximalTargetExponent.
+//       Hence -60 <= w.e() <= -32.
+//
+// Returns false if it fails, in which case the generated digits in the buffer
+// should not be used.
+// Preconditions:
+//  * low, w and high are correct up to 1 ulp (unit in the last place). That
+//    is, their error must be less than a unit of their last digits.
+//  * low.e() == w.e() == high.e()
+//  * low < w < high, and taking into account their error: low~ <= high~
+//  * kMinimalTargetExponent <= w.e() <= kMaximalTargetExponent
+// Postconditions: returns false if procedure fails.
+//   otherwise:
+//     * buffer is not null-terminated, but len contains the number of digits.
+//     * buffer contains the shortest possible decimal digit-sequence
+//       such that LOW < buffer * 10^kappa < HIGH, where LOW and HIGH are the
+//       correct values of low and high (without their error).
+//     * if more than one decimal representation gives the minimal number of
+//       decimal digits then the one closest to W (where W is the correct value
+//       of w) is chosen.
+// Remark: this procedure takes into account the imprecision of its input
+//   numbers. If the precision is not enough to guarantee all the postconditions
+//   then false is returned. This usually happens rarely (~0.5%).
+//
+// Say, for the sake of example, that
+//   w.e() == -48, and w.f() == 0x1234567890ABCDEF
+// w's value can be computed by w.f() * 2^w.e()
+// We can obtain w's integral digits by simply shifting w.f() by -w.e().
+//  -> w's integral part is 0x1234
+//  w's fractional part is therefore 0x567890ABCDEF.
+// Printing w's integral part is easy (simply print 0x1234 in decimal).
+// In order to print its fraction we repeatedly multiply the fraction by 10 and
+// get each digit. Example the first digit after the point would be computed by
+//   (0x567890ABCDEF * 10) >> 48. -> 3
+// The whole thing becomes slightly more complicated because we want to stop
+// once we have enough digits. That is, once the digits inside the buffer
+// represent 'w' we can stop. Everything inside the interval low - high
+// represents w. However we have to pay attention to low, high and w's
+// imprecision.
+func digitGen(low, w, high diyfp, buffer []byte) (kappa int, buf []byte, res bool) {
+	_DCHECK(low.e == w.e && w.e == high.e)
+	_DCHECK(low.f+1 <= high.f-1)
+	_DCHECK(kMinimalTargetExponent <= w.e && w.e <= kMaximalTargetExponent)
+	// low, w and high are imprecise, but by less than one ulp (unit in the last
+	// place).
+	// If we remove (resp. add) 1 ulp from low (resp. high) we are certain that
+	// the new numbers are outside of the interval we want the final
+	// representation to lie in.
+	// Inversely adding (resp. removing) 1 ulp from low (resp. high) would yield
+	// numbers that are certain to lie in the interval. We will use this fact
+	// later on.
+	// We will now start by generating the digits within the uncertain
+	// interval. Later we will weed out representations that lie outside the safe
+	// interval and thus _might_ lie outside the correct interval.
+	unit := uint64(1)
+	too_low := diyfp{f: low.f - unit, e: low.e}
+	too_high := diyfp{f: high.f + unit, e: high.e}
+	// too_low and too_high are guaranteed to lie outside the interval we want the
+	// generated number in.
+	unsafe_interval := too_high.minus(too_low)
+	// We now cut the input number into two parts: the integral digits and the
+	// fractionals. We will not write any decimal separator though, but adapt
+	// kappa instead.
+	// Reminder: we are currently computing the digits (stored inside the buffer)
+	// such that:   too_low < buffer * 10^kappa < too_high
+	// We use too_high for the digit_generation and stop as soon as possible.
+	// If we stop early we effectively round down.
+	one := diyfp{f: 1 << -w.e, e: w.e}
+	// Division by one is a shift.
+	integrals := uint32(too_high.f >> -one.e)
+	// Modulo by one is an and.
+	fractionals := too_high.f & (one.f - 1)
+	divisor, divisor_exponent := biggestPowerTen(integrals, diyFpKSignificandSize-(-one.e))
+	kappa = divisor_exponent + 1
+	buf = buffer
+	for kappa > 0 {
+		digit := int(integrals / divisor)
+		buf = append(buf, byte('0'+digit))
+		integrals %= divisor
+		kappa--
+		// Note that kappa now equals the exponent of the divisor and that the
+		// invariant thus holds again.
+		rest := uint64(integrals)<<-one.e + fractionals
+		// Invariant: too_high = buffer * 10^kappa + DiyFp(rest, one.e)
+		// Reminder: unsafe_interval.e == one.e
+		if rest < unsafe_interval.f {
+			// Rounding down (by not emitting the remaining digits) yields a number
+			// that lies within the unsafe interval.
+			res = roundWeed(buf, too_high.minus(w).f,
+				unsafe_interval.f, rest,
+				uint64(divisor)<<-one.e, unit)
+			return
+		}
+		divisor /= 10
+	}
+	// The integrals have been generated. We are at the point of the decimal
+	// separator. In the following loop we simply multiply the remaining digits by
+	// 10 and divide by one. We just need to pay attention to multiply associated
+	// data (like the interval or 'unit'), too.
+	// Note that the multiplication by 10 does not overflow, because w.e >= -60
+	// and thus one.e >= -60.
+	_DCHECK(one.e >= -60)
+	_DCHECK(fractionals < one.f)
+	_DCHECK(0xFFFFFFFFFFFFFFFF/10 >= one.f)
+	for {
+		fractionals *= 10
+		unit *= 10
+		unsafe_interval.f *= 10
+		// Integer division by one.
+		digit := byte(fractionals >> -one.e)
+		buf = append(buf, '0'+digit)
+		fractionals &= one.f - 1 // Modulo by one.
+		kappa--
+		if fractionals < unsafe_interval.f {
+			res = roundWeed(buf, too_high.minus(w).f*unit, unsafe_interval.f, fractionals, one.f, unit)
+			return
+		}
+	}
+}
+
+// Generates (at most) requested_digits of input number w.
+// w is a floating-point number (DiyFp), consisting of a significand and an
+// exponent. Its exponent is bounded by kMinimalTargetExponent and
+// kMaximalTargetExponent.
+//       Hence -60 <= w.e() <= -32.
+//
+// Returns false if it fails, in which case the generated digits in the buffer
+// should not be used.
+// Preconditions:
+//  * w is correct up to 1 ulp (unit in the last place). That
+//    is, its error must be strictly less than a unit of its last digit.
+//  * kMinimalTargetExponent <= w.e() <= kMaximalTargetExponent
+//
+// Postconditions: returns false if procedure fails.
+//   otherwise:
+//     * buffer is not null-terminated, but length contains the number of
+//       digits.
+//     * the representation in buffer is the most precise representation of
+//       requested_digits digits.
+//     * buffer contains at most requested_digits digits of w. If there are less
+//       than requested_digits digits then some trailing '0's have been removed.
+//     * kappa is such that
+//            w = buffer * 10^kappa + eps with |eps| < 10^kappa / 2.
+//
+// Remark: This procedure takes into account the imprecision of its input
+//   numbers. If the precision is not enough to guarantee all the postconditions
+//   then false is returned. This usually happens rarely, but the failure-rate
+//   increases with higher requested_digits.
+func digitGenCounted(w diyfp, requested_digits int, buffer []byte) (kappa int, buf []byte, res bool) {
+	_DCHECK(kMinimalTargetExponent <= w.e && w.e <= kMaximalTargetExponent)
+
+	// w is assumed to have an error less than 1 unit. Whenever w is scaled we
+	// also scale its error.
+	w_error := uint64(1)
+	// We cut the input number into two parts: the integral digits and the
+	// fractional digits. We don't emit any decimal separator, but adapt kappa
+	// instead. Example: instead of writing "1.2" we put "12" into the buffer and
+	// increase kappa by 1.
+	one := diyfp{f: 1 << -w.e, e: w.e}
+	// Division by one is a shift.
+	integrals := uint32(w.f >> -one.e)
+	// Modulo by one is an and.
+	fractionals := w.f & (one.f - 1)
+	divisor, divisor_exponent := biggestPowerTen(integrals, diyFpKSignificandSize-(-one.e))
+	kappa = divisor_exponent + 1
+	buf = buffer
+	// Loop invariant: buffer = w / 10^kappa  (integer division)
+	// The invariant holds for the first iteration: kappa has been initialized
+	// with the divisor exponent + 1. And the divisor is the biggest power of ten
+	// that is smaller than 'integrals'.
+	for kappa > 0 {
+		digit := byte(integrals / divisor)
+		buf = append(buf, '0'+digit)
+		requested_digits--
+		integrals %= divisor
+		kappa--
+		// Note that kappa now equals the exponent of the divisor and that the
+		// invariant thus holds again.
+		if requested_digits == 0 {
+			break
+		}
+		divisor /= 10
+	}
+
+	if requested_digits == 0 {
+		rest := uint64(integrals)<<-one.e + fractionals
+		res = roundWeedCounted(buf, rest, uint64(divisor)<<-one.e, w_error, &kappa)
+		return
+	}
+
+	// The integrals have been generated. We are at the point of the decimal
+	// separator. In the following loop we simply multiply the remaining digits by
+	// 10 and divide by one. We just need to pay attention to multiply associated
+	// data (the 'unit'), too.
+	// Note that the multiplication by 10 does not overflow, because w.e >= -60
+	// and thus one.e >= -60.
+	_DCHECK(one.e >= -60)
+	_DCHECK(fractionals < one.f)
+	_DCHECK(0xFFFFFFFFFFFFFFFF/10 >= one.f)
+	for requested_digits > 0 && fractionals > w_error {
+		fractionals *= 10
+		w_error *= 10
+		// Integer division by one.
+		digit := byte(fractionals >> -one.e)
+		buf = append(buf, '0'+digit)
+		requested_digits--
+		fractionals &= one.f - 1 // Modulo by one.
+		kappa--
+	}
+	if requested_digits != 0 {
+		res = false
+	} else {
+		res = roundWeedCounted(buf, fractionals, one.f, w_error, &kappa)
+	}
+	return
+}
+
+// Provides a decimal representation of v.
+// Returns true if it succeeds, otherwise the result cannot be trusted.
+// There will be *length digits inside the buffer (not null-terminated).
+// If the function returns true then
+//        v == (double) (buffer * 10^decimal_exponent).
+// The digits in the buffer are the shortest representation possible: no
+// 0.09999999999999999 instead of 0.1. The shorter representation will even be
+// chosen even if the longer one would be closer to v.
+// The last digit will be closest to the actual v. That is, even if several
+// digits might correctly yield 'v' when read again, the closest will be
+// computed.
+func grisu3(f float64, buffer []byte) (digits []byte, decimal_exponent int, result bool) {
+	v := double(f)
+	w := v.toNormalizedDiyfp()
+
+	// boundary_minus and boundary_plus are the boundaries between v and its
+	// closest floating-point neighbors. Any number strictly between
+	// boundary_minus and boundary_plus will round to v when convert to a double.
+	// Grisu3 will never output representations that lie exactly on a boundary.
+	boundary_minus, boundary_plus := v.normalizedBoundaries()
+	ten_mk_minimal_binary_exponent := kMinimalTargetExponent - (w.e + diyFpKSignificandSize)
+	ten_mk_maximal_binary_exponent := kMaximalTargetExponent - (w.e + diyFpKSignificandSize)
+	ten_mk, mk := getCachedPowerForBinaryExponentRange(ten_mk_minimal_binary_exponent, ten_mk_maximal_binary_exponent)
+
+	_DCHECK(
+		(kMinimalTargetExponent <=
+			w.e+ten_mk.e+diyFpKSignificandSize) &&
+			(kMaximalTargetExponent >= w.e+ten_mk.e+diyFpKSignificandSize))
+	// Note that ten_mk is only an approximation of 10^-k. A DiyFp only contains a
+	// 64 bit significand and ten_mk is thus only precise up to 64 bits.
+
+	// The DiyFp::Times procedure rounds its result, and ten_mk is approximated
+	// too. The variable scaled_w (as well as scaled_boundary_minus/plus) are now
+	// off by a small amount.
+	// In fact: scaled_w - w*10^k < 1ulp (unit in the last place) of scaled_w.
+	// In other words: let f = scaled_w.f() and e = scaled_w.e(), then
+	//           (f-1) * 2^e < w*10^k < (f+1) * 2^e
+	scaled_w := w.times(ten_mk)
+	_DCHECK(scaled_w.e ==
+		boundary_plus.e+ten_mk.e+diyFpKSignificandSize)
+	// In theory it would be possible to avoid some recomputations by computing
+	// the difference between w and boundary_minus/plus (a power of 2) and to
+	// compute scaled_boundary_minus/plus by subtracting/adding from
+	// scaled_w. However the code becomes much less readable and the speed
+	// enhancements are not terrific.
+	scaled_boundary_minus := boundary_minus.times(ten_mk)
+	scaled_boundary_plus := boundary_plus.times(ten_mk)
+	// DigitGen will generate the digits of scaled_w. Therefore we have
+	// v == (double) (scaled_w * 10^-mk).
+	// Set decimal_exponent == -mk and pass it to DigitGen. If scaled_w is not an
+	// integer than it will be updated. For instance if scaled_w == 1.23 then
+	// the buffer will be filled with "123" und the decimal_exponent will be
+	// decreased by 2.
+	var kappa int
+	kappa, digits, result = digitGen(scaled_boundary_minus, scaled_w, scaled_boundary_plus, buffer)
+	decimal_exponent = -mk + kappa
+	return
+}
+
+// The "counted" version of grisu3 (see above) only generates requested_digits
+// number of digits. This version does not generate the shortest representation,
+// and with enough requested digits 0.1 will at some point print as 0.9999999...
+// Grisu3 is too imprecise for real halfway cases (1.5 will not work) and
+// therefore the rounding strategy for halfway cases is irrelevant.
+func grisu3Counted(v float64, requested_digits int, buffer []byte) (digits []byte, decimal_exponent int, result bool) {
+	w := double(v).toNormalizedDiyfp()
+	ten_mk_minimal_binary_exponent := kMinimalTargetExponent - (w.e + diyFpKSignificandSize)
+	ten_mk_maximal_binary_exponent := kMaximalTargetExponent - (w.e + diyFpKSignificandSize)
+	ten_mk, mk := getCachedPowerForBinaryExponentRange(ten_mk_minimal_binary_exponent, ten_mk_maximal_binary_exponent)
+
+	_DCHECK(
+		(kMinimalTargetExponent <=
+			w.e+ten_mk.e+diyFpKSignificandSize) &&
+			(kMaximalTargetExponent >= w.e+ten_mk.e+diyFpKSignificandSize))
+	// Note that ten_mk is only an approximation of 10^-k. A DiyFp only contains a
+	// 64 bit significand and ten_mk is thus only precise up to 64 bits.
+
+	// The DiyFp::Times procedure rounds its result, and ten_mk is approximated
+	// too. The variable scaled_w (as well as scaled_boundary_minus/plus) are now
+	// off by a small amount.
+	// In fact: scaled_w - w*10^k < 1ulp (unit in the last place) of scaled_w.
+	// In other words: let f = scaled_w.f() and e = scaled_w.e(), then
+	//           (f-1) * 2^e < w*10^k < (f+1) * 2^e
+	scaled_w := w.times(ten_mk)
+	// We now have (double) (scaled_w * 10^-mk).
+	// DigitGen will generate the first requested_digits digits of scaled_w and
+	// return together with a kappa such that scaled_w ~= buffer * 10^kappa. (It
+	// will not always be exactly the same since DigitGenCounted only produces a
+	// limited number of digits.)
+	var kappa int
+	kappa, digits, result = digitGenCounted(scaled_w, requested_digits, buffer)
+	decimal_exponent = -mk + kappa
+
+	return
+}
+
+// v must be > 0 and must not be Inf or NaN
+func Dtoa(v float64, mode Mode, requested_digits int, buffer []byte) (digits []byte, decimal_point int, result bool) {
+	defer func() {
+		if x := recover(); x != nil {
+			if x == dcheckFailure {
+				panic(fmt.Errorf("DCHECK assertion failed while formatting %s in mode %d", strconv.FormatFloat(v, 'e', 50, 64), mode))
+			}
+			panic(x)
+		}
+	}()
+	var decimal_exponent int
+	startPos := len(buffer)
+	switch mode {
+	case ModeShortest:
+		digits, decimal_exponent, result = grisu3(v, buffer)
+	case ModePrecision:
+		digits, decimal_exponent, result = grisu3Counted(v, requested_digits, buffer)
+	}
+	if result {
+		decimal_point = len(digits) - startPos + decimal_exponent
+	} else {
+		digits = digits[:startPos]
+	}
+	return
+}