From f3f4decab8c64a9e3e1980742027dfdb0befb34a Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Sun, 7 Apr 2024 02:51:51 -0400
Subject: [PATCH 01/13] Add missing functions for `f16` and `f128`

---
 README.md        | 10 +++++-----
 src/float/add.rs |  4 ++++
 src/float/cmp.rs | 35 +++++++++++++++++++++++++++++++++++
 src/float/div.rs |  9 +++++++++
 src/float/mod.rs |  2 +-
 src/float/mul.rs |  4 ++++
 src/float/sub.rs |  5 +++++
 src/lib.rs       |  1 +
 8 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 00d547f1..ccfa96b0 100644
--- a/README.md
+++ b/README.md
@@ -232,9 +232,9 @@ These builtins are needed to support 128-bit integers.
 
 These builtins are needed to support `f16` and `f128`, which are in the process of being added to Rust.
 
-- [ ] addtf3.c
-- [ ] comparetf2.c
-- [ ] divtf3.c
+- [x] addtf3.c
+- [x] comparetf2.c
+- [x] divtf3.c
 - [x] extenddftf2.c
 - [x] extendhfsf2.c
 - [x] extendhftf2.c
@@ -249,13 +249,13 @@ These builtins are needed to support `f16` and `f128`, which are in the process
 - [ ] floatsitf.c
 - [ ] floatunditf.c
 - [ ] floatunsitf.c
-- [ ] multf3.c
+- [x] multf3.c
 - [ ] powitf2.c
 - [ ] ppc/fixtfdi.c
 - [ ] ppc/fixunstfdi.c
 - [ ] ppc/floatditf.c
 - [ ] ppc/floatunditf.c
-- [ ] subtf3.c
+- [x] subtf3.c
 - [x] truncdfhf2.c
 - [x] truncsfhf2.c
 - [x] trunctfdf2.c
diff --git a/src/float/add.rs b/src/float/add.rs
index 97f73e2f..e2fb8407 100644
--- a/src/float/add.rs
+++ b/src/float/add.rs
@@ -203,6 +203,10 @@ intrinsics! {
         add(a, b)
     }
 
+    pub extern "C" fn __addtf3(a: f128, b: f128) -> f128 {
+        add(a, b)
+    }
+
     #[cfg(target_arch = "arm")]
     pub extern "C" fn __addsf3vfp(a: f32, b: f32) -> f32 {
         a + b
diff --git a/src/float/cmp.rs b/src/float/cmp.rs
index 1c8917af..46e903dc 100644
--- a/src/float/cmp.rs
+++ b/src/float/cmp.rs
@@ -170,6 +170,41 @@ intrinsics! {
     pub extern "C" fn __gtdf2(a: f64, b: f64) -> i32 {
         cmp(a, b).to_ge_abi()
     }
+
+    #[avr_skip]
+    pub extern "C" fn __letf2(a: f128, b: f128) -> i32 {
+        cmp(a, b).to_le_abi()
+    }
+
+    #[avr_skip]
+    pub extern "C" fn __getf2(a: f128, b: f128) -> i32 {
+        cmp(a, b).to_ge_abi()
+    }
+
+    #[avr_skip]
+    pub extern "C" fn __unordtf2(a: f128, b: f128) -> i32 {
+        unord(a, b) as i32
+    }
+
+    #[avr_skip]
+    pub extern "C" fn __eqtf2(a: f128, b: f128) -> i32 {
+        cmp(a, b).to_le_abi()
+    }
+
+    #[avr_skip]
+    pub extern "C" fn __lttf2(a: f128, b: f128) -> i32 {
+        cmp(a, b).to_le_abi()
+    }
+
+    #[avr_skip]
+    pub extern "C" fn __netf2(a: f128, b: f128) -> i32 {
+        cmp(a, b).to_le_abi()
+    }
+
+    #[avr_skip]
+    pub extern "C" fn __gttf2(a: f128, b: f128) -> i32 {
+        cmp(a, b).to_ge_abi()
+    }
 }
 
 #[cfg(target_arch = "arm")]
diff --git a/src/float/div.rs b/src/float/div.rs
index d587fe4f..9038f6b9 100644
--- a/src/float/div.rs
+++ b/src/float/div.rs
@@ -914,6 +914,15 @@ intrinsics! {
         div64(a, b)
     }
 
+    // TODO: how should `HInt` be handled?
+    pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 {
+        if cfg!(target_pointer_width = "64") {
+            div32(a, b)
+        } else {
+            div64(a, b)
+        }
+    }
+
     #[cfg(target_arch = "arm")]
     pub extern "C" fn __divsf3vfp(a: f32, b: f32) -> f32 {
         a / b
diff --git a/src/float/mod.rs b/src/float/mod.rs
index a82dd7d2..02d291ed 100644
--- a/src/float/mod.rs
+++ b/src/float/mod.rs
@@ -59,7 +59,7 @@ pub(crate) trait Float:
     /// A mask for the significand
     const SIGNIFICAND_MASK: Self::Int;
 
-    // The implicit bit of the float format
+    /// The implicit bit of the float format
     const IMPLICIT_BIT: Self::Int;
 
     /// A mask for the exponent
diff --git a/src/float/mul.rs b/src/float/mul.rs
index 378fa970..eed29527 100644
--- a/src/float/mul.rs
+++ b/src/float/mul.rs
@@ -199,6 +199,10 @@ intrinsics! {
         mul(a, b)
     }
 
+    pub extern "C" fn __multf3(a: f128, b: f128) -> f128 {
+        mul(a, b)
+    }
+
     #[cfg(target_arch = "arm")]
     pub extern "C" fn __mulsf3vfp(a: f32, b: f32) -> f32 {
         a * b
diff --git a/src/float/sub.rs b/src/float/sub.rs
index 64653ee2..6bb3271a 100644
--- a/src/float/sub.rs
+++ b/src/float/sub.rs
@@ -1,5 +1,6 @@
 use crate::float::add::__adddf3;
 use crate::float::add::__addsf3;
+use crate::float::add::__addtf3;
 use crate::float::Float;
 
 intrinsics! {
@@ -15,6 +16,10 @@ intrinsics! {
         __adddf3(a, f64::from_repr(b.repr() ^ f64::SIGN_MASK))
     }
 
+    pub extern "C" fn __subtf3(a: f128, b: f128) -> f128 {
+        __addtf3(a, f128::from_repr(b.repr() ^ f128::SIGN_MASK))
+    }
+
     #[cfg(target_arch = "arm")]
     pub extern "C" fn __subsf3vfp(a: f32, b: f32) -> f32 {
         a - b
diff --git a/src/lib.rs b/src/lib.rs
index 40564178..ea376631 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,6 +2,7 @@
 #![cfg_attr(not(feature = "no-asm"), feature(asm))]
 #![feature(abi_unadjusted)]
 #![feature(asm_experimental_arch)]
+#![feature(c_unwind)]
 #![cfg_attr(not(feature = "no-asm"), feature(global_asm))]
 #![feature(cfg_target_has_atomic)]
 #![feature(compiler_builtins)]

From bbe3aefd8ca32b071953dcfd56a2560bccf45a7f Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Thu, 11 Apr 2024 05:38:46 -0400
Subject: [PATCH 02/13] Work on bigint

Try splitting part of 'Int' into 'MinInt' so we don't need to implement everything on u256/i256

Add addsub test

Add mul/div/rem tests

Add cmp test

Remove 32-bit div implementation

formatting updates

disable div tests for now

Bigint updates

Big update

Fix widen mul

wrapping add

disable duplicate symbols in builtins

Apply temporary unord fix from @beetrees #593

tests

add lowerhex display

errors by ref

tests

fix-test

Update big tests

Fix core calls

Disable widen_mul for signed

Test adding symbols in build.rs

Add a feature to compile intrinsics that are missing on the system for testing

update

Disable f128 tests on platforms without system support

add missing build.rs file

pull cas file from master

tests

print more div values

Add a benchmark

Work on fixing bit widths

Update benchmark
---
 build.rs                   |  12 --
 src/float/add.rs           |  22 +--
 src/float/cmp.rs           |   2 +-
 src/float/div.rs           |  95 ++++++----
 src/float/extend.rs        |   2 +-
 src/float/mod.rs           |   2 +-
 src/float/mul.rs           |   2 +-
 src/float/trunc.rs         |   2 +-
 src/int/addsub.rs          |  10 +-
 src/int/big.rs             | 364 +++++++++++++++++++++++++++++++++++++
 src/int/mod.rs             | 233 +++++++++++++-----------
 src/int/mul.rs             |   4 +-
 src/int/shift.rs           |   2 +-
 src/lib.rs                 |  57 ++++++
 testcrate/Cargo.toml       |   2 +
 testcrate/benches/float.rs |  90 +++++++++
 testcrate/build.rs         |  15 ++
 testcrate/src/lib.rs       |  18 +-
 testcrate/tests/addsub.rs  |  16 +-
 testcrate/tests/big.rs     | 104 +++++++++++
 testcrate/tests/cmp.rs     |  27 ++-
 testcrate/tests/div_rem.rs |  29 ++-
 testcrate/tests/mul.rs     |  14 +-
 23 files changed, 933 insertions(+), 191 deletions(-)
 create mode 100644 src/int/big.rs
 create mode 100644 testcrate/benches/float.rs
 create mode 100644 testcrate/build.rs
 create mode 100644 testcrate/tests/big.rs

diff --git a/build.rs b/build.rs
index bafbf75d..1229fb2a 100644
--- a/build.rs
+++ b/build.rs
@@ -479,10 +479,6 @@ mod c {
                 ("__floatsitf", "floatsitf.c"),
                 ("__floatunditf", "floatunditf.c"),
                 ("__floatunsitf", "floatunsitf.c"),
-                ("__addtf3", "addtf3.c"),
-                ("__multf3", "multf3.c"),
-                ("__subtf3", "subtf3.c"),
-                ("__divtf3", "divtf3.c"),
                 ("__powitf2", "powitf2.c"),
                 ("__fe_getround", "fp_mode.c"),
                 ("__fe_raise_inexact", "fp_mode.c"),
@@ -500,30 +496,22 @@ mod c {
         if target_arch == "mips64" {
             sources.extend(&[
                 ("__netf2", "comparetf2.c"),
-                ("__addtf3", "addtf3.c"),
-                ("__multf3", "multf3.c"),
-                ("__subtf3", "subtf3.c"),
                 ("__fixtfsi", "fixtfsi.c"),
                 ("__floatsitf", "floatsitf.c"),
                 ("__fixunstfsi", "fixunstfsi.c"),
                 ("__floatunsitf", "floatunsitf.c"),
                 ("__fe_getround", "fp_mode.c"),
-                ("__divtf3", "divtf3.c"),
             ]);
         }
 
         if target_arch == "loongarch64" {
             sources.extend(&[
                 ("__netf2", "comparetf2.c"),
-                ("__addtf3", "addtf3.c"),
-                ("__multf3", "multf3.c"),
-                ("__subtf3", "subtf3.c"),
                 ("__fixtfsi", "fixtfsi.c"),
                 ("__floatsitf", "floatsitf.c"),
                 ("__fixunstfsi", "fixunstfsi.c"),
                 ("__floatunsitf", "floatunsitf.c"),
                 ("__fe_getround", "fp_mode.c"),
-                ("__divtf3", "divtf3.c"),
             ]);
         }
 
diff --git a/src/float/add.rs b/src/float/add.rs
index e2fb8407..8fa9dac5 100644
--- a/src/float/add.rs
+++ b/src/float/add.rs
@@ -1,5 +1,5 @@
 use crate::float::Float;
-use crate::int::{CastInto, Int};
+use crate::int::{CastInto, Int, MinInt};
 
 /// Returns `a + b`
 fn add<F: Float>(a: F, b: F) -> F
@@ -57,9 +57,9 @@ where
         }
 
         // zero + anything = anything
-        if a_abs == Int::ZERO {
+        if a_abs == MinInt::ZERO {
             // but we need to get the sign right for zero + zero
-            if b_abs == Int::ZERO {
+            if b_abs == MinInt::ZERO {
                 return F::from_repr(a.repr() & b.repr());
             } else {
                 return b;
@@ -67,7 +67,7 @@ where
         }
 
         // anything + zero = anything
-        if b_abs == Int::ZERO {
+        if b_abs == MinInt::ZERO {
             return a;
         }
     }
@@ -113,10 +113,10 @@ where
     // Shift the significand of b by the difference in exponents, with a sticky
     // bottom bit to get rounding correct.
     let align = a_exponent.wrapping_sub(b_exponent).cast();
-    if align != Int::ZERO {
+    if align != MinInt::ZERO {
         if align < bits {
             let sticky =
-                F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != Int::ZERO);
+                F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != MinInt::ZERO);
             b_significand = (b_significand >> align.cast()) | sticky;
         } else {
             b_significand = one; // sticky; b is known to be non-zero.
@@ -125,8 +125,8 @@ where
     if subtraction {
         a_significand = a_significand.wrapping_sub(b_significand);
         // If a == -b, return +zero.
-        if a_significand == Int::ZERO {
-            return F::from_repr(Int::ZERO);
+        if a_significand == MinInt::ZERO {
+            return F::from_repr(MinInt::ZERO);
         }
 
         // If partial cancellation occured, we need to left-shift the result
@@ -143,8 +143,8 @@ where
 
         // If the addition carried up, we need to right-shift the result and
         // adjust the exponent:
-        if a_significand & implicit_bit << 4 != Int::ZERO {
-            let sticky = F::Int::from_bool(a_significand & one != Int::ZERO);
+        if a_significand & implicit_bit << 4 != MinInt::ZERO {
+            let sticky = F::Int::from_bool(a_significand & one != MinInt::ZERO);
             a_significand = a_significand >> 1 | sticky;
             a_exponent += 1;
         }
@@ -160,7 +160,7 @@ where
         // need to shift the significand.
         let shift = (1 - a_exponent).cast();
         let sticky =
-            F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != Int::ZERO);
+            F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != MinInt::ZERO);
         a_significand = a_significand >> shift.cast() | sticky;
         a_exponent = 0;
     }
diff --git a/src/float/cmp.rs b/src/float/cmp.rs
index 46e903dc..ae05a3a6 100644
--- a/src/float/cmp.rs
+++ b/src/float/cmp.rs
@@ -1,7 +1,7 @@
 #![allow(unreachable_code)]
 
 use crate::float::Float;
-use crate::int::Int;
+use crate::int::MinInt;
 
 #[derive(Clone, Copy)]
 enum Result {
diff --git a/src/float/div.rs b/src/float/div.rs
index 9038f6b9..6f64dfae 100644
--- a/src/float/div.rs
+++ b/src/float/div.rs
@@ -3,7 +3,9 @@
 #![allow(clippy::needless_return)]
 
 use crate::float::Float;
-use crate::int::{CastInto, DInt, HInt, Int};
+use crate::int::{CastInto, DInt, HInt, Int, MinInt};
+
+use super::HalfRep;
 
 fn div32<F: Float>(a: F, b: F) -> F
 where
@@ -37,6 +39,11 @@ where
     let quiet_bit = implicit_bit >> 1;
     let qnan_rep = exponent_mask | quiet_bit;
 
+    // #[inline(always)]
+    // fn negate<T: Int>(a: T) -> T {
+    //     T::wrapping_neg(a.signe)
+    // }
+
     #[inline(always)]
     fn negate_u32(a: u32) -> u32 {
         (<i32>::wrapping_neg(a as i32)) as u32
@@ -459,10 +466,14 @@ where
     i32: CastInto<F::Int>,
     F::Int: CastInto<i32>,
     u64: CastInto<F::Int>,
+    u64: CastInto<HalfRep<F>>,
+    F::Int: CastInto<HalfRep<F>>,
+    F::Int: From<HalfRep<F>>,
+    F::Int: From<u8>,
     F::Int: CastInto<u64>,
     i64: CastInto<F::Int>,
     F::Int: CastInto<i64>,
-    F::Int: HInt,
+    F::Int: HInt + DInt,
 {
     const NUMBER_OF_HALF_ITERATIONS: usize = 3;
     const NUMBER_OF_FULL_ITERATIONS: usize = 1;
@@ -471,7 +482,7 @@ where
     let one = F::Int::ONE;
     let zero = F::Int::ZERO;
     let hw = F::BITS / 2;
-    let lo_mask = u64::MAX >> hw;
+    let lo_mask = F::Int::MAX >> hw;
 
     let significand_bits = F::SIGNIFICAND_BITS;
     let max_exponent = F::EXPONENT_MAX;
@@ -616,8 +627,9 @@ where
 
     let mut x_uq0 = if NUMBER_OF_HALF_ITERATIONS > 0 {
         // Starting with (n-1) half-width iterations
-        let b_uq1_hw: u32 =
-            (CastInto::<u64>::cast(b_significand) >> (significand_bits + 1 - hw)) as u32;
+        let b_uq1_hw: HalfRep<F> = CastInto::<HalfRep<F>>::cast(
+            CastInto::<u64>::cast(b_significand) >> (significand_bits + 1 - hw),
+        );
 
         // C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW
         // with W0 being either 16 or 32 and W0 <= HW.
@@ -625,12 +637,13 @@ where
         // b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
 
         // HW is at least 32. Shifting into the highest bits if needed.
-        let c_hw = (0x7504F333_u64 as u32).wrapping_shl(hw.wrapping_sub(32));
+        let c_hw = (CastInto::<HalfRep<F>>::cast(0x7504F333_u64)).wrapping_shl(hw.wrapping_sub(32));
 
         // b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572,
         // so x0 fits to UQ0.HW without wrapping.
-        let x_uq0_hw: u32 = {
-            let mut x_uq0_hw: u32 = c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */);
+        let x_uq0_hw: HalfRep<F> = {
+            let mut x_uq0_hw: HalfRep<F> =
+                c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */);
             // dbg!(x_uq0_hw);
             // An e_0 error is comprised of errors due to
             // * x0 being an inherently imprecise first approximation of 1/b_hw
@@ -661,8 +674,9 @@ where
                 // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
                 // expected to be strictly positive because b_UQ1_hw has its highest bit set
                 // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
-                let corr_uq1_hw: u32 =
-                    0.wrapping_sub(((x_uq0_hw as u64).wrapping_mul(b_uq1_hw as u64)) >> hw) as u32;
+                let corr_uq1_hw: HalfRep<F> = CastInto::<HalfRep<F>>::cast(zero.wrapping_sub(
+                    ((F::Int::from(x_uq0_hw)).wrapping_mul(F::Int::from(b_uq1_hw))) >> hw,
+                ));
                 // dbg!(corr_uq1_hw);
 
                 // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
@@ -677,7 +691,9 @@ where
                 // The fact corr_UQ1_hw was virtually round up (due to result of
                 // multiplication being **first** truncated, then negated - to improve
                 // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
-                x_uq0_hw = ((x_uq0_hw as u64).wrapping_mul(corr_uq1_hw as u64) >> (hw - 1)) as u32;
+                x_uq0_hw = ((F::Int::from(x_uq0_hw)).wrapping_mul(F::Int::from(corr_uq1_hw))
+                    >> (hw - 1))
+                    .cast();
                 // dbg!(x_uq0_hw);
                 // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
                 // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
@@ -707,7 +723,7 @@ where
             // be not below that value (see g(x) above), so it is safe to decrement just
             // once after the final iteration. On the other hand, an effective value of
             // divisor changes after this point (from b_hw to b), so adjust here.
-            x_uq0_hw.wrapping_sub(1_u32)
+            x_uq0_hw.wrapping_sub(HalfRep::<F>::ONE)
         };
 
         // Error estimations for full-precision iterations are calculated just
@@ -717,7 +733,7 @@ where
         // Simulating operations on a twice_rep_t to perform a single final full-width
         // iteration. Using ad-hoc multiplication implementations to take advantage
         // of particular structure of operands.
-        let blo: u64 = (CastInto::<u64>::cast(b_uq1)) & lo_mask;
+        let blo: F::Int = b_uq1 & lo_mask;
         // x_UQ0 = x_UQ0_hw * 2^HW - 1
         // x_UQ0 * b_UQ1 = (x_UQ0_hw * 2^HW) * (b_UQ1_hw * 2^HW + blo) - b_UQ1
         //
@@ -726,19 +742,20 @@ where
         // +            [  x_UQ0_hw *  blo  ]
         // -                      [      b_UQ1       ]
         // = [      result       ][.... discarded ...]
-        let corr_uq1 = negate_u64(
-            (x_uq0_hw as u64) * (b_uq1_hw as u64) + (((x_uq0_hw as u64) * (blo)) >> hw) - 1,
-        ); // account for *possible* carry
-        let lo_corr = corr_uq1 & lo_mask;
-        let hi_corr = corr_uq1 >> hw;
+        let corr_uq1: F::Int = (F::Int::from(x_uq0_hw) * F::Int::from(b_uq1_hw)
+            + ((F::Int::from(x_uq0_hw) * blo) >> hw))
+            .wrapping_sub(one)
+            .wrapping_neg(); // account for *possible* carry
+        let lo_corr: F::Int = corr_uq1 & lo_mask;
+        let hi_corr: F::Int = corr_uq1 >> hw;
         // x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1
-        let mut x_uq0: <F as Float>::Int = ((((x_uq0_hw as u64) * hi_corr) << 1)
-            .wrapping_add(((x_uq0_hw as u64) * lo_corr) >> (hw - 1))
-            .wrapping_sub(2))
-        .cast(); // 1 to account for the highest bit of corr_UQ1 can be 1
-                 // 1 to account for possible carry
-                 // Just like the case of half-width iterations but with possibility
-                 // of overflowing by one extra Ulp of x_UQ0.
+        let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1)
+            .wrapping_add((F::Int::from(x_uq0_hw) * lo_corr) >> (hw - 1))
+            .wrapping_sub(F::Int::from(2u8));
+        // 1 to account for the highest bit of corr_UQ1 can be 1
+        // 1 to account for possible carry
+        // Just like the case of half-width iterations but with possibility
+        // of overflowing by one extra Ulp of x_UQ0.
         x_uq0 -= one;
         // ... and then traditional fixup by 2 should work
 
@@ -755,8 +772,8 @@ where
         x_uq0
     } else {
         // C is (3/4 + 1/sqrt(2)) - 1 truncated to 64 fractional bits as UQ0.n
-        let c: <F as Float>::Int = (0x7504F333 << (F::BITS - 32)).cast();
-        let x_uq0: <F as Float>::Int = c.wrapping_sub(b_uq1);
+        let c: F::Int = (0x7504F333 << (F::BITS - 32)).cast();
+        let x_uq0: F::Int = c.wrapping_sub(b_uq1);
         // E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-64
         x_uq0
     };
@@ -799,14 +816,27 @@ where
 
     // Add 2 to U_N due to final decrement.
 
-    let reciprocal_precision: <F as Float>::Int = 220.cast();
+    let reciprocal_precision: F::Int = if F::BITS == 32
+        && NUMBER_OF_HALF_ITERATIONS == 2
+        && NUMBER_OF_FULL_ITERATIONS == 1
+    {
+        74.cast()
+    } else if F::BITS == 32 && NUMBER_OF_HALF_ITERATIONS == 0 && NUMBER_OF_FULL_ITERATIONS == 3 {
+        10.cast()
+    } else if F::BITS == 64 && NUMBER_OF_HALF_ITERATIONS == 3 && NUMBER_OF_FULL_ITERATIONS == 1 {
+        220.cast()
+    } else if F::BITS == 128 && NUMBER_OF_HALF_ITERATIONS == 4 && NUMBER_OF_FULL_ITERATIONS == 1 {
+        13922.cast()
+    } else {
+        panic!("invalid iterations for the specified bits");
+    };
 
     // Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W
     let x_uq0 = x_uq0 - reciprocal_precision;
     // Now 1/b - (2*P) * 2^-W < x < 1/b
     // FIXME Is x_UQ0 still >= 0.5?
 
-    let mut quotient: <F as Float>::Int = x_uq0.widen_mul(a_significand << 1).hi();
+    let mut quotient: F::Int = x_uq0.widen_mul(a_significand << 1).hi();
     // Now, a/b - 4*P * 2^-W < q < a/b for q=<quotient_UQ1:dummy> in UQ1.(SB+1+W).
 
     // quotient_UQ1 is in [0.5, 2.0) as UQ1.(SB+1),
@@ -914,13 +944,8 @@ intrinsics! {
         div64(a, b)
     }
 
-    // TODO: how should `HInt` be handled?
     pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 {
-        if cfg!(target_pointer_width = "64") {
-            div32(a, b)
-        } else {
-            div64(a, b)
-        }
+        div64(a, b)
     }
 
     #[cfg(target_arch = "arm")]
diff --git a/src/float/extend.rs b/src/float/extend.rs
index 7c244660..5b0c0d97 100644
--- a/src/float/extend.rs
+++ b/src/float/extend.rs
@@ -1,5 +1,5 @@
 use crate::float::Float;
-use crate::int::{CastInto, Int};
+use crate::int::{CastInto, Int, MinInt};
 
 /// Generic conversion from a narrower to a wider IEEE-754 floating-point type
 fn extend<F: Float, R: Float>(a: F) -> R
diff --git a/src/float/mod.rs b/src/float/mod.rs
index 02d291ed..a82dd7d2 100644
--- a/src/float/mod.rs
+++ b/src/float/mod.rs
@@ -59,7 +59,7 @@ pub(crate) trait Float:
     /// A mask for the significand
     const SIGNIFICAND_MASK: Self::Int;
 
-    /// The implicit bit of the float format
+    // The implicit bit of the float format
     const IMPLICIT_BIT: Self::Int;
 
     /// A mask for the exponent
diff --git a/src/float/mul.rs b/src/float/mul.rs
index eed29527..e3e5708e 100644
--- a/src/float/mul.rs
+++ b/src/float/mul.rs
@@ -1,5 +1,5 @@
 use crate::float::Float;
-use crate::int::{CastInto, DInt, HInt, Int};
+use crate::int::{CastInto, DInt, HInt, Int, MinInt};
 
 fn mul<F: Float>(a: F, b: F) -> F
 where
diff --git a/src/float/trunc.rs b/src/float/trunc.rs
index 6de446c1..b607a654 100644
--- a/src/float/trunc.rs
+++ b/src/float/trunc.rs
@@ -1,5 +1,5 @@
 use crate::float::Float;
-use crate::int::{CastInto, Int};
+use crate::int::{CastInto, Int, MinInt};
 
 fn trunc<F: Float, R: Float>(a: F) -> R
 where
diff --git a/src/int/addsub.rs b/src/int/addsub.rs
index f31eff4b..e95590d8 100644
--- a/src/int/addsub.rs
+++ b/src/int/addsub.rs
@@ -1,6 +1,6 @@
-use crate::int::{DInt, Int};
+use crate::int::{DInt, Int, MinInt};
 
-trait UAddSub: DInt {
+trait UAddSub: DInt + Int {
     fn uadd(self, other: Self) -> Self {
         let (lo, carry) = self.lo().overflowing_add(other.lo());
         let hi = self.hi().wrapping_add(other.hi());
@@ -22,7 +22,7 @@ impl UAddSub for u128 {}
 
 trait AddSub: Int
 where
-    <Self as Int>::UnsignedInt: UAddSub,
+    <Self as MinInt>::UnsignedInt: UAddSub,
 {
     fn add(self, other: Self) -> Self {
         Self::from_unsigned(self.unsigned().uadd(other.unsigned()))
@@ -37,7 +37,7 @@ impl AddSub for i128 {}
 
 trait Addo: AddSub
 where
-    <Self as Int>::UnsignedInt: UAddSub,
+    <Self as MinInt>::UnsignedInt: UAddSub,
 {
     fn addo(self, other: Self) -> (Self, bool) {
         let sum = AddSub::add(self, other);
@@ -50,7 +50,7 @@ impl Addo for u128 {}
 
 trait Subo: AddSub
 where
-    <Self as Int>::UnsignedInt: UAddSub,
+    <Self as MinInt>::UnsignedInt: UAddSub,
 {
     fn subo(self, other: Self) -> (Self, bool) {
         let sum = AddSub::sub(self, other);
diff --git a/src/int/big.rs b/src/int/big.rs
new file mode 100644
index 00000000..a54d6259
--- /dev/null
+++ b/src/int/big.rs
@@ -0,0 +1,364 @@
+//! Integers used for wide operations, larger than `u128`.
+
+#![allow(unused)]
+
+use crate::int::{DInt, HInt, Int, MinInt};
+use core::{fmt, ops};
+
+const WORD_LO_MASK: u64 = 0x00000000ffffffff;
+const WORD_HI_MASK: u64 = 0xffffffff00000000;
+const WORD_FULL_MASK: u64 = 0xffffffffffffffff;
+const U128_LO_MASK: u128 = u64::MAX as u128;
+const U128_HI_MASK: u128 = (u64::MAX as u128) << 64;
+
+/// A 256-bit unsigned integer represented as 4 64-bit limbs.
+///
+/// Each limb is a native-endian number, but the array is little-limb-endian.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct u256(pub [u64; 4]);
+
+impl u256 {
+    pub const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX]);
+
+    /// Reinterpret as a signed integer
+    pub fn signed(self) -> i256 {
+        i256(self.0)
+    }
+}
+
+/// A 256-bit signed integer represented as 4 64-bit limbs.
+///
+/// Each limb is a native-endian number, but the array is little-limb-endian.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct i256(pub [u64; 4]);
+
+impl i256 {
+    /// Reinterpret as an unsigned integer
+    pub fn unsigned(self) -> u256 {
+        u256(self.0)
+    }
+}
+
+impl MinInt for u256 {
+    type OtherSign = i256;
+
+    type UnsignedInt = u256;
+
+    const SIGNED: bool = false;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0u64; 4]);
+    const MAX: Self = Self([u64::MAX; 4]);
+}
+
+impl MinInt for i256 {
+    type OtherSign = u256;
+
+    type UnsignedInt = u256;
+
+    const SIGNED: bool = true; // `i256` is the signed counterpart of `u256`
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0, 0, 0, 1 << 63]); // only the sign bit (top bit of limb 3) set
+    const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]); // all but the sign bit
+}
+
+// impl Int for i256 {
+//     fn is_zero(self) -> bool {
+//         self == Self::ZERO
+//     }
+
+//     fn wrapping_neg(self) -> Self {
+//         Self::ZERO.wrapping_sub(self)
+//     }
+
+//     fn wrapping_add(self, other: Self) -> Self {
+//         self.overflowing_add(other).0
+//     }
+//
+//     fn overflowing_add(self, other: Self) -> (Self, bool) {
+//         let x0 = (u128::from(self.0[0])).wrapping_add(u128::from(other.0[0]));
+//         let v0 = x0 as u64;
+//         let c0 = x0 >> 64;
+
+//         let x1 = (u128::from(self.0[1]))
+//             .wrapping_add(u128::from(other.0[1]))
+//             .wrapping_add(c0);
+//         let v1 = x1 as u64;
+//         let c1 = x1 >> 64;
+
+//         let x2 = (u128::from(self.0[2]))
+//             .wrapping_add(u128::from(other.0[2]))
+//             .wrapping_add(c1);
+//         let v2 = x2 as u64;
+//         let c2 = x2 >> 64;
+
+//         let x3 = (u128::from(self.0[3]))
+//             .wrapping_add(u128::from(other.0[3]))
+//             .wrapping_add(c2);
+//         let v3 = x3 as u64;
+//         let c3 = x3 >> 64;
+
+//         (Self([v0, v1, v2, v3]), c3 > 0)
+//     }
+// }
+
+macro_rules! impl_common {
+    ($ty:ty) => {
+        //         impl ops::Add for $ty {
+        //             type Output = Self;
+
+        //             fn add(self, rhs: Self) -> Self::Output {
+        //                 let (val, wrapped) = self.overflowing_add(rhs);
+        //                 debug_assert!(!wrapped, "attempted to add with overflow");
+        //                 val
+        //             }
+        //         }
+
+        //         impl ops::AddAssign for $ty {
+        //             fn add_assign(&mut self, rhs: Self) {
+        //                 *self = *self + rhs
+        //             }
+        //         }
+
+        //         impl ops::BitAnd for $ty {
+        //             type Output = Self;
+
+        //             fn bitand(self, rhs: Self) -> Self::Output {
+        //                 Self([
+        //                     self.0[0] & rhs.0[0],
+        //                     self.0[1] & rhs.0[1],
+        //                     self.0[2] & rhs.0[2],
+        //                     self.0[3] & rhs.0[3],
+        //                 ])
+        //             }
+        //         }
+
+        //         impl ops::BitAndAssign for $ty {
+        //             fn bitand_assign(&mut self, rhs: Self) {
+        //                 *self = *self & rhs
+        //             }
+        //         }
+
+        impl ops::BitOr for $ty {
+            type Output = Self;
+
+            fn bitor(mut self, rhs: Self) -> Self::Output {
+                self.0[0] |= rhs.0[0];
+                self.0[1] |= rhs.0[1];
+                self.0[2] |= rhs.0[2];
+                self.0[3] |= rhs.0[3];
+                self
+            }
+        }
+
+        //         impl ops::BitOrAssign for $ty {
+        //             fn bitor_assign(&mut self, rhs: Self) {
+        //                 *self = *self | rhs
+        //             }
+        //         }
+
+        //         impl ops::BitXor for $ty {
+        //             type Output = Self;
+
+        //             fn bitxor(self, rhs: Self) -> Self::Output {
+        //                 Self([
+        //                     self.0[0] ^ rhs.0[0],
+        //                     self.0[1] ^ rhs.0[1],
+        //                     self.0[2] ^ rhs.0[2],
+        //                     self.0[3] ^ rhs.0[3],
+        //                 ])
+        //             }
+        //         }
+
+        //         impl ops::BitXorAssign for $ty {
+        //             fn bitxor_assign(&mut self, rhs: Self) {
+        //                 *self = *self ^ rhs
+        //             }
+        //         }
+
+        impl ops::Not for $ty {
+            type Output = Self;
+
+            fn not(self) -> Self::Output {
+                Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]])
+            }
+        }
+
+        impl ops::Shl<u32> for $ty {
+            type Output = Self;
+
+            fn shl(self, rhs: u32) -> Self::Output {
+                todo!()
+            }
+        }
+    };
+}
+
+impl_common!(i256);
+impl_common!(u256);
+
+macro_rules! word {
+    (1, $val:expr) => {
+        (($val >> (32 * 3)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (2, $val:expr) => {
+        (($val >> (32 * 2)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (3, $val:expr) => {
+        (($val >> (32 * 1)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (4, $val:expr) => {
+        (($val >> (32 * 0)) & Self::from(WORD_LO_MASK)) as u64
+    };
+}
+
+impl HInt for u128 {
+    type D = u256;
+
+    fn widen(self) -> Self::D {
+        let w0 = self & u128::from(u64::MAX);
+        let w1 = (self >> u64::BITS) & u128::from(u64::MAX);
+        u256([w0 as u64, w1 as u64, 0, 0])
+    }
+
+    fn zero_widen(self) -> Self::D {
+        self.widen()
+    }
+
+    fn zero_widen_mul(self, rhs: Self) -> Self::D {
+        // Split both operands into 32-bit words (word 1 is the highest) and multiply pairwise.
+        let product11: u64 = word!(1, self) * word!(1, rhs);
+        let product12: u64 = word!(1, self) * word!(2, rhs);
+        let product13: u64 = word!(1, self) * word!(3, rhs);
+        let product14: u64 = word!(1, self) * word!(4, rhs);
+        let product21: u64 = word!(2, self) * word!(1, rhs);
+        let product22: u64 = word!(2, self) * word!(2, rhs);
+        let product23: u64 = word!(2, self) * word!(3, rhs);
+        let product24: u64 = word!(2, self) * word!(4, rhs);
+        let product31: u64 = word!(3, self) * word!(1, rhs);
+        let product32: u64 = word!(3, self) * word!(2, rhs);
+        let product33: u64 = word!(3, self) * word!(3, rhs);
+        let product34: u64 = word!(3, self) * word!(4, rhs);
+        let product41: u64 = word!(4, self) * word!(1, rhs);
+        let product42: u64 = word!(4, self) * word!(2, rhs);
+        let product43: u64 = word!(4, self) * word!(3, rhs);
+        let product44: u64 = word!(4, self) * word!(4, rhs);
+        // `sumN` gathers the partial products that share the weight `2^(32 * N)`.
+        let sum0: u128 = u128::from(product44);
+        let sum1: u128 = u128::from(product34) + u128::from(product43);
+        let sum2: u128 = u128::from(product24) + u128::from(product33) + u128::from(product42);
+        let sum3: u128 = u128::from(product14)
+            + u128::from(product23)
+            + u128::from(product32)
+            + u128::from(product41);
+        let sum4: u128 = u128::from(product13) + u128::from(product22) + u128::from(product31);
+        let sum5: u128 = u128::from(product12) + u128::from(product21);
+        let sum6: u128 = u128::from(product11);
+        // `r0` covers result bits 0..64 (plus its own carry); `r1` is weighted by `2^64`.
+        let r0 = (sum0 & u128::from(WORD_FULL_MASK)) + ((sum1 & u128::from(WORD_LO_MASK)) << 32);
+        let r1 = (sum0 >> 64)
+            + ((sum1 >> 32) & u128::from(WORD_FULL_MASK))
+            + (sum2 & u128::from(WORD_FULL_MASK))
+            + ((sum3 << 32) & u128::from(WORD_HI_MASK));
+        // Adding the two parts can carry out of bit 127; feed that carry into the high half.
+        let (lo, carry) = r0.overflowing_add(r1 << 64);
+        let hi = (r1 >> 64)
+            + (sum1 >> 96)
+            + (sum2 >> 64)
+            + (sum3 >> 32)
+            + sum4
+            + (sum5 << 32)
+            + (sum6 << 64)
+            + u128::from(carry);
+        u256([
+            (lo & U128_LO_MASK) as u64,
+            ((lo >> 64) & U128_LO_MASK) as u64,
+            (hi & U128_LO_MASK) as u64,
+            ((hi >> 64) & U128_LO_MASK) as u64,
+        ])
+    }
+
+    fn widen_mul(self, rhs: Self) -> Self::D {
+        self.zero_widen_mul(rhs)
+    }
+}
+
+impl HInt for i128 {
+    type D = i256;
+
+    fn widen(self) -> Self::D {
+        let mut ret = self.unsigned().zero_widen().signed();
+        if self.is_negative() {
+            ret.0[2] = u64::MAX;
+            ret.0[3] = u64::MAX;
+        }
+        ret
+    }
+
+    fn zero_widen(self) -> Self::D {
+        self.unsigned().zero_widen().signed()
+    }
+
+    fn zero_widen_mul(self, rhs: Self) -> Self::D {
+        self.unsigned().zero_widen_mul(rhs.unsigned()).signed()
+    }
+
+    fn widen_mul(self, rhs: Self) -> Self::D {
+        // Multiply the two's-complement bit patterns as unsigned values, then fix up the
+        // high half. Reading a negative operand as unsigned adds `2^128` to it, which adds
+        // `2^128` times the other operand to the product; subtract that back out. All
+        // arithmetic is modulo `2^256`, so the subtractions are allowed to wrap.
+        let mut res = self.zero_widen_mul(rhs);
+        let mut hi = u128::from(res.0[2]) | (u128::from(res.0[3]) << 64);
+        if self.is_negative() {
+            hi = hi.wrapping_sub(rhs.unsigned());
+        }
+        if rhs.is_negative() {
+            hi = hi.wrapping_sub(self.unsigned());
+        }
+        res.0[2] = hi as u64;
+        res.0[3] = (hi >> 64) as u64;
+        res
+    }
+}
+
+impl DInt for u256 {
+    type H = u128;
+
+    fn lo(self) -> Self::H {
+        // Limbs are stored least-significant first: limb 0 is bits 0..64, limb 1 is 64..128.
+        let bits_0_64 = u128::from(self.0[0]);
+        let bits_64_128 = u128::from(self.0[1]);
+        bits_0_64 | (bits_64_128 << 64)
+    }
+
+    fn hi(self) -> Self::H {
+        // Limb 2 supplies the low 64 bits of the upper half and limb 3 the high 64 bits.
+        let bits_128_192 = u128::from(self.0[2]);
+        let bits_192_256 = u128::from(self.0[3]);
+        bits_128_192 | (bits_192_256 << 64)
+    }
+}
+
+impl DInt for i256 {
+    type H = i128;
+
+    fn lo(self) -> Self::H {
+        // Assemble the raw bits from limbs 0 and 1, then reinterpret the pattern as signed.
+        let bits_0_64 = u128::from(self.0[0]);
+        let bits_64_128 = u128::from(self.0[1]);
+        (bits_0_64 | (bits_64_128 << 64)) as i128
+    }
+
+    fn hi(self) -> Self::H {
+        // Assemble the raw bits from limbs 2 and 3, then reinterpret the pattern as signed.
+        let bits_128_192 = u128::from(self.0[2]);
+        let bits_192_256 = u128::from(self.0[3]);
+        (bits_128_192 | (bits_192_256 << 64)) as i128
+    }
+}
diff --git a/src/int/mod.rs b/src/int/mod.rs
index 509f9fda..bb343d79 100644
--- a/src/int/mod.rs
+++ b/src/int/mod.rs
@@ -3,42 +3,29 @@ use core::ops;
 mod specialized_div_rem;
 
 pub mod addsub;
+mod big;
 pub mod leading_zeros;
 pub mod mul;
 pub mod sdiv;
 pub mod shift;
 pub mod udiv;
 
-pub use self::leading_zeros::__clzsi2;
+pub use big::{i256, u256};
+pub use leading_zeros::__clzsi2;
 
 public_test_dep! {
-/// Trait for some basic operations on integers
-pub(crate) trait Int:
-    Copy
+/// Minimal integer implementations needed on all integer types, including wide integers.
+pub(crate) trait MinInt: Copy
     + core::fmt::Debug
-    + PartialEq
-    + PartialOrd
-    + ops::AddAssign
-    + ops::SubAssign
-    + ops::BitAndAssign
-    + ops::BitOrAssign
-    + ops::BitXorAssign
-    + ops::ShlAssign<i32>
-    + ops::ShrAssign<u32>
-    + ops::Add<Output = Self>
-    + ops::Sub<Output = Self>
-    + ops::Div<Output = Self>
-    + ops::Shl<u32, Output = Self>
-    + ops::Shr<u32, Output = Self>
     + ops::BitOr<Output = Self>
-    + ops::BitXor<Output = Self>
-    + ops::BitAnd<Output = Self>
     + ops::Not<Output = Self>
+    + ops::Shl<u32, Output = Self>
 {
+
     /// Type with the same width but other signedness
-    type OtherSign: Int;
+    type OtherSign: MinInt;
     /// Unsigned version of Self
-    type UnsignedInt: Int;
+    type UnsignedInt: MinInt;
 
     /// If `Self` is a signed integer
     const SIGNED: bool;
@@ -50,13 +37,46 @@ pub(crate) trait Int:
     const ONE: Self;
     const MIN: Self;
     const MAX: Self;
+}
+}
 
+public_test_dep! {
+/// Trait for some basic operations on integers
+pub(crate) trait Int: MinInt
+    + PartialEq
+    + PartialOrd
+    + ops::AddAssign
+    + ops::SubAssign
+    + ops::BitAndAssign
+    + ops::BitOrAssign
+    + ops::BitXorAssign
+    + ops::ShlAssign<i32>
+    + ops::ShrAssign<u32>
+    + ops::Add<Output = Self>
+    + ops::Sub<Output = Self>
+    + ops::Mul<Output = Self>
+    + ops::Div<Output = Self>
+    + ops::Shr<u32, Output = Self>
+    + ops::BitXor<Output = Self>
+    + ops::BitAnd<Output = Self>
+{
     /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing
     /// in `testcrate`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,111,
     /// 112,119,120,125,126,127].
-    const FUZZ_LENGTHS: [u8; 20];
+    const FUZZ_LENGTHS: [u8; 20] = make_fuzz_lengths(<Self as MinInt>::BITS);
+
     /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128.
-    const FUZZ_NUM: usize;
+    const FUZZ_NUM: usize = {
+        let log2 = (<Self as MinInt>::BITS - 1).count_ones() as usize;
+        if log2 == 3 {
+            // case for u8
+            6
+        } else {
+            // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate
+            // boundaries.
+            8 + (4 * (log2 - 4))
+        }
+    };
 
     fn unsigned(self) -> Self::UnsignedInt;
     fn from_unsigned(unsigned: Self::UnsignedInt) -> Self;
@@ -83,74 +103,54 @@ pub(crate) trait Int:
 }
 }
 
+pub(crate) const fn make_fuzz_lengths(bits: u32) -> [u8; 20] {
+    let mut v = [0u8; 20];
+    v[0] = 0;
+    v[1] = 1;
+    v[2] = 2; // important for parity and the iX::MIN case when reversed
+    let mut i = 3;
+
+    // No need for any more until the byte boundary, because there should be no algorithms
+    // that are sensitive to anything not next to byte boundaries after 2. We also scale
+    // in powers of two, which is important to prevent u128 corner tests from getting too
+    // big.
+    let mut l = 8;
+    loop {
+        if l >= ((bits / 2) as u8) {
+            break;
+        }
+        // get both sides of the byte boundary
+        v[i] = l - 1;
+        i += 1;
+        v[i] = l;
+        i += 1;
+        l *= 2;
+    }
+
+    if bits != 8 {
+        // add the lower side of the middle boundary
+        v[i] = ((bits / 2) - 1) as u8;
+        i += 1;
+    }
+
+    // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS
+    // boundary because of algorithms that split the high part up. We reverse the scaling
+    // as we go to Self::BITS.
+    let mid = i;
+    let mut j = 1;
+    loop {
+        v[i] = (bits as u8) - (v[mid - j]) - 1;
+        if j == mid {
+            break;
+        }
+        i += 1;
+        j += 1;
+    }
+    v
+}
+
 macro_rules! int_impl_common {
     ($ty:ty) => {
-        const BITS: u32 = <Self as Int>::ZERO.count_zeros();
-        const SIGNED: bool = Self::MIN != Self::ZERO;
-
-        const ZERO: Self = 0;
-        const ONE: Self = 1;
-        const MIN: Self = <Self>::MIN;
-        const MAX: Self = <Self>::MAX;
-
-        const FUZZ_LENGTHS: [u8; 20] = {
-            let bits = <Self as Int>::BITS;
-            let mut v = [0u8; 20];
-            v[0] = 0;
-            v[1] = 1;
-            v[2] = 2; // important for parity and the iX::MIN case when reversed
-            let mut i = 3;
-            // No need for any more until the byte boundary, because there should be no algorithms
-            // that are sensitive to anything not next to byte boundaries after 2. We also scale
-            // in powers of two, which is important to prevent u128 corner tests from getting too
-            // big.
-            let mut l = 8;
-            loop {
-                if l >= ((bits / 2) as u8) {
-                    break;
-                }
-                // get both sides of the byte boundary
-                v[i] = l - 1;
-                i += 1;
-                v[i] = l;
-                i += 1;
-                l *= 2;
-            }
-
-            if bits != 8 {
-                // add the lower side of the middle boundary
-                v[i] = ((bits / 2) - 1) as u8;
-                i += 1;
-            }
-
-            // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS
-            // boundary because of algorithms that split the high part up. We reverse the scaling
-            // as we go to Self::BITS.
-            let mid = i;
-            let mut j = 1;
-            loop {
-                v[i] = (bits as u8) - (v[mid - j]) - 1;
-                if j == mid {
-                    break;
-                }
-                i += 1;
-                j += 1;
-            }
-            v
-        };
-
-        const FUZZ_NUM: usize = {
-            let log2 = (<Self as Int>::BITS - 1).count_ones() as usize;
-            if log2 == 3 {
-                // case for u8
-                6
-            } else {
-                // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate
-                // boundaries.
-                8 + (4 * (log2 - 4))
-            }
-        };
-
         fn from_bool(b: bool) -> Self {
             b as $ty
         }
@@ -203,10 +203,20 @@ macro_rules! int_impl_common {
 
 macro_rules! int_impl {
     ($ity:ty, $uty:ty) => {
-        impl Int for $uty {
+        impl MinInt for $uty {
             type OtherSign = $ity;
             type UnsignedInt = $uty;
 
+            const BITS: u32 = <Self as MinInt>::ZERO.count_zeros();
+            const SIGNED: bool = Self::MIN != Self::ZERO;
+
+            const ZERO: Self = 0;
+            const ONE: Self = 1;
+            const MIN: Self = <Self>::MIN;
+            const MAX: Self = <Self>::MAX;
+        }
+
+        impl Int for $uty {
             fn unsigned(self) -> $uty {
                 self
             }
@@ -228,10 +238,20 @@ macro_rules! int_impl {
             int_impl_common!($uty);
         }
 
-        impl Int for $ity {
+        impl MinInt for $ity {
             type OtherSign = $uty;
             type UnsignedInt = $uty;
 
+            const BITS: u32 = <Self as MinInt>::ZERO.count_zeros();
+            const SIGNED: bool = Self::MIN != Self::ZERO;
+
+            const ZERO: Self = 0;
+            const ONE: Self = 1;
+            const MIN: Self = <Self>::MIN;
+            const MAX: Self = <Self>::MAX;
+        }
+
+        impl Int for $ity {
             fn unsigned(self) -> $uty {
                 self as $uty
             }
@@ -259,18 +279,22 @@ int_impl!(i128, u128);
 public_test_dep! {
 /// Trait for integers twice the bit width of another integer. This is implemented for all
 /// primitives except for `u8`, because there is not a smaller primitive.
-pub(crate) trait DInt: Int {
+pub(crate) trait DInt: MinInt {
     /// Integer that is half the bit width of the integer this trait is implemented for
-    type H: HInt<D = Self> + Int;
+    type H: HInt<D = Self>;
 
     /// Returns the low half of `self`
     fn lo(self) -> Self::H;
     /// Returns the high half of `self`
     fn hi(self) -> Self::H;
     /// Returns the low and high halves of `self` as a tuple
-    fn lo_hi(self) -> (Self::H, Self::H);
+    fn lo_hi(self) -> (Self::H, Self::H) {
+        (self.lo(), self.hi())
+    }
     /// Constructs an integer using lower and higher half parts
-    fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self;
+    fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self {
+        lo.zero_widen() | hi.widen_hi()
+    }
 }
 }
 
@@ -279,7 +303,7 @@ public_test_dep! {
 /// primitives except for `u128`, because it there is not a larger primitive.
 pub(crate) trait HInt: Int {
     /// Integer that is double the bit width of the integer this trait is implemented for
-    type D: DInt<H = Self> + Int;
+    type D: DInt<H = Self> + MinInt;
 
     /// Widens (using default extension) the integer to have double bit width
     fn widen(self) -> Self::D;
@@ -287,7 +311,9 @@ pub(crate) trait HInt: Int {
     /// around problems with associated type bounds (such as `Int<Othersign: DInt>`) being unstable
     fn zero_widen(self) -> Self::D;
     /// Widens the integer to have double bit width and shifts the integer into the higher bits
-    fn widen_hi(self) -> Self::D;
+    fn widen_hi(self) -> Self::D {
+        self.widen() << <Self as MinInt>::BITS
+    }
     /// Widening multiplication with zero widening. This cannot overflow.
     fn zero_widen_mul(self, rhs: Self) -> Self::D;
     /// Widening multiplication. This cannot overflow.
@@ -305,13 +331,7 @@ macro_rules! impl_d_int {
                     self as $X
                 }
                 fn hi(self) -> Self::H {
-                    (self >> <$X as Int>::BITS) as $X
-                }
-                fn lo_hi(self) -> (Self::H, Self::H) {
-                    (self.lo(), self.hi())
-                }
-                fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self {
-                    lo.zero_widen() | hi.widen_hi()
+                    (self >> <$X as MinInt>::BITS) as $X
                 }
             }
         )*
@@ -330,9 +350,6 @@ macro_rules! impl_h_int {
                 fn zero_widen(self) -> Self::D {
                     (self as $uH) as $X
                 }
-                fn widen_hi(self) -> Self::D {
-                    (self as $X) << <$H as Int>::BITS
-                }
                 fn zero_widen_mul(self, rhs: Self) -> Self::D {
                     self.zero_widen().wrapping_mul(rhs.zero_widen())
                 }
diff --git a/src/int/mul.rs b/src/int/mul.rs
index 2538e2f4..e0093a72 100644
--- a/src/int/mul.rs
+++ b/src/int/mul.rs
@@ -1,6 +1,6 @@
 use crate::int::{DInt, HInt, Int};
 
-trait Mul: DInt
+trait Mul: DInt + Int
 where
     Self::H: DInt,
 {
@@ -30,7 +30,7 @@ where
 impl Mul for u64 {}
 impl Mul for i128 {}
 
-pub(crate) trait UMulo: Int + DInt {
+pub(crate) trait UMulo: DInt + Int {
     fn mulo(self, rhs: Self) -> (Self, bool) {
         match (self.hi().is_zero(), rhs.hi().is_zero()) {
             // overflow is guaranteed
diff --git a/src/int/shift.rs b/src/int/shift.rs
index dbd04018..31727298 100644
--- a/src/int/shift.rs
+++ b/src/int/shift.rs
@@ -1,4 +1,4 @@
-use crate::int::{DInt, HInt, Int};
+use crate::int::{DInt, HInt, Int, MinInt};
 
 trait Ashl: DInt {
     /// Returns `a << b`, requires `b < Self::BITS`
diff --git a/src/lib.rs b/src/lib.rs
index ea376631..7c0b5072 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,6 +44,21 @@ extern crate core;
 
 #[macro_use]
 mod macros;
+macro_rules! vdbg {
+    ($val:expr $(,)?) => {
+        // Use of `match` here is intentional because it affects the lifetimes
+        // of temporaries - https://stackoverflow.com/a/48732525/1063961
+        match $val {
+            tmp => {
+                $crate::write_val(
+                    tmp,
+                    concat!("[", file!(), ":", line!(), "] ", stringify!($val), " = "),
+                );
+                tmp
+            }
+        }
+    };
+}
 
 pub mod float;
 pub mod int;
@@ -80,3 +95,45 @@ pub mod x86;
 pub mod x86_64;
 
 pub mod probestack;
+
+// Hacky way to print values since we don't have `std` for the crate
+mod val_print {
+    extern "C" {
+        fn print_callback(val_ptr: *const u8, val_sz: usize, name_ptr: *const u8, name_len: usize);
+    }
+
+    pub fn write_val<T: Copy>(val: T, name: &str) {
+        unsafe {
+            print_callback(
+                core::ptr::addr_of!(val).cast(),
+                core::mem::size_of::<T>(),
+                name.as_ptr(),
+                name.len(),
+            )
+        };
+    }
+}
+
+pub use val_print::write_val;
+
+#[macro_export]
+macro_rules! set_val_callback {
+    () => {
+        #[no_mangle]
+        unsafe extern "C" fn print_callback(
+            val_ptr: *const u8,
+            val_sz: usize,
+            name_ptr: *const u8,
+            name_len: usize,
+        ) {
+            let val = unsafe { core::slice::from_raw_parts(val_ptr, val_sz) };
+            let name_slice = unsafe { core::slice::from_raw_parts(name_ptr, name_len) };
+            let name = core::str::from_utf8(name_slice).unwrap();
+            print!("{}: 0x", name);
+            for byte in val.iter().rev() {
+                print!("{:02x}", byte);
+            }
+            println!();
+        }
+    };
+}
diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml
index 6ff3fde1..6f771181 100644
--- a/testcrate/Cargo.toml
+++ b/testcrate/Cargo.toml
@@ -33,3 +33,5 @@ no-asm = ["compiler_builtins/no-asm"]
 no-f16-f128 = ["compiler_builtins/no-f16-f128"]
 mem = ["compiler_builtins/mem"]
 mangled-names = ["compiler_builtins/mangled-names"]
+# Skip tests that rely on f128 symbols being available on the system
+no-sys-f128 = []
diff --git a/testcrate/benches/float.rs b/testcrate/benches/float.rs
new file mode 100644
index 00000000..a12300b3
--- /dev/null
+++ b/testcrate/benches/float.rs
@@ -0,0 +1,90 @@
+#![feature(test, f16, f128)]
+
+extern crate test;
+use core::hint::black_box;
+use test::Bencher;
+
+extern crate compiler_builtins;
+
+macro_rules! test_values {
+    ($ty:ty) => {
+        &[
+            <$ty>::MIN,
+            <$ty>::MAX,
+            <$ty>::NAN,
+            <$ty>::INFINITY,
+            <$ty>::NEG_INFINITY,
+            <$ty>::MIN_POSITIVE,
+            0.0,
+            1.0,
+            -1.0,
+        ]
+    };
+}
+
+fn combine2<T: Copy>(vals: &[T]) -> Vec<(T, T)> {
+    let mut ret = Vec::new();
+    for x in vals.iter().copied() {
+        for y in vals.iter().copied() {
+            ret.push((x, y));
+        }
+    }
+    ret
+}
+
+macro_rules! test_iter {
+    ($b:ident, $ty:ty, $fn:path) => {{
+        let vals = combine2(test_values!($ty));
+        let iter_loop = || {
+            for (a, b) in vals.iter().copied() {
+                black_box($fn(black_box(a), black_box(b)));
+            }
+        };
+
+        // Warmup
+        for _ in 0..1000 {
+            iter_loop();
+        }
+
+        $b.iter(iter_loop);
+    }};
+}
+
+macro_rules! foobar {
+    ($($ty:ty, $rust_fn:ident, $builtin_fn:ident, $mod:ident::$sym:ident);* $(;)?) => {
+        $(
+            #[bench]
+            fn $rust_fn(b: &mut Bencher) {
+                // Equalize with the builtin function which is called separately
+                #[inline(never)]
+                fn inline_wrapper(a: $ty, b: $ty) -> $ty {
+                    compiler_builtins::float::$mod::$sym(black_box(a), black_box(b))
+                }
+
+                test_iter!(b, $ty, inline_wrapper);
+            }
+
+            #[bench]
+            fn $builtin_fn(b: &mut Bencher) {
+                extern "C" {
+                    fn $sym(a: $ty, b: $ty) -> $ty;
+                }
+
+                unsafe {
+                    test_iter!(b, $ty, $sym);
+                }
+            }
+        )*
+    };
+}
+
+foobar! {
+    f32, addsf3_rust, addsf3_builtin, add::__addsf3;
+    f32, subsf3_rust, subsf3_builtin, sub::__subsf3;
+    f32, mulsf3_rust, mulsf3_builtin, mul::__mulsf3;
+    f32, divsf3_rust, divsf3_builtin, div::__divsf3;
+    f64, adddf3_rust, adddf3_builtin, add::__adddf3;
+    f64, subdf3_rust, subdf3_builtin, sub::__subdf3;
+    f64, muldf3_rust, muldf3_builtin, mul::__muldf3;
+    f64, divdf3_rust, divdf3_builtin, div::__divdf3;
+}
diff --git a/testcrate/build.rs b/testcrate/build.rs
new file mode 100644
index 00000000..86c97af1
--- /dev/null
+++ b/testcrate/build.rs
@@ -0,0 +1,15 @@
+use std::env;
+
+fn main() {
+    let target = env::var("TARGET").unwrap();
+
+    // These platforms do not have f128 symbols available in their system libraries, so
+    // skip related tests.
+    if target.starts_with("arm-")
+        || target.contains("apple-darwin")
+        || target.contains("windows-msvc")
+    {
+        println!("cargo:warning=skipping `f128` tests; system does not have relevant symbols");
+        println!("cargo:rustc-cfg=feature=\"no-sys-f128\"");
+    }
+}
diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs
index 9bd155f6..13abf459 100644
--- a/testcrate/src/lib.rs
+++ b/testcrate/src/lib.rs
@@ -15,7 +15,7 @@
 #![no_std]
 
 use compiler_builtins::float::Float;
-use compiler_builtins::int::Int;
+use compiler_builtins::int::{Int, MinInt};
 
 use rand_xoshiro::rand_core::{RngCore, SeedableRng};
 use rand_xoshiro::Xoshiro128StarStar;
@@ -101,7 +101,10 @@ macro_rules! edge_cases {
 
 /// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find
 /// edge cases, followed by a more random fuzzer that runs `n` times.
-pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F) {
+pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F)
+where
+    <I as MinInt>::UnsignedInt: Int,
+{
     // edge case tester. Calls `f` 210 times for u128.
     // zero gets skipped by the loop
     f(I::ZERO);
@@ -111,7 +114,7 @@ pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F) {
 
     // random fuzzer
     let mut rng = Xoshiro128StarStar::seed_from_u64(0);
-    let mut x: I = Int::ZERO;
+    let mut x: I = MinInt::ZERO;
     for _ in 0..n {
         fuzz_step(&mut rng, &mut x);
         f(x)
@@ -119,7 +122,10 @@ pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F) {
 }
 
 /// The same as `fuzz`, except `f` has two inputs.
-pub fn fuzz_2<I: Int, F: Fn(I, I)>(n: u32, f: F) {
+pub fn fuzz_2<I: Int, F: Fn(I, I)>(n: u32, f: F)
+where
+    <I as MinInt>::UnsignedInt: Int,
+{
     // Check cases where the first and second inputs are zero. Both call `f` 210 times for `u128`.
     edge_cases!(I, case, {
         f(I::ZERO, case);
@@ -150,10 +156,10 @@ pub fn fuzz_shift<I: Int, F: Fn(I, u32)>(f: F) {
     // Shift functions are very simple and do not need anything other than shifting a small
     // set of random patterns for every fuzz length.
     let mut rng = Xoshiro128StarStar::seed_from_u64(0);
-    let mut x: I = Int::ZERO;
+    let mut x: I = MinInt::ZERO;
     for i in 0..I::FUZZ_NUM {
         fuzz_step(&mut rng, &mut x);
-        f(x, Int::ZERO);
+        f(x, MinInt::ZERO);
         f(x, I::FUZZ_LENGTHS[i] as u32);
     }
 }
diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs
index da7684ec..343e47ae 100644
--- a/testcrate/tests/addsub.rs
+++ b/testcrate/tests/addsub.rs
@@ -1,4 +1,6 @@
 #![allow(unused_macros)]
+#![feature(f128)]
+#![feature(f16)]
 
 use testcrate::*;
 
@@ -80,13 +82,13 @@ macro_rules! float_sum {
                 let sub1: $f = $fn_sub(x, y);
                 if !Float::eq_repr(add0, add1) {
                     panic!(
-                        "{}({}, {}): std: {}, builtins: {}",
+                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                         stringify!($fn_add), x, y, add0, add1
                     );
                 }
                 if !Float::eq_repr(sub0, sub1) {
                     panic!(
-                        "{}({}, {}): std: {}, builtins: {}",
+                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                         stringify!($fn_sub), x, y, sub0, sub1
                     );
                 }
@@ -110,6 +112,16 @@ fn float_addsub() {
     );
 }
 
+#[test]
+#[cfg(not(feature = "no-sys-f128"))]
+fn float_addsub_f128() {
+    use compiler_builtins::float::{add::__addtf3, sub::__subtf3, Float};
+
+    float_sum!(
+        f128, __addtf3, __subtf3;
+    );
+}
+
 #[cfg(target_arch = "arm")]
 #[test]
 fn float_addsub_arm() {
diff --git a/testcrate/tests/big.rs b/testcrate/tests/big.rs
new file mode 100644
index 00000000..abf7d77c
--- /dev/null
+++ b/testcrate/tests/big.rs
@@ -0,0 +1,104 @@
+use compiler_builtins::int::{i256, u256, HInt, Int, MinInt};
+
+const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff;
+
+/// Print a `u256` as hex since we can't add format implementations
+fn hexu(v: u256) -> String {
+    format!(
+        "0x{:016x}{:016x}{:016x}{:016x}",
+        v.0[3], v.0[2], v.0[1], v.0[0]
+    )
+}
+
+fn hexi(v: i256) -> String {
+    hexu(v.unsigned())
+}
+
+#[test]
+fn widen_u128() {
+    assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0]));
+    assert_eq!(
+        LOHI_SPLIT.widen(),
+        u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0])
+    );
+}
+
+#[test]
+fn widen_i128() {
+    assert_eq!((-1i128).widen(), u256::MAX.signed());
+    assert_eq!(
+        (LOHI_SPLIT as i128).widen(),
+        i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX])
+    );
+    assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
+}
+
+#[test]
+fn widen_mul_u128() {
+    let tests = [
+        (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])),
+        (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])),
+        // TODO: https://github.com/rust-lang/compiler-builtins/pull/587#issuecomment-2060543566
+        // (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])),
+        (u128::MIN, u128::MIN, u256::ZERO),
+        (1234, 0, u256::ZERO),
+        (0, 1234, u256::ZERO),
+    ];
+
+    let mut errors = Vec::new();
+    for (i, (a, b, exp)) in tests.iter().copied().enumerate() {
+        let res = a.widen_mul(b);
+        let res_z = a.zero_widen_mul(b);
+        assert_eq!(res, res_z);
+        if res != exp {
+            errors.push((i, a, b, exp, res));
+        }
+    }
+
+    for (i, a, b, exp, res) in &errors {
+        eprintln!(
+            "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}",
+            hexu(*exp),
+            hexu(*res)
+        );
+    }
+    assert!(errors.is_empty());
+}
+
+// #[test]
+// fn widen_mul_i128() {
+//     let tests = [
+//         (
+//             i128::MAX / 2,
+//             2_i128,
+//             i256([u64::MAX - 1, u64::MAX >> 1, 0, 0]),
+//         ),
+//         (i128::MAX, 2_i128, i256([u64::MAX - 1, u64::MAX, 0, 0])),
+//         (i128::MIN, 2_i128, i256([0, 0, u64::MAX, u64::MAX])),
+//         (
+//             i128::MAX,
+//             i128::MAX,
+//             i256([1, 0, u64::MAX - 1, u64::MAX >> 2]),
+//         ),
+//         (i128::MAX, i128::MIN, i256([0, 0, 0, 0b11 << 62])),
+//         (i128::MIN, i128::MIN, i256([0, 0, 0, 0])),
+//         (1234, 0, i256::ZERO),
+//         (0, 1234, i256::ZERO),
+//         (-1234, 0, i256::ZERO),
+//         (0, -1234, i256::ZERO),
+//     ];
+
+//     let mut errors = Vec::new();
+//     for (i, (a, b, exp)) in tests.iter().copied().enumerate() {
+//         let res = a.widen_mul(b);
+//         // TODO check zero widen mul
+//         if res != exp {
+//             errors.push((i, a, b, exp, res));
+//         }
+//     }
+
+//     for (i, a, b, exp, res) in &errors {
+//         eprintln!("FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", hexi(*exp), hexi(*res));
+//     }
+//     assert!(errors.is_empty());
+// }
diff --git a/testcrate/tests/cmp.rs b/testcrate/tests/cmp.rs
index 5c10a560..7ad90a7c 100644
--- a/testcrate/tests/cmp.rs
+++ b/testcrate/tests/cmp.rs
@@ -1,4 +1,6 @@
 #![allow(unused_macros)]
+#![feature(f128)]
+#![feature(f16)]
 
 use testcrate::*;
 
@@ -16,7 +18,10 @@ macro_rules! cmp {
             };
             let cmp1 = $fn($x, $y);
             if cmp0 != cmp1 {
-                panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1);
+                panic!(
+                    "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                    stringify!($fn_builtins), $x, $y, cmp0, cmp1
+                );
             }
         )*
     };
@@ -55,6 +60,26 @@ fn float_comparisons() {
     });
 }
 
+#[cfg(not(feature = "no-sys-f128"))]
+#[test]
+fn float_comparisons_f128() {
+    use compiler_builtins::float::cmp::{
+        __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2,
+    };
+
+    fuzz_float_2(N, |x: f128, y: f128| {
+        assert_eq!(__unordtf2(x, y) != 0, x.is_nan() || y.is_nan());
+        cmp!(x, y,
+            1, __lttf2;
+            1, __letf2;
+            1, __eqtf2;
+            -1, __getf2;
+            -1, __gttf2;
+            1, __netf2;
+        );
+    });
+}
+
 macro_rules! cmp2 {
     ($x:ident, $y:ident, $($unordered_val:expr, $fn_std:expr, $fn_builtins:ident);*;) => {
         $(
diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs
index de3bd9be..07bd233c 100644
--- a/testcrate/tests/div_rem.rs
+++ b/testcrate/tests/div_rem.rs
@@ -1,9 +1,13 @@
 #![allow(unused_macros)]
+#![feature(f128)]
+#![feature(f16)]
 
 use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
 use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc};
 use testcrate::*;
 
+compiler_builtins::set_val_callback!();
+
 // Division algorithms have by far the nastiest and largest number of edge cases, and experience shows
 // that sometimes 100_000 iterations of the random fuzzer is needed.
 
@@ -107,12 +111,15 @@ macro_rules! float {
     ($($i:ty, $fn:ident);*;) => {
         $(
             fuzz_float_2(N, |x: $i, y: $i| {
+                dbg!(x, y);
                 let quo0 = x / y;
+                dbg!(quo0);
                 let quo1: $i = $fn(x, y);
+                dbg!(quo1);
                 #[cfg(not(target_arch = "arm"))]
                 if !Float::eq_repr(quo0, quo1) {
                     panic!(
-                        "{}({}, {}): std: {}, builtins: {}",
+                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                         stringify!($fn), x, y, quo0, quo1
                     );
                 }
@@ -122,7 +129,7 @@ macro_rules! float {
                 if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) {
                     if !Float::eq_repr(quo0, quo1) {
                         panic!(
-                            "{}({}, {}): std: {}, builtins: {}",
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                             stringify!($fn), x, y, quo0, quo1
                         );
                     }
@@ -146,6 +153,24 @@ fn float_div() {
     );
 }
 
+#[cfg(not(feature = "no-sys-f128"))]
+#[test]
+fn float_div_f128() {
+    use compiler_builtins::float::{div::__divtf3, Float};
+
+    float!(
+        f128, __divtf3;
+    );
+}
+
+#[test]
+fn div_failures() {
+    use compiler_builtins::float::{div::__divtf3, Float};
+    let a = f128::from_bits(0x1);
+    let b = f128::from_bits(0x1);
+    dbg!(__divtf3(a, b));
+}
+
 #[cfg(target_arch = "arm")]
 #[test]
 fn float_div_arm() {
diff --git a/testcrate/tests/mul.rs b/testcrate/tests/mul.rs
index 819f06ca..446d5c46 100644
--- a/testcrate/tests/mul.rs
+++ b/testcrate/tests/mul.rs
@@ -1,4 +1,6 @@
 #![allow(unused_macros)]
+#![feature(f128)]
+#![feature(f16)]
 
 use testcrate::*;
 
@@ -91,7 +93,7 @@ macro_rules! float_mul {
                 if !(Float::is_subnormal(mul0) || Float::is_subnormal(mul1)) {
                     if !Float::eq_repr(mul0, mul1) {
                         panic!(
-                            "{}({}, {}): std: {}, builtins: {}",
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                             stringify!($fn), x, y, mul0, mul1
                         );
                     }
@@ -115,6 +117,16 @@ fn float_mul() {
     );
 }
 
+#[test]
+#[cfg(not(feature = "no-sys-f128"))]
+fn float_mul_f128() {
+    use compiler_builtins::float::{mul::__multf3, Float};
+
+    float_mul!(
+        f128, __multf3;
+    );
+}
+
 #[cfg(target_arch = "arm")]
 #[test]
 fn float_mul_arm() {

From e2b4bbc12691d09f69fe94fc68e5d26cb88e11ea Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Mon, 6 May 2024 01:40:57 -0500
Subject: [PATCH 03/13] Add `HalfRep` type alias and drop the `c_unwind` feature gate

---
 src/float/mod.rs | 7 +++++--
 src/lib.rs       | 1 -
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/float/mod.rs b/src/float/mod.rs
index a82dd7d2..dadcf51a 100644
--- a/src/float/mod.rs
+++ b/src/float/mod.rs
@@ -1,6 +1,6 @@
 use core::ops;
 
-use super::int::Int;
+use crate::int::{DInt, Int, MinInt};
 
 pub mod add;
 pub mod cmp;
@@ -12,6 +12,9 @@ pub mod pow;
 pub mod sub;
 pub mod trunc;
 
+/// Wrapper to extract the integer type half of the float's size
+pub(crate) type HalfRep<F: Float> = <F::Int as DInt>::H;
+
 public_test_dep! {
 /// Trait for some basic operations on floats
 pub(crate) trait Float:
@@ -59,7 +62,7 @@ pub(crate) trait Float:
     /// A mask for the significand
     const SIGNIFICAND_MASK: Self::Int;
 
-    // The implicit bit of the float format
+    /// The implicit bit of the float format
     const IMPLICIT_BIT: Self::Int;
 
     /// A mask for the exponent
diff --git a/src/lib.rs b/src/lib.rs
index 7c0b5072..e468ea81 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,7 +13,6 @@
 #![feature(linkage)]
 #![feature(naked_functions)]
 #![feature(repr_simd)]
-#![feature(c_unwind)]
 #![cfg_attr(not(feature = "no-f16-f128"), feature(f16))]
 #![cfg_attr(not(feature = "no-f16-f128"), feature(f128))]
 #![no_builtins]

From fe8e3bdb3a802a2ed0d333047371507949d13fb4 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Mon, 6 May 2024 02:32:01 -0500
Subject: [PATCH 04/13] Fix some runtime panics

---
 src/float/div.rs | 97 ++++++++++++++++++++++++++++++++++--------------
 src/float/mod.rs |  2 +-
 2 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/src/float/div.rs b/src/float/div.rs
index 6f64dfae..a03b0902 100644
--- a/src/float/div.rs
+++ b/src/float/div.rs
@@ -7,6 +7,62 @@ use crate::int::{CastInto, DInt, HInt, Int, MinInt};
 
 use super::HalfRep;
 
+/// Configuration for division on the 64-bit implementation
+trait DivIterations64: Float
+where
+    u16: CastInto<Self::Int>,
+{
+    const NUMBER_OF_HALF_ITERATIONS: usize;
+    const NUMBER_OF_FULL_ITERATIONS: usize;
+    const USE_NATIVE_FULL_ITERATIONS: bool = false;
+
+    fn reciprocal_precision() -> Self::Int {
+        let precision: u16 = const {
+            if Self::BITS == 32
+                && Self::NUMBER_OF_HALF_ITERATIONS == 2
+                && Self::NUMBER_OF_FULL_ITERATIONS == 1
+            {
+                74
+            } else if Self::BITS == 32
+                && Self::NUMBER_OF_HALF_ITERATIONS == 0
+                && Self::NUMBER_OF_FULL_ITERATIONS == 3
+            {
+                10
+            } else if Self::BITS == 64
+                && Self::NUMBER_OF_HALF_ITERATIONS == 3
+                && Self::NUMBER_OF_FULL_ITERATIONS == 1
+            {
+                220
+            } else if Self::BITS == 128
+                && Self::NUMBER_OF_HALF_ITERATIONS == 4
+                && Self::NUMBER_OF_FULL_ITERATIONS == 1
+            {
+                13922
+            } else {
+                panic!("invalid iterations for the specified bits");
+            }
+        };
+
+        precision.cast()
+    }
+}
+
+impl DivIterations64 for f32 {
+    const NUMBER_OF_HALF_ITERATIONS: usize = 0;
+    const NUMBER_OF_FULL_ITERATIONS: usize = 3;
+}
+
+impl DivIterations64 for f64 {
+    const NUMBER_OF_HALF_ITERATIONS: usize = 3;
+    const NUMBER_OF_FULL_ITERATIONS: usize = 1;
+}
+
+#[cfg(not(feature = "no-f16-f128"))]
+impl DivIterations64 for f128 {
+    const NUMBER_OF_HALF_ITERATIONS: usize = 4;
+    const NUMBER_OF_FULL_ITERATIONS: usize = 1;
+}
+
 fn div32<F: Float>(a: F, b: F) -> F
 where
     u32: CastInto<F::Int>,
@@ -461,24 +517,22 @@ where
 
 fn div64<F: Float>(a: F, b: F) -> F
 where
-    u32: CastInto<F::Int>,
+    F: DivIterations64,
     F::Int: CastInto<u32>,
-    i32: CastInto<F::Int>,
     F::Int: CastInto<i32>,
-    u64: CastInto<F::Int>,
-    u64: CastInto<HalfRep<F>>,
     F::Int: CastInto<HalfRep<F>>,
     F::Int: From<HalfRep<F>>,
     F::Int: From<u8>,
     F::Int: CastInto<u64>,
-    i64: CastInto<F::Int>,
     F::Int: CastInto<i64>,
     F::Int: HInt + DInt,
+    u16: CastInto<F::Int>,
+    i32: CastInto<F::Int>,
+    i64: CastInto<F::Int>,
+    u32: CastInto<F::Int>,
+    u64: CastInto<F::Int>,
+    u64: CastInto<HalfRep<F>>,
 {
-    const NUMBER_OF_HALF_ITERATIONS: usize = 3;
-    const NUMBER_OF_FULL_ITERATIONS: usize = 1;
-    const USE_NATIVE_FULL_ITERATIONS: bool = false;
-
     let one = F::Int::ONE;
     let zero = F::Int::ZERO;
     let hw = F::BITS / 2;
@@ -625,7 +679,7 @@ where
     // rounding, so error estimations have to be computed taking the selected
     // mode into account!
 
-    let mut x_uq0 = if NUMBER_OF_HALF_ITERATIONS > 0 {
+    let mut x_uq0 = if F::NUMBER_OF_HALF_ITERATIONS > 0 {
         // Starting with (n-1) half-width iterations
         let b_uq1_hw: HalfRep<F> = CastInto::<HalfRep<F>>::cast(
             CastInto::<u64>::cast(b_significand) >> (significand_bits + 1 - hw),
@@ -666,7 +720,7 @@ where
             // On (1/b, 1], g(x) > 0 <=> f(x) < x
             //
             // For half-width iterations, b_hw is used instead of b.
-            for _ in 0..NUMBER_OF_HALF_ITERATIONS {
+            for _ in 0..F::NUMBER_OF_HALF_ITERATIONS {
                 // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
                 // of corr_UQ1_hw.
                 // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
@@ -778,8 +832,8 @@ where
         x_uq0
     };
 
-    let mut x_uq0 = if USE_NATIVE_FULL_ITERATIONS {
-        for _ in 0..NUMBER_OF_FULL_ITERATIONS {
+    let mut x_uq0 = if F::USE_NATIVE_FULL_ITERATIONS {
+        for _ in 0..F::NUMBER_OF_FULL_ITERATIONS {
             let corr_uq1: u64 = 0.wrapping_sub(
                 (CastInto::<u64>::cast(x_uq0) * (CastInto::<u64>::cast(b_uq1))) >> F::BITS,
             );
@@ -816,20 +870,7 @@ where
 
     // Add 2 to U_N due to final decrement.
 
-    let reciprocal_precision: F::Int = if F::BITS == 32
-        && NUMBER_OF_HALF_ITERATIONS == 2
-        && NUMBER_OF_FULL_ITERATIONS == 1
-    {
-        74.cast()
-    } else if F::BITS == 32 && NUMBER_OF_HALF_ITERATIONS == 0 && NUMBER_OF_FULL_ITERATIONS == 3 {
-        10.cast()
-    } else if F::BITS == 64 && NUMBER_OF_HALF_ITERATIONS == 3 && NUMBER_OF_FULL_ITERATIONS == 1 {
-        220.cast()
-    } else if F::BITS == 128 && NUMBER_OF_HALF_ITERATIONS == 4 && NUMBER_OF_FULL_ITERATIONS == 1 {
-        13922.cast()
-    } else {
-        panic!("invalid iterations for the specified bits");
-    };
+    let reciprocal_precision: F::Int = F::reciprocal_precision();
 
     // Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W
     let x_uq0 = x_uq0 - reciprocal_precision;
@@ -898,7 +939,7 @@ where
     // r = a - b * q
     let abs_result = if written_exponent > 0 {
         let mut ret = quotient & significand_mask;
-        ret |= ((written_exponent as u64) << significand_bits).cast();
+        ret |= written_exponent.cast() << significand_bits;
         residual <<= 1;
         ret
     } else {
diff --git a/src/float/mod.rs b/src/float/mod.rs
index dadcf51a..6e169d23 100644
--- a/src/float/mod.rs
+++ b/src/float/mod.rs
@@ -13,7 +13,7 @@ pub mod sub;
 pub mod trunc;
 
 /// Wrapper to extract the integer type half of the float's size
-pub(crate) type HalfRep<F: Float> = <F::Int as DInt>::H;
+pub(crate) type HalfRep<F> = <<F as Float>::Int as DInt>::H;
 
 public_test_dep! {
 /// Trait for some basic operations on floats

From b288658199bc10339633dd5eaa4741baa449900c Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Mon, 6 May 2024 04:57:35 -0500
Subject: [PATCH 05/13] fix feature flags

---
 src/float/add.rs | 1 +
 src/float/div.rs | 1 +
 src/float/mul.rs | 1 +
 src/float/sub.rs | 1 +
 4 files changed, 4 insertions(+)

diff --git a/src/float/add.rs b/src/float/add.rs
index 8fa9dac5..cf1d4136 100644
--- a/src/float/add.rs
+++ b/src/float/add.rs
@@ -203,6 +203,7 @@ intrinsics! {
         add(a, b)
     }
 
+    #[cfg(not(feature = "no-f16-f128"))]
     pub extern "C" fn __addtf3(a: f128, b: f128) -> f128 {
         add(a, b)
     }
diff --git a/src/float/div.rs b/src/float/div.rs
index a03b0902..75af759f 100644
--- a/src/float/div.rs
+++ b/src/float/div.rs
@@ -985,6 +985,7 @@ intrinsics! {
         div64(a, b)
     }
 
+    #[cfg(not(feature = "no-f16-f128"))]
     pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 {
         div64(a, b)
     }
diff --git a/src/float/mul.rs b/src/float/mul.rs
index e3e5708e..fcc593d6 100644
--- a/src/float/mul.rs
+++ b/src/float/mul.rs
@@ -199,6 +199,7 @@ intrinsics! {
         mul(a, b)
     }
 
+    #[cfg(not(feature = "no-f16-f128"))]
     pub extern "C" fn __multf3(a: f128, b: f128) -> f128 {
         mul(a, b)
     }
diff --git a/src/float/sub.rs b/src/float/sub.rs
index 6bb3271a..0d394ce1 100644
--- a/src/float/sub.rs
+++ b/src/float/sub.rs
@@ -16,6 +16,7 @@ intrinsics! {
         __adddf3(a, f64::from_repr(b.repr() ^ f64::SIGN_MASK))
     }
 
+    #[cfg(not(feature = "no-f16-f128"))]
     pub extern "C" fn __subtf3(a: f128, b: f128) -> f128 {
         __addtf3(a, f128::from_repr(b.repr() ^ f128::SIGN_MASK))
     }

From 72e8223a88072310d6d7c2cb54fda16fb7a25cf2 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Mon, 6 May 2024 05:01:28 -0500
Subject: [PATCH 06/13] Compile out `f128` comparison intrinsics when `no-f16-f128` is set

---
 src/float/cmp.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/float/cmp.rs b/src/float/cmp.rs
index ae05a3a6..36086969 100644
--- a/src/float/cmp.rs
+++ b/src/float/cmp.rs
@@ -170,7 +170,10 @@ intrinsics! {
     pub extern "C" fn __gtdf2(a: f64, b: f64) -> i32 {
         cmp(a, b).to_ge_abi()
     }
+}
 
+#[cfg(not(feature = "no-f16-f128"))]
+intrinsics! {
     #[avr_skip]
     pub extern "C" fn __letf2(a: f128, b: f128) -> i32 {
         cmp(a, b).to_le_abi()

From 0d2685b670359f8989dc31d518c70b09d3680951 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Fri, 10 May 2024 03:45:46 -0500
Subject: [PATCH 07/13] Fix `f128` cfg gating for the `__addtf3` import and div tests

---
 src/float/sub.rs           | 2 +-
 testcrate/tests/div_rem.rs | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/float/sub.rs b/src/float/sub.rs
index 0d394ce1..477019f7 100644
--- a/src/float/sub.rs
+++ b/src/float/sub.rs
@@ -1,6 +1,5 @@
 use crate::float::add::__adddf3;
 use crate::float::add::__addsf3;
-use crate::float::add::__addtf3;
 use crate::float::Float;
 
 intrinsics! {
@@ -18,6 +17,7 @@ intrinsics! {
 
     #[cfg(not(feature = "no-f16-f128"))]
     pub extern "C" fn __subtf3(a: f128, b: f128) -> f128 {
+        use crate::float::add::__addtf3;
         __addtf3(a, f128::from_repr(b.repr() ^ f128::SIGN_MASK))
     }
 
diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs
index 07bd233c..7308faf6 100644
--- a/testcrate/tests/div_rem.rs
+++ b/testcrate/tests/div_rem.rs
@@ -153,7 +153,7 @@ fn float_div() {
     );
 }
 
-#[cfg(not(feature = "no-sys-f128"))]
+#[cfg(not(any(feature = "no-sys-f128", feature = "no-f16-f128")))]
 #[test]
 fn float_div_f128() {
     use compiler_builtins::float::{div::__divtf3, Float};
@@ -163,6 +163,7 @@ fn float_div_f128() {
     );
 }
 
+#[cfg(not(any(feature = "no-sys-f128", feature = "no-f16-f128")))]
 #[test]
 fn div_failures() {
     use compiler_builtins::float::{div::__divtf3, Float};

From 9500b7d040c26c92e6f6bf0d601a3baa4c848c4a Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Fri, 10 May 2024 03:58:00 -0500
Subject: [PATCH 08/13] Fix bigint missing carry

---
 src/int/big.rs         |  5 +++--
 testcrate/tests/big.rs | 10 +++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/int/big.rs b/src/int/big.rs
index a54d6259..026c6463 100644
--- a/src/int/big.rs
+++ b/src/int/big.rs
@@ -266,14 +266,15 @@ impl HInt for u128 {
             + (sum2 & u128::from(WORD_FULL_MASK))
             + ((sum3 << 32) & u128::from(WORD_HI_MASK));
 
-        let lo = r0.wrapping_add(r1 << 64);
+        let (lo, carry) = r0.overflowing_add(r1 << 64);
         let hi = (r1 >> 64)
             + (sum1 >> 96)
             + (sum2 >> 64)
             + (sum3 >> 32)
             + sum4
             + (sum5 << 32)
-            + (sum6 << 64);
+            + (sum6 << 64)
+            + u128::from(carry);
 
         u256([
             (lo & U128_LO_MASK) as u64,
diff --git a/testcrate/tests/big.rs b/testcrate/tests/big.rs
index abf7d77c..13cd8e18 100644
--- a/testcrate/tests/big.rs
+++ b/testcrate/tests/big.rs
@@ -38,8 +38,7 @@ fn widen_mul_u128() {
     let tests = [
         (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])),
         (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])),
-        // TODO: https://github.com/rust-lang/compiler-builtins/pull/587#issuecomment-2060543566
-        // (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])),
+        (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])),
         (u128::MIN, u128::MIN, u256::ZERO),
         (1234, 0, u256::ZERO),
         (0, 1234, u256::ZERO),
@@ -65,6 +64,7 @@ fn widen_mul_u128() {
     assert!(errors.is_empty());
 }
 
+// unneeded?
 // #[test]
 // fn widen_mul_i128() {
 //     let tests = [
@@ -98,7 +98,11 @@ fn widen_mul_u128() {
 //     }
 
 //     for (i, a, b, exp, res) in &errors {
-//         eprintln!("FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", hexi(*exp), hexi(*res));
+//         eprintln!(
+//             "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}",
+//             hexi(*exp),
+//             hexi(*res)
+//         );
 //     }
 //     assert!(errors.is_empty());
 // }

From c4596931f433f4d55de1243140472791eab90260 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Fri, 10 May 2024 04:52:51 -0500
Subject: [PATCH 09/13] Add apfloat fallback

---
 testcrate/src/lib.rs      | 32 ++++++++++++++++
 testcrate/tests/addsub.rs | 79 ++++++++++++++++++++++++++-------------
 2 files changed, 85 insertions(+), 26 deletions(-)

diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs
index 13abf459..f7a73c2a 100644
--- a/testcrate/src/lib.rs
+++ b/testcrate/src/lib.rs
@@ -263,3 +263,35 @@ pub fn fuzz_float_2<F: Float, E: Fn(F, F)>(n: u32, f: E) {
         f(x, y)
     }
 }
+
+#[macro_export]
+macro_rules! to_apfloat {
+    ($apfloat_ty:ty, $val:expr) => {
+        <$apfloat_ty>::from_bits($val.to_bits().into())
+    };
+}
+
+#[macro_export]
+macro_rules! from_apfloat {
+    ($float_ty:ty, $val:expr) => {
+        <$float_ty>::from_bits($val.to_bits().try_into().unwrap())
+    };
+}
+
+/// Expect a status from a `StatusAnd`. Defaults to OK
+#[macro_export]
+macro_rules! apfloat_expect {
+    // Discard the status
+    ($val:expr, Ignore) => {
+        $val.value
+    };
+
+    ($val:expr) => {
+        apfloat_expect!($val, rustc_apfloat::Status::OK)
+    };
+
+    ($val:expr, $expected_status:expr) => {{
+        assert_eq!($val.status, $expected_status, "from value {}", $val.value);
+        $val.value
+    }};
+}
diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs
index 343e47ae..21a15e38 100644
--- a/testcrate/tests/addsub.rs
+++ b/testcrate/tests/addsub.rs
@@ -73,28 +73,54 @@ fn addsub() {
 }
 
 macro_rules! float_sum {
+    // By default test against Rust which uses system's compiler-rt
     ($($f:ty, $fn_add:ident, $fn_sub:ident);*;) => {
         $(
-            fuzz_float_2(N, |x: $f, y: $f| {
-                let add0 = x + y;
-                let sub0 = x - y;
-                let add1: $f = $fn_add(x, y);
-                let sub1: $f = $fn_sub(x, y);
-                if !Float::eq_repr(add0, add1) {
-                    panic!(
-                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
-                        stringify!($fn_add), x, y, add0, add1
-                    );
-                }
-                if !Float::eq_repr(sub0, sub1) {
-                    panic!(
-                        "{:?}({:?}, {:?}): std: {:?}, builtins: {:?}",
-                        stringify!($fn_sub), x, y, sub0, sub1
-                    );
-                }
-            });
+            float_sum!(@inner $f, $fn_add, $fn_sub, |x, y| x + y, |x, y| x - y);
         )*
     };
+
+    // Allow using apfloat instead when the system functinos are not available
+    ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ty);*;) => {
+        $(
+            float_sum!(
+                @inner $f,
+                $fn_add,
+                $fn_sub,
+                |x: $f, y: $f| from_apfloat!(
+                    $f,
+                    apfloat_expect!(to_apfloat!($apfloat_ty, x) + to_apfloat!($apfloat_ty, y), Ignore)
+                ),
+                |x: $f, y: $f| from_apfloat!(
+                    $f,
+                    apfloat_expect!(to_apfloat!($apfloat_ty, x) - to_apfloat!($apfloat_ty, y), Ignore)
+                ),
+            );
+        )*
+    };
+
+    (@inner $f:ty, $fn_add:ident, $fn_sub:ident, $add_expr:expr, $sub_expr:expr $(,)?) => {
+        fuzz_float_2(N, |x: $f, y: $f| {
+            let add0 = $add_expr(x, y);
+            let sub0 = $sub_expr(x, y);
+            let add1: $f = $fn_add(x, y);
+            let sub1: $f = $fn_sub(x, y);
+            if !Float::eq_repr(add0, add1) {
+                panic!(
+                    "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                    stringify!($fn_add), x, y, add0, add1
+                );
+            }
+            if !Float::eq_repr(sub0, sub1) {
+                panic!(
+                    "{:?}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                    stringify!($fn_sub), x, y, sub0, sub1
+                );
+            }
+        });
+    };
+
+
 }
 
 #[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
@@ -110,16 +136,17 @@ fn float_addsub() {
         f32, __addsf3, __subsf3;
         f64, __adddf3, __subdf3;
     );
-}
 
-#[test]
-#[cfg(not(feature = "no-sys-f128"))]
-fn float_addsub_f128() {
-    use compiler_builtins::float::{add::__addtf3, sub::__subtf3, Float};
+    #[cfg(not(feature = "no-f16-f128"))]
+    {
+        use compiler_builtins::float::{add::__addtf3, sub::__subtf3, Float};
+        use rustc_apfloat::ieee::Quad;
+        use rustc_apfloat::{Float as _, FloatConvert as _};
 
-    float_sum!(
-        f128, __addtf3, __subtf3;
-    );
+        float_sum!(
+            f128, __addtf3, __subtf3, Quad;
+        );
+    }
 }
 
 #[cfg(target_arch = "arm")]

From de8f9bee00a80e8bd9bf66148e89dd480d7c422c Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Fri, 10 May 2024 05:06:23 -0500
Subject: [PATCH 10/13] add fallback for div

---
 testcrate/tests/addsub.rs  | 12 ++++--
 testcrate/tests/div_rem.rs | 77 ++++++++++++++++++++++++--------------
 2 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs
index 21a15e38..f1665fde 100644
--- a/testcrate/tests/addsub.rs
+++ b/testcrate/tests/addsub.rs
@@ -143,9 +143,15 @@ fn float_addsub() {
         use rustc_apfloat::ieee::Quad;
         use rustc_apfloat::{Float as _, FloatConvert as _};
 
-        float_sum!(
-            f128, __addtf3, __subtf3, Quad;
-        );
+        if cfg!(feature = "no-sys-f128") {
+            float_sum!(
+                f128, __addtf3, __subtf3, Quad;
+            );
+        } else {
+            float_sum!(
+                f128, __addtf3, __subtf3;
+            );
+        }
     }
 }
 
diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs
index 7308faf6..ad9979b9 100644
--- a/testcrate/tests/div_rem.rs
+++ b/testcrate/tests/div_rem.rs
@@ -108,34 +108,46 @@ fn divide_sparc() {
 }
 
 macro_rules! float {
-    ($($i:ty, $fn:ident);*;) => {
+    ($($f:ty, $fn:ident);*;) => {
+        $( float!(@inner $f, $fn, |x, y| x / y); )*
+    };
+
+    ($($f:ty, $fn:ident, $apfloat_ty:ty);*;) => {
         $(
-            fuzz_float_2(N, |x: $i, y: $i| {
-                dbg!(x, y);
-                let quo0 = x / y;
-                dbg!(quo0);
-                let quo1: $i = $fn(x, y);
-                dbg!(quo1);
-                #[cfg(not(target_arch = "arm"))]
+            float!(
+                @inner $f,
+                $fn,
+                |x: $f, y: $f| from_apfloat!(
+                    $f,
+                    apfloat_expect!(to_apfloat!($apfloat_ty, x) / to_apfloat!($apfloat_ty, y), Ignore)
+                ),
+            );
+        )*
+    };
+
+    (@inner $f:ty, $fn:ident, $div_expr:expr $(,)?) => {
+        fuzz_float_2(N, |x: $f, y: $f| {
+            let quo0 = $div_expr(x, y);
+            let quo1: $f = $fn(x, y);
+            #[cfg(not(target_arch = "arm"))]
+            if !Float::eq_repr(quo0, quo1) {
+                panic!(
+                    "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                    stringify!($fn), x, y, quo0, quo1
+                );
+            }
+
+            // ARM SIMD instructions always flush subnormals to zero
+            #[cfg(target_arch = "arm")]
+            if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) {
                 if !Float::eq_repr(quo0, quo1) {
                     panic!(
                         "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                         stringify!($fn), x, y, quo0, quo1
                     );
                 }
-
-                // ARM SIMD instructions always flush subnormals to zero
-                #[cfg(target_arch = "arm")]
-                if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) {
-                    if !Float::eq_repr(quo0, quo1) {
-                        panic!(
-                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
-                            stringify!($fn), x, y, quo0, quo1
-                        );
-                    }
-                }
-            });
-        )*
+            }
+        });
     };
 }
 
@@ -151,16 +163,23 @@ fn float_div() {
         f32, __divsf3;
         f64, __divdf3;
     );
-}
 
-#[cfg(not(any(feature = "no-sys-f128", feature = "no-f16-f128")))]
-#[test]
-fn float_div_f128() {
-    use compiler_builtins::float::{div::__divtf3, Float};
+    #[cfg(not(feature = "no-f16-f128"))]
+    {
+        use compiler_builtins::float::div::__divtf3;
+        use rustc_apfloat::ieee::Quad;
+        use rustc_apfloat::{Float as _, FloatConvert as _};
 
-    float!(
-        f128, __divtf3;
-    );
+        if cfg!(feature = "no-sys-f128") {
+            float!(
+                f128, __divtf3, Quad;
+            );
+        } else {
+            float!(
+                f128, __divtf3;
+            );
+        }
+    }
 }
 
 #[cfg(not(any(feature = "no-sys-f128", feature = "no-f16-f128")))]

From 27ea26c77e2c38229937377e3a3b72113457f9c4 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Fri, 10 May 2024 05:33:06 -0500
Subject: [PATCH 11/13] Simplify test macros

---
 testcrate/src/lib.rs       | 41 +++++++------------
 testcrate/tests/addsub.rs  | 82 ++++++++++++--------------------------
 testcrate/tests/div_rem.rs | 81 ++++++++++++++++---------------------
 3 files changed, 75 insertions(+), 129 deletions(-)

diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs
index f7a73c2a..e688b7c9 100644
--- a/testcrate/src/lib.rs
+++ b/testcrate/src/lib.rs
@@ -264,34 +264,23 @@ pub fn fuzz_float_2<F: Float, E: Fn(F, F)>(n: u32, f: E) {
     }
 }
 
+/// Use the builtin if avialable, fallback to apfloat if not
 #[macro_export]
-macro_rules! to_apfloat {
-    ($apfloat_ty:ty, $val:expr) => {
-        <$apfloat_ty>::from_bits($val.to_bits().into())
-    };
-}
+macro_rules! apfloat_fallback {
+    ($float_ty:ty, $apfloat_ty:ty, $x:expr, $op:tt, $y:expr, $sys_available:meta) => {{
+        #[cfg($sys_available)]
+        let ret = $x $op $y;
 
-#[macro_export]
-macro_rules! from_apfloat {
-    ($float_ty:ty, $val:expr) => {
-        <$float_ty>::from_bits($val.to_bits().try_into().unwrap())
-    };
-}
+        #[cfg(not($sys_available))]
+        let ret = {
+            let x_ap = <$apfloat_ty>::from_bits($x.to_bits().into());
+            let y_ap = <$apfloat_ty>::from_bits($y.to_bits().into());
+            // ignore the status in `rustc_apfloat::StatusAnd`
+            let res = (x_ap $op y_ap).value;
 
-/// Expect a status from a `StatusAnd`. Defaults to OK
-#[macro_export]
-macro_rules! apfloat_expect {
-    // Discard the status
-    ($val:expr, Ignore) => {
-        $val.value
-    };
-
-    ($val:expr) => {
-        apfloat_expect!($val, rustc_apfloat::Status::OK)
-    };
+            <$float_ty>::from_bits(res.to_bits().try_into().unwrap())
+        };
 
-    ($val:expr, $expected_status:expr) => {{
-        assert_eq!($val.status, $expected_status, "from value {}", $val.value);
-        $val.value
-    }};
+        ret
+    }}
 }
diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs
index f1665fde..32b54ff1 100644
--- a/testcrate/tests/addsub.rs
+++ b/testcrate/tests/addsub.rs
@@ -73,54 +73,28 @@ fn addsub() {
 }
 
 macro_rules! float_sum {
-    // By default test against Rust which uses system's compiler-rt
-    ($($f:ty, $fn_add:ident, $fn_sub:ident);*;) => {
+    ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ty, $sys_available:meta);*;) => {
         $(
-            float_sum!(@inner $f, $fn_add, $fn_sub, |x, y| x + y, |x, y| x - y);
-        )*
-    };
-
-    // Allow using apfloat instead when the system functinos are not available
-    ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ty);*;) => {
-        $(
-            float_sum!(
-                @inner $f,
-                $fn_add,
-                $fn_sub,
-                |x: $f, y: $f| from_apfloat!(
-                    $f,
-                    apfloat_expect!(to_apfloat!($apfloat_ty, x) + to_apfloat!($apfloat_ty, y), Ignore)
-                ),
-                |x: $f, y: $f| from_apfloat!(
-                    $f,
-                    apfloat_expect!(to_apfloat!($apfloat_ty, x) - to_apfloat!($apfloat_ty, y), Ignore)
-                ),
-            );
+            fuzz_float_2(N, |x: $f, y: $f| {
+                let add0 = apfloat_fallback!($f, $apfloat_ty, x, +, y, $sys_available);
+                let sub0 = apfloat_fallback!($f, $apfloat_ty, x, -, y, $sys_available);
+                let add1: $f = $fn_add(x, y);
+                let sub1: $f = $fn_sub(x, y);
+                if !Float::eq_repr(add0, add1) {
+                    panic!(
+                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                        stringify!($fn_add), x, y, add0, add1
+                    );
+                }
+                if !Float::eq_repr(sub0, sub1) {
+                    panic!(
+                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                        stringify!($fn_sub), x, y, sub0, sub1
+                    );
+                }
+            });
         )*
-    };
-
-    (@inner $f:ty, $fn_add:ident, $fn_sub:ident, $add_expr:expr, $sub_expr:expr $(,)?) => {
-        fuzz_float_2(N, |x: $f, y: $f| {
-            let add0 = $add_expr(x, y);
-            let sub0 = $sub_expr(x, y);
-            let add1: $f = $fn_add(x, y);
-            let sub1: $f = $fn_sub(x, y);
-            if !Float::eq_repr(add0, add1) {
-                panic!(
-                    "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
-                    stringify!($fn_add), x, y, add0, add1
-                );
-            }
-            if !Float::eq_repr(sub0, sub1) {
-                panic!(
-                    "{:?}({:?}, {:?}): std: {:?}, builtins: {:?}",
-                    stringify!($fn_sub), x, y, sub0, sub1
-                );
-            }
-        });
-    };
-
-
+    }
 }
 
 #[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
@@ -133,8 +107,8 @@ fn float_addsub() {
     };
 
     float_sum!(
-        f32, __addsf3, __subsf3;
-        f64, __adddf3, __subdf3;
+        f32, __addsf3, __subsf3, Single, all();
+        f64, __adddf3, __subdf3, Double, all();
     );
 
     #[cfg(not(feature = "no-f16-f128"))]
@@ -143,15 +117,9 @@ fn float_addsub() {
         use rustc_apfloat::ieee::Quad;
         use rustc_apfloat::{Float as _, FloatConvert as _};
 
-        if cfg!(feature = "no-sys-f128") {
-            float_sum!(
-                f128, __addtf3, __subtf3, Quad;
-            );
-        } else {
-            float_sum!(
-                f128, __addtf3, __subtf3;
-            );
-        }
+        float_sum!(
+            f128, __addtf3, __subtf3, Quad, not(feature = "no-sys-f128");
+        );
     }
 }
 
diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs
index ad9979b9..9439ca43 100644
--- a/testcrate/tests/div_rem.rs
+++ b/testcrate/tests/div_rem.rs
@@ -4,6 +4,9 @@
 
 use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
 use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc};
+use rustc_apfloat::ieee::{Double, Single};
+use rustc_apfloat::{Float as _, FloatConvert as _};
+
 use testcrate::*;
 
 compiler_builtins::set_val_callback!();
@@ -108,46 +111,39 @@ fn divide_sparc() {
 }
 
 macro_rules! float {
-    ($($f:ty, $fn:ident);*;) => {
-        $( float!(@inner $f, $fn, |x, y| x / y); )*
-    };
-
-    ($($f:ty, $fn:ident, $apfloat_ty:ty);*;) => {
+    ($($f:ty, $fn:ident, $apfloat_ty:ty, $sys_available:meta);*;) => {
         $(
-            float!(
-                @inner $f,
-                $fn,
-                |x: $f, y: $f| from_apfloat!(
-                    $f,
-                    apfloat_expect!(to_apfloat!($apfloat_ty, x) / to_apfloat!($apfloat_ty, y), Ignore)
-                ),
-            );
-        )*
-    };
-
-    (@inner $f:ty, $fn:ident, $div_expr:expr $(,)?) => {
-        fuzz_float_2(N, |x: $f, y: $f| {
-            let quo0 = $div_expr(x, y);
-            let quo1: $f = $fn(x, y);
-            #[cfg(not(target_arch = "arm"))]
-            if !Float::eq_repr(quo0, quo1) {
-                panic!(
-                    "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
-                    stringify!($fn), x, y, quo0, quo1
-                );
-            }
-
-            // ARM SIMD instructions always flush subnormals to zero
-            #[cfg(target_arch = "arm")]
-            if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) {
+            fuzz_float_2(N, |x: $f, y: $f| {
+                let quo0: $f = apfloat_fallback!($f, $apfloat_ty, x, /, y, $sys_available);
+                let quo1: $f = $fn(x, y);
+                #[cfg(not(target_arch = "arm"))]
                 if !Float::eq_repr(quo0, quo1) {
                     panic!(
                         "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
-                        stringify!($fn), x, y, quo0, quo1
+                        stringify!($fn),
+                        x,
+                        y,
+                        quo0,
+                        quo1
                     );
                 }
-            }
-        });
+
+                // ARM SIMD instructions always flush subnormals to zero
+                #[cfg(target_arch = "arm")]
+                if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) {
+                    if !Float::eq_repr(quo0, quo1) {
+                        panic!(
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn),
+                            x,
+                            y,
+                            quo0,
+                            quo1
+                        );
+                    }
+                }
+            });
+        )*
     };
 }
 
@@ -160,25 +156,18 @@ fn float_div() {
     };
 
     float!(
-        f32, __divsf3;
-        f64, __divdf3;
+        f32, __divsf3, Single, all();
+        f64, __divdf3, Double, all();
     );
 
     #[cfg(not(feature = "no-f16-f128"))]
     {
         use compiler_builtins::float::div::__divtf3;
         use rustc_apfloat::ieee::Quad;
-        use rustc_apfloat::{Float as _, FloatConvert as _};
 
-        if cfg!(feature = "no-sys-f128") {
-            float!(
-                f128, __divtf3, Quad;
-            );
-        } else {
-            float!(
-                f128, __divtf3;
-            );
-        }
+        float!(
+            f128, __divtf3, Quad, not(feature = "no-sys-f128");
+        );
     }
 }
 

From ba6162cd7a8ddbe07f667e3e3d7a79ab5329145f Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Fri, 10 May 2024 16:26:00 -0500
Subject: [PATCH 12/13] Make macro work without converting

---
 testcrate/build.rs         |  1 -
 testcrate/src/lib.rs       | 76 ++++++++++++++++++++++++++++++-----
 testcrate/tests/addsub.rs  | 13 +++---
 testcrate/tests/big.rs     |  2 +-
 testcrate/tests/cmp.rs     | 82 +++++++++++++++++++++++++++-----------
 testcrate/tests/div_rem.rs | 13 +++---
 testcrate/tests/mul.rs     | 28 ++++++-------
 7 files changed, 152 insertions(+), 63 deletions(-)

diff --git a/testcrate/build.rs b/testcrate/build.rs
index 86c97af1..f279a363 100644
--- a/testcrate/build.rs
+++ b/testcrate/build.rs
@@ -9,7 +9,6 @@ fn main() {
         || target.contains("apple-darwin")
         || target.contains("windows-msvc")
     {
-        println!("cargo:warning=skipping `f128` tests; system does not have relevant symbols");
         println!("cargo:rustc-cfg=feature=\"no-sys-f128\"");
     }
 }
diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs
index e688b7c9..4112e3ac 100644
--- a/testcrate/src/lib.rs
+++ b/testcrate/src/lib.rs
@@ -264,23 +264,81 @@ pub fn fuzz_float_2<F: Float, E: Fn(F, F)>(n: u32, f: E) {
     }
 }
 
-/// Use the builtin if avialable, fallback to apfloat if not
+/// Use the builtin operation if avialable, fallback to apfloat if not
 #[macro_export]
 macro_rules! apfloat_fallback {
-    ($float_ty:ty, $apfloat_ty:ty, $x:expr, $op:tt, $y:expr, $sys_available:meta) => {{
+    // binary
+    (
+        $float_ty:ty,
+        $apfloat_ty:ident,
+        $x:expr,
+        $y:expr,
+        $op:expr,
+        $sys_available:meta
+        // $(, ret_float=false)?
+        $(, $($convert_args:tt)*)?
+        $(,)?
+    ) => {{
         #[cfg($sys_available)]
-        let ret = $x $op $y;
+        let ret = {
+            type FloatTy = $float_ty;
+            $op($x, $y)
+        };
+
+        #[cfg(not($sys_available))]
+        let ret = {
+            use rustc_apfloat::Float;
+            type FloatTy = rustc_apfloat::ieee::$apfloat_ty;
+
+            let x_ap = FloatTy::from_bits($x.to_bits().into());
+            let y_ap = FloatTy::from_bits($y.to_bits().into());
+
+            apfloat_fallback!(@convert $float_ty, $op(x_ap, y_ap), $($($convert_args)*)?)
+        };
+
+        ret
+    }};
+
+    // unary
+    (
+        $float_ty:ty,
+        $apfloat_ty:ident,
+        $x:expr,
+        $op:expr,
+        $sys_available:meta
+        $(, $($convert_args:tt)*)?
+        $(,)?
+    ) => {{
+        #[cfg($sys_available)]
+        let ret = {
+            type FloatTy = $float_ty;
+            $op($x)
+        };
 
         #[cfg(not($sys_available))]
         let ret = {
-            let x_ap = <$apfloat_ty>::from_bits($x.to_bits().into());
-            let y_ap = <$apfloat_ty>::from_bits($y.to_bits().into());
-            // ignore the status in `rustc_apfloat::StatusAnd`
-            let res = (x_ap $op y_ap).value;
+            use rustc_apfloat::Float;
+            type FloatTy = rustc_apfloat::ieee::$apfloat_ty;
 
-            <$float_ty>::from_bits(res.to_bits().try_into().unwrap())
+            let x_ap = FloatTy::from_bits($x.to_bits().into());
+            apfloat_fallback!(@convert $float_ty, $op(x_ap), $($($convert_args)*)?)
         };
 
         ret
-    }}
+    }};
+
+
+    // Other operations do not need unwrapping
+    (@convert $float_ty:ty, $val:expr, ret_float=false) => {
+        $val
+    };
+
+    // Some apfloat operations return a `StatusAnd` that we need to extract the value from
+    (@convert $float_ty:ty, $val:expr,) => {{
+        // ignore the status, just get the value
+        let unwrapped = $val.value;
+
+        <$float_ty>::from_bits(FloatTy::to_bits(unwrapped).try_into().unwrap())
+    }};
+
 }
diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs
index 32b54ff1..229d2be6 100644
--- a/testcrate/tests/addsub.rs
+++ b/testcrate/tests/addsub.rs
@@ -2,6 +2,7 @@
 #![feature(f128)]
 #![feature(f16)]
 
+use core::ops::{Add, Sub};
 use testcrate::*;
 
 macro_rules! sum {
@@ -73,11 +74,11 @@ fn addsub() {
 }
 
 macro_rules! float_sum {
-    ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ty, $sys_available:meta);*;) => {
+    ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
         $(
             fuzz_float_2(N, |x: $f, y: $f| {
-                let add0 = apfloat_fallback!($f, $apfloat_ty, x, +, y, $sys_available);
-                let sub0 = apfloat_fallback!($f, $apfloat_ty, x, -, y, $sys_available);
+                let add0 = apfloat_fallback!($f, $apfloat_ty, x, y, Add::add, $sys_available);
+                let sub0 = apfloat_fallback!($f, $apfloat_ty, x, y, Sub::sub, $sys_available);
                 let add1: $f = $fn_add(x, y);
                 let sub1: $f = $fn_sub(x, y);
                 if !Float::eq_repr(add0, add1) {
@@ -114,8 +115,6 @@ fn float_addsub() {
     #[cfg(not(feature = "no-f16-f128"))]
     {
         use compiler_builtins::float::{add::__addtf3, sub::__subtf3, Float};
-        use rustc_apfloat::ieee::Quad;
-        use rustc_apfloat::{Float as _, FloatConvert as _};
 
         float_sum!(
             f128, __addtf3, __subtf3, Quad, not(feature = "no-sys-f128");
@@ -133,7 +132,7 @@ fn float_addsub_arm() {
     };
 
     float_sum!(
-        f32, __addsf3vfp, __subsf3vfp;
-        f64, __adddf3vfp, __subdf3vfp;
+        f32, __addsf3vfp, __subsf3vfp, Single, all();
+        f64, __adddf3vfp, __subdf3vfp, Double, all();
     );
 }
diff --git a/testcrate/tests/big.rs b/testcrate/tests/big.rs
index 13cd8e18..84320bc6 100644
--- a/testcrate/tests/big.rs
+++ b/testcrate/tests/big.rs
@@ -1,4 +1,4 @@
-use compiler_builtins::int::{i256, u256, HInt, Int, MinInt};
+use compiler_builtins::int::{i256, u256, HInt, MinInt};
 
 const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff;
 
diff --git a/testcrate/tests/cmp.rs b/testcrate/tests/cmp.rs
index 7ad90a7c..1ad263f4 100644
--- a/testcrate/tests/cmp.rs
+++ b/testcrate/tests/cmp.rs
@@ -1,21 +1,39 @@
 #![allow(unused_macros)]
+#![allow(unreachable_code)]
 #![feature(f128)]
 #![feature(f16)]
 
 use testcrate::*;
 
 macro_rules! cmp {
-    ($x:ident, $y:ident, $($unordered_val:expr, $fn:ident);*;) => {
+    (
+        $f:ty, $x:ident, $y:ident, $apfloat_ty:ident, $sys_available:meta,
+        $($unordered_val:expr, $fn:ident);*;
+    ) => {
         $(
-            let cmp0 = if $x.is_nan() || $y.is_nan() {
+            let cmp0 = if apfloat_fallback!($f, $apfloat_ty, $x,
+                    |x: FloatTy| x.is_nan(),
+                    $sys_available, ret_float = false
+                ) || apfloat_fallback!($f, $apfloat_ty, $y,
+                    |y: FloatTy| y.is_nan(),
+                    $sys_available, ret_float = false
+                )
+            {
                 $unordered_val
-            } else if $x < $y {
+            } else if apfloat_fallback!($f, $apfloat_ty, $x, $y,
+                |x, y| x < y,
+                $sys_available, ret_float = false
+            ) {
                 -1
-            } else if $x == $y {
+            } else if apfloat_fallback!($f, $apfloat_ty, $x, $y,
+                |x, y| x == y,
+                $sys_available, ret_float = false
+            ) {
                 0
             } else {
                 1
             };
+
             let cmp1 = $fn($x, $y);
             if cmp0 != cmp1 {
                 panic!(
@@ -38,7 +56,7 @@ fn float_comparisons() {
 
     fuzz_float_2(N, |x: f32, y: f32| {
         assert_eq!(__unordsf2(x, y) != 0, x.is_nan() || y.is_nan());
-        cmp!(x, y,
+        cmp!(f32, x, y, Single, all(),
             1, __ltsf2;
             1, __lesf2;
             1, __eqsf2;
@@ -49,7 +67,7 @@ fn float_comparisons() {
     });
     fuzz_float_2(N, |x: f64, y: f64| {
         assert_eq!(__unorddf2(x, y) != 0, x.is_nan() || y.is_nan());
-        cmp!(x, y,
+        cmp!(f64, x, y, Double, all(),
             1, __ltdf2;
             1, __ledf2;
             1, __eqdf2;
@@ -58,26 +76,42 @@ fn float_comparisons() {
             1, __nedf2;
         );
     });
-}
 
-#[cfg(not(feature = "no-sys-f128"))]
-#[test]
-fn float_comparisons_f128() {
-    use compiler_builtins::float::cmp::{
-        __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2,
-    };
+    #[cfg(not(feature = "no-f16-f128"))]
+    {
+        use compiler_builtins::float::cmp::{
+            __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2,
+        };
 
-    fuzz_float_2(N, |x: f128, y: f128| {
-        assert_eq!(__unordtf2(x, y) != 0, x.is_nan() || y.is_nan());
-        cmp!(x, y,
-            1, __lttf2;
-            1, __letf2;
-            1, __eqtf2;
-            -1, __getf2;
-            -1, __gttf2;
-            1, __netf2;
-        );
-    });
+        fuzz_float_2(N, |x: f128, y: f128| {
+            // let x_isnan = apfloat_fallback!(
+            //     f128,
+            //     Quad,
+            //     x,
+            //     |x: FloatTy| x.is_nan(),
+            //     not(feature = "no-sys-f128"),
+            //     ret_float = false
+            // );
+            // let y_isnan = apfloat_fallback!(
+            //     f128,
+            //     Quad,
+            //     y,
+            //     |y: FloatTy| y.is_nan(),
+            //     not(feature = "no-sys-f128"),
+            //     ret_float = false
+            // );
+            // assert_eq!(__unordtf2(x, y) != 0, x_isnan || y_isnan);
+
+            cmp!(f128, x, y, Quad, not(feature = "no-sys-f128"),
+                1, __lttf2;
+                1, __letf2;
+                1, __eqtf2;
+                -1, __getf2;
+                -1, __gttf2;
+                1, __netf2;
+            );
+        });
+    }
 }
 
 macro_rules! cmp2 {
diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs
index 9439ca43..862b42d3 100644
--- a/testcrate/tests/div_rem.rs
+++ b/testcrate/tests/div_rem.rs
@@ -2,10 +2,10 @@
 #![feature(f128)]
 #![feature(f16)]
 
+use core::ops::Div;
+
 use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
 use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc};
-use rustc_apfloat::ieee::{Double, Single};
-use rustc_apfloat::{Float as _, FloatConvert as _};
 
 use testcrate::*;
 
@@ -111,10 +111,10 @@ fn divide_sparc() {
 }
 
 macro_rules! float {
-    ($($f:ty, $fn:ident, $apfloat_ty:ty, $sys_available:meta);*;) => {
+    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
         $(
             fuzz_float_2(N, |x: $f, y: $f| {
-                let quo0: $f = apfloat_fallback!($f, $apfloat_ty, x, /, y, $sys_available);
+                let quo0: $f = apfloat_fallback!($f, $apfloat_ty, x, y, Div::div, $sys_available);
                 let quo1: $f = $fn(x, y);
                 #[cfg(not(target_arch = "arm"))]
                 if !Float::eq_repr(quo0, quo1) {
@@ -163,7 +163,6 @@ fn float_div() {
     #[cfg(not(feature = "no-f16-f128"))]
     {
         use compiler_builtins::float::div::__divtf3;
-        use rustc_apfloat::ieee::Quad;
 
         float!(
             f128, __divtf3, Quad, not(feature = "no-sys-f128");
@@ -189,7 +188,7 @@ fn float_div_arm() {
     };
 
     float!(
-        f32, __divsf3vfp;
-        f64, __divdf3vfp;
+        f32, __divsf3vfp, Single, all();
+        f64, __divdf3vfp, Double, all();
     );
 }
diff --git a/testcrate/tests/mul.rs b/testcrate/tests/mul.rs
index 446d5c46..a37c7903 100644
--- a/testcrate/tests/mul.rs
+++ b/testcrate/tests/mul.rs
@@ -2,6 +2,7 @@
 #![feature(f128)]
 #![feature(f16)]
 
+use core::ops::Mul;
 use testcrate::*;
 
 macro_rules! mul {
@@ -84,10 +85,10 @@ fn overflowing_mul() {
 }
 
 macro_rules! float_mul {
-    ($($f:ty, $fn:ident);*;) => {
+    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
         $(
             fuzz_float_2(N, |x: $f, y: $f| {
-                let mul0 = x * y;
+                let mul0 = apfloat_fallback!($f, $apfloat_ty, x, y, Mul::mul, $sys_available);
                 let mul1: $f = $fn(x, y);
                 // multiplication of subnormals is not currently handled
                 if !(Float::is_subnormal(mul0) || Float::is_subnormal(mul1)) {
@@ -112,19 +113,18 @@ fn float_mul() {
     };
 
     float_mul!(
-        f32, __mulsf3;
-        f64, __muldf3;
+        f32, __mulsf3, Single, all();
+        f64, __muldf3, Double, all();
     );
-}
 
-#[test]
-#[cfg(not(feature = "no-sys-f128"))]
-fn float_mul_f128() {
-    use compiler_builtins::float::{mul::__multf3, Float};
+    #[cfg(not(feature = "no-f16-f128"))]
+    {
+        use compiler_builtins::float::mul::__multf3;
 
-    float_mul!(
-        f128, __multf3;
-    );
+        float_mul!(
+            f128, __multf3, Quad, not(feature = "no-sys-f128");
+        );
+    }
 }
 
 #[cfg(target_arch = "arm")]
@@ -136,7 +136,7 @@ fn float_mul_arm() {
     };
 
     float_mul!(
-        f32, __mulsf3vfp;
-        f64, __muldf3vfp;
+        f32, __mulsf3vfp, Single, all();
+        f64, __muldf3vfp, Double, all();
     );
 }

From 0facad1ce7e19884cc839df083060bebc430dfb3 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Fri, 10 May 2024 18:22:58 -0500
Subject: [PATCH 13/13] Macros are working for all f128 cases

---
 testcrate/src/lib.rs       | 55 +++++++---------------------------
 testcrate/tests/addsub.rs  |  4 +--
 testcrate/tests/cmp.rs     | 61 +++++++++++++++++++-------------------
 testcrate/tests/div_rem.rs |  2 +-
 testcrate/tests/mul.rs     |  2 +-
 5 files changed, 46 insertions(+), 78 deletions(-)

diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs
index 4112e3ac..f9606d47 100644
--- a/testcrate/src/lib.rs
+++ b/testcrate/src/lib.rs
@@ -267,22 +267,18 @@ pub fn fuzz_float_2<F: Float, E: Fn(F, F)>(n: u32, f: E) {
-/// Use the builtin operation if avialable, fallback to apfloat if not
+/// Use the builtin operation if available, fall back to apfloat if not
 #[macro_export]
 macro_rules! apfloat_fallback {
-    // binary
     (
         $float_ty:ty,
         $apfloat_ty:ident,
-        $x:expr,
-        $y:expr,
-        $op:expr,
-        $sys_available:meta
-        // $(, ret_float=false)?
-        $(, $($convert_args:tt)*)?
+        $sys_available:meta,
+        $op:expr $(=> $convert:ident)?,
+        $($arg:expr),+
         $(,)?
     ) => {{
         #[cfg($sys_available)]
         let ret = {
             type FloatTy = $float_ty;
-            $op($x, $y)
+            $op( $($arg),+ )
         };
 
         #[cfg(not($sys_available))]
@@ -290,51 +286,22 @@ macro_rules! apfloat_fallback {
             use rustc_apfloat::Float;
             type FloatTy = rustc_apfloat::ieee::$apfloat_ty;
 
-            let x_ap = FloatTy::from_bits($x.to_bits().into());
-            let y_ap = FloatTy::from_bits($y.to_bits().into());
+            let op_res = $op( $(FloatTy::from_bits($arg.to_bits().into())),+ );
 
-            apfloat_fallback!(@convert $float_ty, $op(x_ap, y_ap), $($($convert_args)*)?)
+            apfloat_fallback!(@convert $float_ty, op_res $(,$convert)?)
         };
 
         ret
     }};
 
-    // unary
-    (
-        $float_ty:ty,
-        $apfloat_ty:ident,
-        $x:expr,
-        $op:expr,
-        $sys_available:meta
-        $(, $($convert_args:tt)*)?
-        $(,)?
-    ) => {{
-        #[cfg($sys_available)]
-        let ret = {
-            type FloatTy = $float_ty;
-            $op($x)
-        };
-
-        #[cfg(not($sys_available))]
-        let ret = {
-            use rustc_apfloat::Float;
-            type FloatTy = rustc_apfloat::ieee::$apfloat_ty;
-
-            let x_ap = FloatTy::from_bits($x.to_bits().into());
-            apfloat_fallback!(@convert $float_ty, $op(x_ap), $($($convert_args)*)?)
-        };
-
-        ret
-    }};
-
-
-    // Other operations do not need unwrapping
-    (@convert $float_ty:ty, $val:expr, ret_float=false) => {
+    // Operations that do not need converting back to a float
+    (@convert $float_ty:ty, $val:expr, no_convert) => {
         $val
     };
 
-    // Some apfloat operations return a `StatusAnd` that we need to extract the value from
-    (@convert $float_ty:ty, $val:expr,) => {{
+    // Some apfloat operations return a `StatusAnd` that we need to extract the value from. This
+    // is the default.
+    (@convert $float_ty:ty, $val:expr) => {{
         // ignore the status, just get the value
         let unwrapped = $val.value;
 
diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs
index 229d2be6..20503003 100644
--- a/testcrate/tests/addsub.rs
+++ b/testcrate/tests/addsub.rs
@@ -77,8 +77,8 @@ macro_rules! float_sum {
     ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
         $(
             fuzz_float_2(N, |x: $f, y: $f| {
-                let add0 = apfloat_fallback!($f, $apfloat_ty, x, y, Add::add, $sys_available);
-                let sub0 = apfloat_fallback!($f, $apfloat_ty, x, y, Sub::sub, $sys_available);
+                let add0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Add::add, x, y);
+                let sub0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Sub::sub, x, y);
                 let add1: $f = $fn_add(x, y);
                 let sub1: $f = $fn_sub(x, y);
                 if !Float::eq_repr(add0, add1) {
diff --git a/testcrate/tests/cmp.rs b/testcrate/tests/cmp.rs
index 1ad263f4..45a6ee90 100644
--- a/testcrate/tests/cmp.rs
+++ b/testcrate/tests/cmp.rs
@@ -11,34 +11,38 @@ macro_rules! cmp {
         $($unordered_val:expr, $fn:ident);*;
     ) => {
         $(
-            let cmp0 = if apfloat_fallback!($f, $apfloat_ty, $x,
-                    |x: FloatTy| x.is_nan(),
-                    $sys_available, ret_float = false
-                ) || apfloat_fallback!($f, $apfloat_ty, $y,
-                    |y: FloatTy| y.is_nan(),
-                    $sys_available, ret_float = false
+            let cmp0 = if apfloat_fallback!(
+                    $f, $apfloat_ty, $sys_available,
+                    |x: FloatTy| x.is_nan() => no_convert,
+                    $x
+                ) || apfloat_fallback!(
+                    $f, $apfloat_ty, $sys_available,
+                    |y: FloatTy| y.is_nan() => no_convert,
+                    $y
                 )
             {
                 $unordered_val
-            } else if apfloat_fallback!($f, $apfloat_ty, $x, $y,
-                |x, y| x < y,
-                $sys_available, ret_float = false
+            } else if apfloat_fallback!(
+                $f, $apfloat_ty, $sys_available,
+                |x, y| x < y => no_convert,
+                $x, $y
             ) {
                 -1
-            } else if apfloat_fallback!($f, $apfloat_ty, $x, $y,
-                |x, y| x == y,
-                $sys_available, ret_float = false
+            } else if apfloat_fallback!(
+                $f, $apfloat_ty, $sys_available,
+                |x, y| x == y => no_convert,
+                $x, $y
             ) {
                 0
             } else {
                 1
             };
 
             let cmp1 = $fn($x, $y);
             if cmp0 != cmp1 {
                 panic!(
                     "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
-                    stringify!($fn_builtins), $x, $y, cmp0, cmp1
+                    stringify!($fn), $x, $y, cmp0, cmp1
                 );
             }
         )*
@@ -84,23 +88,18 @@ fn float_comparisons() {
         };
 
         fuzz_float_2(N, |x: f128, y: f128| {
-            // let x_isnan = apfloat_fallback!(
-            //     f128,
-            //     Quad,
-            //     x,
-            //     |x: FloatTy| x.is_nan(),
-            //     not(feature = "no-sys-f128"),
-            //     ret_float = false
-            // );
-            // let y_isnan = apfloat_fallback!(
-            //     f128,
-            //     Quad,
-            //     y,
-            //     |y: FloatTy| y.is_nan(),
-            //     not(feature = "no-sys-f128"),
-            //     ret_float = false
-            // );
-            // assert_eq!(__unordtf2(x, y) != 0, x_isnan || y_isnan);
+            let x_is_nan = apfloat_fallback!(
+                f128, Quad, not(feature = "no-sys-f128"),
+                |x: FloatTy| x.is_nan() => no_convert,
+                x
+            );
+            let y_is_nan = apfloat_fallback!(
+                f128, Quad, not(feature = "no-sys-f128"),
+                |x: FloatTy| x.is_nan() => no_convert,
+                y
+            );
+
+            assert_eq!(__unordtf2(x, y) != 0, x_is_nan || y_is_nan);
 
             cmp!(f128, x, y, Quad, not(feature = "no-sys-f128"),
                 1, __lttf2;
diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs
index 862b42d3..4ccc5660 100644
--- a/testcrate/tests/div_rem.rs
+++ b/testcrate/tests/div_rem.rs
@@ -114,7 +114,7 @@ macro_rules! float {
     ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
         $(
             fuzz_float_2(N, |x: $f, y: $f| {
-                let quo0: $f = apfloat_fallback!($f, $apfloat_ty, x, y, Div::div, $sys_available);
+                let quo0: $f = apfloat_fallback!($f, $apfloat_ty, $sys_available, Div::div, x, y);
                 let quo1: $f = $fn(x, y);
                 #[cfg(not(target_arch = "arm"))]
                 if !Float::eq_repr(quo0, quo1) {
diff --git a/testcrate/tests/mul.rs b/testcrate/tests/mul.rs
index a37c7903..39a71ac5 100644
--- a/testcrate/tests/mul.rs
+++ b/testcrate/tests/mul.rs
@@ -88,7 +88,7 @@ macro_rules! float_mul {
     ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
         $(
             fuzz_float_2(N, |x: $f, y: $f| {
-                let mul0 = apfloat_fallback!($f, $apfloat_ty, x, y, Mul::mul, $sys_available);
+                let mul0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Mul::mul, x, y);
                 let mul1: $f = $fn(x, y);
                 // multiplication of subnormals is not currently handled
                 if !(Float::is_subnormal(mul0) || Float::is_subnormal(mul1)) {