diff --git a/build.rs b/build.rs index bafbf75d..1229fb2a 100644 --- a/build.rs +++ b/build.rs @@ -479,10 +479,6 @@ mod c { ("__floatsitf", "floatsitf.c"), ("__floatunditf", "floatunditf.c"), ("__floatunsitf", "floatunsitf.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), - ("__divtf3", "divtf3.c"), ("__powitf2", "powitf2.c"), ("__fe_getround", "fp_mode.c"), ("__fe_raise_inexact", "fp_mode.c"), @@ -500,30 +496,22 @@ mod c { if target_arch == "mips64" { sources.extend(&[ ("__netf2", "comparetf2.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), ("__fixtfsi", "fixtfsi.c"), ("__floatsitf", "floatsitf.c"), ("__fixunstfsi", "fixunstfsi.c"), ("__floatunsitf", "floatunsitf.c"), ("__fe_getround", "fp_mode.c"), - ("__divtf3", "divtf3.c"), ]); } if target_arch == "loongarch64" { sources.extend(&[ ("__netf2", "comparetf2.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), ("__fixtfsi", "fixtfsi.c"), ("__floatsitf", "floatsitf.c"), ("__fixunstfsi", "fixunstfsi.c"), ("__floatunsitf", "floatunsitf.c"), ("__fe_getround", "fp_mode.c"), - ("__divtf3", "divtf3.c"), ]); } diff --git a/src/float/add.rs b/src/float/add.rs index e2fb8407..8fa9dac5 100644 --- a/src/float/add.rs +++ b/src/float/add.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; /// Returns `a + b` fn add(a: F, b: F) -> F @@ -57,9 +57,9 @@ where } // zero + anything = anything - if a_abs == Int::ZERO { + if a_abs == MinInt::ZERO { // but we need to get the sign right for zero + zero - if b_abs == Int::ZERO { + if b_abs == MinInt::ZERO { return F::from_repr(a.repr() & b.repr()); } else { return b; @@ -67,7 +67,7 @@ where } // anything + zero = anything - if b_abs == Int::ZERO { + if b_abs == MinInt::ZERO { return a; } } @@ -113,10 +113,10 @@ where // Shift the significand of b by the difference in exponents, with a sticky // bottom bit to get rounding correct. let align = a_exponent.wrapping_sub(b_exponent).cast(); - if align != Int::ZERO { + if align != MinInt::ZERO { if align < bits { let sticky = - F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != Int::ZERO); + F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != MinInt::ZERO); b_significand = (b_significand >> align.cast()) | sticky; } else { b_significand = one; // sticky; b is known to be non-zero. @@ -125,8 +125,8 @@ where if subtraction { a_significand = a_significand.wrapping_sub(b_significand); // If a == -b, return +zero. - if a_significand == Int::ZERO { - return F::from_repr(Int::ZERO); + if a_significand == MinInt::ZERO { + return F::from_repr(MinInt::ZERO); } // If partial cancellation occured, we need to left-shift the result @@ -143,8 +143,8 @@ where // If the addition carried up, we need to right-shift the result and // adjust the exponent: - if a_significand & implicit_bit << 4 != Int::ZERO { - let sticky = F::Int::from_bool(a_significand & one != Int::ZERO); + if a_significand & implicit_bit << 4 != MinInt::ZERO { + let sticky = F::Int::from_bool(a_significand & one != MinInt::ZERO); a_significand = a_significand >> 1 | sticky; a_exponent += 1; } @@ -160,7 +160,7 @@ where // need to shift the significand. 
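// --- Illustrative aside (not part of the patch): a minimal standalone sketch of the
// "sticky bit" trick the surrounding addition code relies on. When a significand is
// shifted right to align exponents (or to denormalize a result), the bits shifted out
// must still influence rounding, so they are OR-folded into the lowest surviving bit.
// Names and the u32 width here are hypothetical, chosen only for the sketch.
fn shift_right_sticky(significand: u32, shift: u32) -> u32 {
    debug_assert!(shift > 0 && shift < u32::BITS);
    // Non-zero iff any of the bits about to be discarded are set.
    let sticky = ((significand << (u32::BITS - shift)) != 0) as u32;
    (significand >> shift) | sticky
}
// E.g. shift_right_sticky(0b1001, 2) == 0b11: the discarded `01` is remembered as a
// single set low bit so a later round-to-nearest decision is still correct.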
let shift = (1 - a_exponent).cast(); let sticky = - F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != Int::ZERO); + F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != MinInt::ZERO); a_significand = a_significand >> shift.cast() | sticky; a_exponent = 0; } diff --git a/src/float/cmp.rs b/src/float/cmp.rs index 46e903dc..ae05a3a6 100644 --- a/src/float/cmp.rs +++ b/src/float/cmp.rs @@ -1,7 +1,7 @@ #![allow(unreachable_code)] use crate::float::Float; -use crate::int::Int; +use crate::int::MinInt; #[derive(Clone, Copy)] enum Result { diff --git a/src/float/div.rs b/src/float/div.rs index 9038f6b9..6f64dfae 100644 --- a/src/float/div.rs +++ b/src/float/div.rs @@ -3,7 +3,9 @@ #![allow(clippy::needless_return)] use crate::float::Float; -use crate::int::{CastInto, DInt, HInt, Int}; +use crate::int::{CastInto, DInt, HInt, Int, MinInt}; + +use super::HalfRep; fn div32(a: F, b: F) -> F where @@ -37,6 +39,11 @@ where let quiet_bit = implicit_bit >> 1; let qnan_rep = exponent_mask | quiet_bit; + // #[inline(always)] + // fn negate(a: T) -> T { + // T::wrapping_neg(a.signe) + // } + #[inline(always)] fn negate_u32(a: u32) -> u32 { (::wrapping_neg(a as i32)) as u32 @@ -459,10 +466,14 @@ where i32: CastInto, F::Int: CastInto, u64: CastInto, + u64: CastInto>, + F::Int: CastInto>, + F::Int: From>, + F::Int: From, F::Int: CastInto, i64: CastInto, F::Int: CastInto, - F::Int: HInt, + F::Int: HInt + DInt, { const NUMBER_OF_HALF_ITERATIONS: usize = 3; const NUMBER_OF_FULL_ITERATIONS: usize = 1; @@ -471,7 +482,7 @@ where let one = F::Int::ONE; let zero = F::Int::ZERO; let hw = F::BITS / 2; - let lo_mask = u64::MAX >> hw; + let lo_mask = F::Int::MAX >> hw; let significand_bits = F::SIGNIFICAND_BITS; let max_exponent = F::EXPONENT_MAX; @@ -616,8 +627,9 @@ where let mut x_uq0 = if NUMBER_OF_HALF_ITERATIONS > 0 { // Starting with (n-1) half-width iterations - let b_uq1_hw: u32 = - (CastInto::::cast(b_significand) >> (significand_bits + 1 - hw)) as u32; + let b_uq1_hw: HalfRep = CastInto::>::cast( + CastInto::::cast(b_significand) >> (significand_bits + 1 - hw), + ); // C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW // with W0 being either 16 or 32 and W0 <= HW. @@ -625,12 +637,13 @@ where // b/2 is subtracted to obtain x0) wrapped to [0, 1) range. // HW is at least 32. Shifting into the highest bits if needed. - let c_hw = (0x7504F333_u64 as u32).wrapping_shl(hw.wrapping_sub(32)); + let c_hw = (CastInto::>::cast(0x7504F333_u64)).wrapping_shl(hw.wrapping_sub(32)); // b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572, // so x0 fits to UQ0.HW without wrapping. - let x_uq0_hw: u32 = { - let mut x_uq0_hw: u32 = c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */); + let x_uq0_hw: HalfRep = { + let mut x_uq0_hw: HalfRep = + c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */); // dbg!(x_uq0_hw); // An e_0 error is comprised of errors due to // * x0 being an inherently imprecise first approximation of 1/b_hw @@ -661,8 +674,9 @@ where // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is // expected to be strictly positive because b_UQ1_hw has its highest bit set // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1). 
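// --- Illustrative aside (not part of the patch): one Newton-Raphson reciprocal step in
// unsigned fixed point, the same shape as the half-width iteration above. Here `b` is a
// divisor significand in UQ1.31 (1.0 <= b < 2.0 maps to 1 << 31 ..= u32::MAX) and `x`
// approximates 1/b in UQ0.32. This is a standalone sketch with hypothetical names and a
// fixed 32-bit half-width; it is not the patch's generic `HalfRep`-based API.
fn recip_nr_step(x: u32, b: u32) -> u32 {
    // corr ~ 2 - b*x in UQ1.31: the UQ0.32 x UQ1.31 product is truncated to 31 fraction
    // bits, then wrapping negation supplies the "2 -" because UQ1.31 wraps modulo 2.
    let corr = 0u32.wrapping_sub(((x as u64 * b as u64) >> 32) as u32);
    // x_new = x * corr, renormalized back to UQ0.32 by dropping the 31 fraction bits of corr.
    ((x as u64 * corr as u64) >> 31) as u32
}
// Starting from a rough seed, each such step roughly doubles the number of correct bits,
// which is why a few cheap half-width steps precede the single full-width iteration.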
- let corr_uq1_hw: u32 = - 0.wrapping_sub(((x_uq0_hw as u64).wrapping_mul(b_uq1_hw as u64)) >> hw) as u32; + let corr_uq1_hw: HalfRep = CastInto::>::cast(zero.wrapping_sub( + ((F::Int::from(x_uq0_hw)).wrapping_mul(F::Int::from(b_uq1_hw))) >> hw, + )); // dbg!(corr_uq1_hw); // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally @@ -677,7 +691,9 @@ where // The fact corr_UQ1_hw was virtually round up (due to result of // multiplication being **first** truncated, then negated - to improve // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw. - x_uq0_hw = ((x_uq0_hw as u64).wrapping_mul(corr_uq1_hw as u64) >> (hw - 1)) as u32; + x_uq0_hw = ((F::Int::from(x_uq0_hw)).wrapping_mul(F::Int::from(corr_uq1_hw)) + >> (hw - 1)) + .cast(); // dbg!(x_uq0_hw); // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after @@ -707,7 +723,7 @@ where // be not below that value (see g(x) above), so it is safe to decrement just // once after the final iteration. On the other hand, an effective value of // divisor changes after this point (from b_hw to b), so adjust here. - x_uq0_hw.wrapping_sub(1_u32) + x_uq0_hw.wrapping_sub(HalfRep::::ONE) }; // Error estimations for full-precision iterations are calculated just @@ -717,7 +733,7 @@ where // Simulating operations on a twice_rep_t to perform a single final full-width // iteration. Using ad-hoc multiplication implementations to take advantage // of particular structure of operands. - let blo: u64 = (CastInto::::cast(b_uq1)) & lo_mask; + let blo: F::Int = b_uq1 & lo_mask; // x_UQ0 = x_UQ0_hw * 2^HW - 1 // x_UQ0 * b_UQ1 = (x_UQ0_hw * 2^HW) * (b_UQ1_hw * 2^HW + blo) - b_UQ1 // @@ -726,19 +742,20 @@ where // + [ x_UQ0_hw * blo ] // - [ b_UQ1 ] // = [ result ][.... discarded ...] - let corr_uq1 = negate_u64( - (x_uq0_hw as u64) * (b_uq1_hw as u64) + (((x_uq0_hw as u64) * (blo)) >> hw) - 1, - ); // account for *possible* carry - let lo_corr = corr_uq1 & lo_mask; - let hi_corr = corr_uq1 >> hw; + let corr_uq1: F::Int = (F::Int::from(x_uq0_hw) * F::Int::from(b_uq1_hw) + + ((F::Int::from(x_uq0_hw) * blo) >> hw)) + .wrapping_sub(one) + .wrapping_neg(); // account for *possible* carry + let lo_corr: F::Int = corr_uq1 & lo_mask; + let hi_corr: F::Int = corr_uq1 >> hw; // x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1 - let mut x_uq0: ::Int = ((((x_uq0_hw as u64) * hi_corr) << 1) - .wrapping_add(((x_uq0_hw as u64) * lo_corr) >> (hw - 1)) - .wrapping_sub(2)) - .cast(); // 1 to account for the highest bit of corr_UQ1 can be 1 - // 1 to account for possible carry - // Just like the case of half-width iterations but with possibility - // of overflowing by one extra Ulp of x_UQ0. + let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1) + .wrapping_add((F::Int::from(x_uq0_hw) * lo_corr) >> (hw - 1)) + .wrapping_sub(F::Int::from(2u8)); + // 1 to account for the highest bit of corr_UQ1 can be 1 + // 1 to account for possible carry + // Just like the case of half-width iterations but with possibility + // of overflowing by one extra Ulp of x_UQ0. x_uq0 -= one; // ... 
and then traditional fixup by 2 should work @@ -755,8 +772,8 @@ where x_uq0 } else { // C is (3/4 + 1/sqrt(2)) - 1 truncated to 64 fractional bits as UQ0.n - let c: ::Int = (0x7504F333 << (F::BITS - 32)).cast(); - let x_uq0: ::Int = c.wrapping_sub(b_uq1); + let c: F::Int = (0x7504F333 << (F::BITS - 32)).cast(); + let x_uq0: F::Int = c.wrapping_sub(b_uq1); // E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-64 x_uq0 }; @@ -799,14 +816,27 @@ where // Add 2 to U_N due to final decrement. - let reciprocal_precision: ::Int = 220.cast(); + let reciprocal_precision: F::Int = if F::BITS == 32 + && NUMBER_OF_HALF_ITERATIONS == 2 + && NUMBER_OF_FULL_ITERATIONS == 1 + { + 74.cast() + } else if F::BITS == 32 && NUMBER_OF_HALF_ITERATIONS == 0 && NUMBER_OF_FULL_ITERATIONS == 3 { + 10.cast() + } else if F::BITS == 64 && NUMBER_OF_HALF_ITERATIONS == 3 && NUMBER_OF_FULL_ITERATIONS == 1 { + 220.cast() + } else if F::BITS == 128 && NUMBER_OF_HALF_ITERATIONS == 4 && NUMBER_OF_FULL_ITERATIONS == 1 { + 13922.cast() + } else { + panic!("invalid iterations for the specified bits"); + }; // Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W let x_uq0 = x_uq0 - reciprocal_precision; // Now 1/b - (2*P) * 2^-W < x < 1/b // FIXME Is x_UQ0 still >= 0.5? - let mut quotient: ::Int = x_uq0.widen_mul(a_significand << 1).hi(); + let mut quotient: F::Int = x_uq0.widen_mul(a_significand << 1).hi(); // Now, a/b - 4*P * 2^-W < q < a/b for q= in UQ1.(SB+1+W). // quotient_UQ1 is in [0.5, 2.0) as UQ1.(SB+1), @@ -914,13 +944,8 @@ intrinsics! { div64(a, b) } - // TODO: how should `HInt` be handled? pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 { - if cfg!(target_pointer_width = "64") { - div32(a, b) - } else { - div64(a, b) - } + div64(a, b) } #[cfg(target_arch = "arm")] diff --git a/src/float/extend.rs b/src/float/extend.rs index 7c244660..5b0c0d97 100644 --- a/src/float/extend.rs +++ b/src/float/extend.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; /// Generic conversion from a narrower to a wider IEEE-754 floating-point type fn extend(a: F) -> R diff --git a/src/float/mod.rs b/src/float/mod.rs index 02d291ed..a82dd7d2 100644 --- a/src/float/mod.rs +++ b/src/float/mod.rs @@ -59,7 +59,7 @@ pub(crate) trait Float: /// A mask for the significand const SIGNIFICAND_MASK: Self::Int; - /// The implicit bit of the float format + // The implicit bit of the float format const IMPLICIT_BIT: Self::Int; /// A mask for the exponent diff --git a/src/float/mul.rs b/src/float/mul.rs index eed29527..e3e5708e 100644 --- a/src/float/mul.rs +++ b/src/float/mul.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, DInt, HInt, Int}; +use crate::int::{CastInto, DInt, HInt, Int, MinInt}; fn mul(a: F, b: F) -> F where diff --git a/src/float/trunc.rs b/src/float/trunc.rs index 6de446c1..b607a654 100644 --- a/src/float/trunc.rs +++ b/src/float/trunc.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; fn trunc(a: F) -> R where diff --git a/src/int/addsub.rs b/src/int/addsub.rs index f31eff4b..e95590d8 100644 --- a/src/int/addsub.rs +++ b/src/int/addsub.rs @@ -1,6 +1,6 @@ -use crate::int::{DInt, Int}; +use crate::int::{DInt, Int, MinInt}; -trait UAddSub: DInt { +trait UAddSub: DInt + Int { fn uadd(self, other: Self) -> Self { let (lo, carry) = self.lo().overflowing_add(other.lo()); let hi = self.hi().wrapping_add(other.hi()); @@ -22,7 +22,7 @@ impl UAddSub for u128 {} trait AddSub: Int where - ::UnsignedInt: UAddSub, + 
::UnsignedInt: UAddSub, { fn add(self, other: Self) -> Self { Self::from_unsigned(self.unsigned().uadd(other.unsigned())) @@ -37,7 +37,7 @@ impl AddSub for i128 {} trait Addo: AddSub where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn addo(self, other: Self) -> (Self, bool) { let sum = AddSub::add(self, other); @@ -50,7 +50,7 @@ impl Addo for u128 {} trait Subo: AddSub where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn subo(self, other: Self) -> (Self, bool) { let sum = AddSub::sub(self, other); diff --git a/src/int/big.rs b/src/int/big.rs new file mode 100644 index 00000000..a54d6259 --- /dev/null +++ b/src/int/big.rs @@ -0,0 +1,364 @@ +//! Integers used for wide operations, larger than `u128`. + +#![allow(unused)] + +use crate::int::{DInt, HInt, Int, MinInt}; +use core::{fmt, ops}; + +const WORD_LO_MASK: u64 = 0x00000000ffffffff; +const WORD_HI_MASK: u64 = 0xffffffff00000000; +const WORD_FULL_MASK: u64 = 0xffffffffffffffff; +const U128_LO_MASK: u128 = u64::MAX as u128; +const U128_HI_MASK: u128 = (u64::MAX as u128) << 64; + +/// A 256-bit unsigned integer represented as 4 64-bit limbs. +/// +/// Each limb is a native-endian number, but the array is little-limb-endian. +#[allow(non_camel_case_types)] +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] +pub struct u256(pub [u64; 4]); + +impl u256 { + pub const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX]); + + /// Reinterpret as a signed integer + pub fn signed(self) -> i256 { + i256(self.0) + } +} + +/// A 256-bit signed integer represented as 4 64-bit limbs. +/// +/// Each limb is a native-endian number, but the array is little-limb-endian. +#[allow(non_camel_case_types)] +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] +pub struct i256(pub [u64; 4]); + +impl i256 { + /// Reinterpret as an unsigned integer + pub fn unsigned(self) -> u256 { + u256(self.0) + } +} + +impl MinInt for u256 { + type OtherSign = i256; + + type UnsignedInt = u256; + + const SIGNED: bool = false; + const BITS: u32 = 256; + const ZERO: Self = Self([0u64; 4]); + const ONE: Self = Self([1, 0, 0, 0]); + const MIN: Self = Self([0u64; 4]); + const MAX: Self = Self([u64::MAX; 4]); +} + +impl MinInt for i256 { + type OtherSign = u256; + + type UnsignedInt = u256; + + const SIGNED: bool = false; + const BITS: u32 = 256; + const ZERO: Self = Self([0u64; 4]); + const ONE: Self = Self([1, 0, 0, 0]); + const MIN: Self = Self([0, 0, 0, 1 << 63]); + const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX << 1]); +} + +// impl Int for i256 { +// fn is_zero(self) -> bool { +// self == Self::ZERO +// } + +// fn wrapping_neg(self) -> Self { +// Self::ZERO.wrapping_sub(self) +// } + +// fn wrapping_add(self, other: Self) -> Self { +// self.overflowing_add(other).0 +// } +// +// fn overflowing_add(self, other: Self) -> (Self, bool) { +// let x0 = (u128::from(self.0[0])).wrapping_add(u128::from(other.0[0])); +// let v0 = x0 as u64; +// let c0 = x0 >> 64; + +// let x1 = (u128::from(self.0[1])) +// .wrapping_add(u128::from(other.0[1])) +// .wrapping_add(c0); +// let v1 = x1 as u64; +// let c1 = x1 >> 64; + +// let x2 = (u128::from(self.0[2])) +// .wrapping_add(u128::from(other.0[2])) +// .wrapping_add(c1); +// let v2 = x2 as u64; +// let c2 = x2 >> 64; + +// let x3 = (u128::from(self.0[3])) +// .wrapping_add(u128::from(other.0[3])) +// .wrapping_add(c2); +// let v3 = x3 as u64; +// let c3 = x3 >> 64; + +// (Self([v0, v1, v2, v3]), c3 > 0) +// } +// } + +macro_rules! 
impl_common { + ($ty:ty) => { + // impl ops::Add for $ty { + // type Output = Self; + + // fn add(self, rhs: Self) -> Self::Output { + // let (val, wrapped) = self.overflowing_add(rhs); + // debug_assert!(!wrapped, "attempted to add with overflow"); + // val + // } + // } + + // impl ops::AddAssign for $ty { + // fn add_assign(&mut self, rhs: Self) { + // *self = *self + rhs + // } + // } + + // impl ops::BitAnd for $ty { + // type Output = Self; + + // fn bitand(self, rhs: Self) -> Self::Output { + // Self([ + // self.0[0] & rhs.0[0], + // self.0[1] & rhs.0[1], + // self.0[2] & rhs.0[2], + // self.0[3] & rhs.0[3], + // ]) + // } + // } + + // impl ops::BitAndAssign for $ty { + // fn bitand_assign(&mut self, rhs: Self) { + // *self = *self & rhs + // } + // } + + impl ops::BitOr for $ty { + type Output = Self; + + fn bitor(mut self, rhs: Self) -> Self::Output { + self.0[0] |= rhs.0[0]; + self.0[1] |= rhs.0[1]; + self.0[2] |= rhs.0[2]; + self.0[3] |= rhs.0[3]; + self + } + } + + // impl ops::BitOrAssign for $ty { + // fn bitor_assign(&mut self, rhs: Self) { + // *self = *self | rhs + // } + // } + + // impl ops::BitXor for $ty { + // type Output = Self; + + // fn bitxor(self, rhs: Self) -> Self::Output { + // Self([ + // self.0[0] ^ rhs.0[0], + // self.0[1] ^ rhs.0[1], + // self.0[2] ^ rhs.0[2], + // self.0[3] ^ rhs.0[3], + // ]) + // } + // } + + // impl ops::BitXorAssign for $ty { + // fn bitxor_assign(&mut self, rhs: Self) { + // *self = *self ^ rhs + // } + // } + + impl ops::Not for $ty { + type Output = Self; + + fn not(self) -> Self::Output { + Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]]) + } + } + + impl ops::Shl for $ty { + type Output = Self; + + fn shl(self, rhs: u32) -> Self::Output { + todo!() + } + } + }; +} + +impl_common!(i256); +impl_common!(u256); + +macro_rules! 
word { + (1, $val:expr) => { + (($val >> (32 * 3)) & Self::from(WORD_LO_MASK)) as u64 + }; + (2, $val:expr) => { + (($val >> (32 * 2)) & Self::from(WORD_LO_MASK)) as u64 + }; + (3, $val:expr) => { + (($val >> (32 * 1)) & Self::from(WORD_LO_MASK)) as u64 + }; + (4, $val:expr) => { + (($val >> (32 * 0)) & Self::from(WORD_LO_MASK)) as u64 + }; +} + +impl HInt for u128 { + type D = u256; + + fn widen(self) -> Self::D { + let w0 = self & u128::from(u64::MAX); + let w1 = (self >> u64::BITS) & u128::from(u64::MAX); + u256([w0 as u64, w1 as u64, 0, 0]) + } + + fn zero_widen(self) -> Self::D { + self.widen() + } + + fn zero_widen_mul(self, rhs: Self) -> Self::D { + let product11: u64 = word!(1, self) * word!(1, rhs); + let product12: u64 = word!(1, self) * word!(2, rhs); + let product13: u64 = word!(1, self) * word!(3, rhs); + let product14: u64 = word!(1, self) * word!(4, rhs); + let product21: u64 = word!(2, self) * word!(1, rhs); + let product22: u64 = word!(2, self) * word!(2, rhs); + let product23: u64 = word!(2, self) * word!(3, rhs); + let product24: u64 = word!(2, self) * word!(4, rhs); + let product31: u64 = word!(3, self) * word!(1, rhs); + let product32: u64 = word!(3, self) * word!(2, rhs); + let product33: u64 = word!(3, self) * word!(3, rhs); + let product34: u64 = word!(3, self) * word!(4, rhs); + let product41: u64 = word!(4, self) * word!(1, rhs); + let product42: u64 = word!(4, self) * word!(2, rhs); + let product43: u64 = word!(4, self) * word!(3, rhs); + let product44: u64 = word!(4, self) * word!(4, rhs); + + let sum0: u128 = u128::from(product44); + let sum1: u128 = u128::from(product34) + u128::from(product43); + let sum2: u128 = u128::from(product24) + u128::from(product33) + u128::from(product42); + let sum3: u128 = u128::from(product14) + + u128::from(product23) + + u128::from(product32) + + u128::from(product41); + let sum4: u128 = u128::from(product13) + u128::from(product22) + u128::from(product31); + let sum5: u128 = u128::from(product12) + u128::from(product21); + let sum6: u128 = u128::from(product11); + + let r0: u128 = + (sum0 & u128::from(WORD_FULL_MASK)) + ((sum1 & u128::from(WORD_LO_MASK)) << 32); + let r1: u128 = (sum0 >> 64) + + ((sum1 >> 32) & u128::from(WORD_FULL_MASK)) + + (sum2 & u128::from(WORD_FULL_MASK)) + + ((sum3 << 32) & u128::from(WORD_HI_MASK)); + + let lo = r0.wrapping_add(r1 << 64); + let hi = (r1 >> 64) + + (sum1 >> 96) + + (sum2 >> 64) + + (sum3 >> 32) + + sum4 + + (sum5 << 32) + + (sum6 << 64); + + u256([ + (lo & U128_LO_MASK) as u64, + ((lo >> 64) & U128_LO_MASK) as u64, + (hi & U128_LO_MASK) as u64, + ((hi >> 64) & U128_LO_MASK) as u64, + ]) + } + + fn widen_mul(self, rhs: Self) -> Self::D { + self.zero_widen_mul(rhs) + } +} + +impl HInt for i128 { + type D = i256; + + fn widen(self) -> Self::D { + let mut ret = self.unsigned().zero_widen().signed(); + if self.is_negative() { + ret.0[2] = u64::MAX; + ret.0[3] = u64::MAX; + } + ret + } + + fn zero_widen(self) -> Self::D { + self.unsigned().zero_widen().signed() + } + + fn zero_widen_mul(self, rhs: Self) -> Self::D { + self.unsigned().zero_widen_mul(rhs.unsigned()).signed() + } + + fn widen_mul(self, rhs: Self) -> Self::D { + unimplemented!() + // let mut res = self.zero_widen_mul(rhs); + // if self.is_negative() ^ rhs.is_negative() { + // // Sign extend as needed + // // for word in res.0.iter_mut().rev() { + // // let zeroes = word.leading_zeros(); + // // let leading = u64::MAX << (64 - zeroes); + // // *word |= leading; + // // if zeroes != 64 { + // // break; + // // } + // // } + // } + 
+ // res + } +} + +impl DInt for u256 { + type H = u128; + + fn lo(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[0].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[1].to_le_bytes()); + u128::from_le_bytes(tmp) + } + + fn hi(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[2].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[3].to_le_bytes()); + u128::from_le_bytes(tmp) + } +} + +impl DInt for i256 { + type H = i128; + + fn lo(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[0].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[1].to_le_bytes()); + i128::from_le_bytes(tmp) + } + + fn hi(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[2].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[3].to_le_bytes()); + i128::from_le_bytes(tmp) + } +} diff --git a/src/int/mod.rs b/src/int/mod.rs index 509f9fda..bb343d79 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -3,42 +3,29 @@ use core::ops; mod specialized_div_rem; pub mod addsub; +mod big; pub mod leading_zeros; pub mod mul; pub mod sdiv; pub mod shift; pub mod udiv; -pub use self::leading_zeros::__clzsi2; +pub use big::{i256, u256}; +pub use leading_zeros::__clzsi2; public_test_dep! { -/// Trait for some basic operations on integers -pub(crate) trait Int: - Copy +/// Minimal integer implementations needed on all integer types, including wide integers. +pub(crate) trait MinInt: Copy + core::fmt::Debug - + PartialEq - + PartialOrd - + ops::AddAssign - + ops::SubAssign - + ops::BitAndAssign - + ops::BitOrAssign - + ops::BitXorAssign - + ops::ShlAssign - + ops::ShrAssign - + ops::Add - + ops::Sub - + ops::Div - + ops::Shl - + ops::Shr + ops::BitOr - + ops::BitXor - + ops::BitAnd + ops::Not + + ops::Shl { + /// Type with the same width but other signedness - type OtherSign: Int; + type OtherSign: MinInt; /// Unsigned version of Self - type UnsignedInt: Int; + type UnsignedInt: MinInt; /// If `Self` is a signed integer const SIGNED: bool; @@ -50,13 +37,46 @@ pub(crate) trait Int: const ONE: Self; const MIN: Self; const MAX: Self; +} +} +public_test_dep! { +/// Trait for some basic operations on integers +pub(crate) trait Int: MinInt + + PartialEq + + PartialOrd + + ops::AddAssign + + ops::SubAssign + + ops::BitAndAssign + + ops::BitOrAssign + + ops::BitXorAssign + + ops::ShlAssign + + ops::ShrAssign + + ops::Add + + ops::Sub + + ops::Mul + + ops::Div + + ops::Shr + + ops::BitXor + + ops::BitAnd +{ /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing /// in `testcrate`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,111, /// 112,119,120,125,126,127]. - const FUZZ_LENGTHS: [u8; 20]; + const FUZZ_LENGTHS: [u8; 20] = make_fuzz_lengths(::BITS); + /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128. - const FUZZ_NUM: usize; + const FUZZ_NUM: usize = { + let log2 = (::BITS - 1).count_ones() as usize; + if log2 == 3 { + // case for u8 + 6 + } else { + // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate + // boundaries. 
+ 8 + (4 * (log2 - 4)) + } + }; fn unsigned(self) -> Self::UnsignedInt; fn from_unsigned(unsigned: Self::UnsignedInt) -> Self; @@ -83,74 +103,54 @@ pub(crate) trait Int: } } +pub(crate) const fn make_fuzz_lengths(bits: u32) -> [u8; 20] { + let mut v = [0u8; 20]; + v[0] = 0; + v[1] = 1; + v[2] = 2; // important for parity and the iX::MIN case when reversed + let mut i = 3; + + // No need for any more until the byte boundary, because there should be no algorithms + // that are sensitive to anything not next to byte boundaries after 2. We also scale + // in powers of two, which is important to prevent u128 corner tests from getting too + // big. + let mut l = 8; + loop { + if l >= ((bits / 2) as u8) { + break; + } + // get both sides of the byte boundary + v[i] = l - 1; + i += 1; + v[i] = l; + i += 1; + l *= 2; + } + + if bits != 8 { + // add the lower side of the middle boundary + v[i] = ((bits / 2) - 1) as u8; + i += 1; + } + + // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS + // boundary because of algorithms that split the high part up. We reverse the scaling + // as we go to Self::BITS. + let mid = i; + let mut j = 1; + loop { + v[i] = (bits as u8) - (v[mid - j]) - 1; + if j == mid { + break; + } + i += 1; + j += 1; + } + v +} + macro_rules! int_impl_common { ($ty:ty) => { - const BITS: u32 = ::ZERO.count_zeros(); - const SIGNED: bool = Self::MIN != Self::ZERO; - - const ZERO: Self = 0; - const ONE: Self = 1; - const MIN: Self = ::MIN; - const MAX: Self = ::MAX; - - const FUZZ_LENGTHS: [u8; 20] = { - let bits = ::BITS; - let mut v = [0u8; 20]; - v[0] = 0; - v[1] = 1; - v[2] = 2; // important for parity and the iX::MIN case when reversed - let mut i = 3; - // No need for any more until the byte boundary, because there should be no algorithms - // that are sensitive to anything not next to byte boundaries after 2. We also scale - // in powers of two, which is important to prevent u128 corner tests from getting too - // big. - let mut l = 8; - loop { - if l >= ((bits / 2) as u8) { - break; - } - // get both sides of the byte boundary - v[i] = l - 1; - i += 1; - v[i] = l; - i += 1; - l *= 2; - } - - if bits != 8 { - // add the lower side of the middle boundary - v[i] = ((bits / 2) - 1) as u8; - i += 1; - } - - // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS - // boundary because of algorithms that split the high part up. We reverse the scaling - // as we go to Self::BITS. - let mid = i; - let mut j = 1; - loop { - v[i] = (bits as u8) - (v[mid - j]) - 1; - if j == mid { - break; - } - i += 1; - j += 1; - } - v - }; - - const FUZZ_NUM: usize = { - let log2 = (::BITS - 1).count_ones() as usize; - if log2 == 3 { - // case for u8 - 6 - } else { - // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate - // boundaries. - 8 + (4 * (log2 - 4)) - } - }; - fn from_bool(b: bool) -> Self { b as $ty } @@ -203,10 +203,20 @@ macro_rules! int_impl_common { macro_rules! int_impl { ($ity:ty, $uty:ty) => { - impl Int for $uty { + impl MinInt for $uty { type OtherSign = $ity; type UnsignedInt = $uty; + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + } + + impl Int for $uty { fn unsigned(self) -> $uty { self } @@ -228,10 +238,20 @@ macro_rules! 
int_impl { int_impl_common!($uty); } - impl Int for $ity { + impl MinInt for $ity { type OtherSign = $uty; type UnsignedInt = $uty; + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + } + + impl Int for $ity { fn unsigned(self) -> $uty { self as $uty } @@ -259,18 +279,22 @@ int_impl!(i128, u128); public_test_dep! { /// Trait for integers twice the bit width of another integer. This is implemented for all /// primitives except for `u8`, because there is not a smaller primitive. -pub(crate) trait DInt: Int { +pub(crate) trait DInt: MinInt { /// Integer that is half the bit width of the integer this trait is implemented for - type H: HInt + Int; + type H: HInt; /// Returns the low half of `self` fn lo(self) -> Self::H; /// Returns the high half of `self` fn hi(self) -> Self::H; /// Returns the low and high halves of `self` as a tuple - fn lo_hi(self) -> (Self::H, Self::H); + fn lo_hi(self) -> (Self::H, Self::H) { + (self.lo(), self.hi()) + } /// Constructs an integer using lower and higher half parts - fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self; + fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self { + lo.zero_widen() | hi.widen_hi() + } } } @@ -279,7 +303,7 @@ public_test_dep! { /// primitives except for `u128`, because it there is not a larger primitive. pub(crate) trait HInt: Int { /// Integer that is double the bit width of the integer this trait is implemented for - type D: DInt + Int; + type D: DInt + MinInt; /// Widens (using default extension) the integer to have double bit width fn widen(self) -> Self::D; @@ -287,7 +311,9 @@ pub(crate) trait HInt: Int { /// around problems with associated type bounds (such as `Int`) being unstable fn zero_widen(self) -> Self::D; /// Widens the integer to have double bit width and shifts the integer into the higher bits - fn widen_hi(self) -> Self::D; + fn widen_hi(self) -> Self::D { + self.widen() << ::BITS + } /// Widening multiplication with zero widening. This cannot overflow. fn zero_widen_mul(self, rhs: Self) -> Self::D; /// Widening multiplication. This cannot overflow. @@ -305,13 +331,7 @@ macro_rules! impl_d_int { self as $X } fn hi(self) -> Self::H { - (self >> <$X as Int>::BITS) as $X - } - fn lo_hi(self) -> (Self::H, Self::H) { - (self.lo(), self.hi()) - } - fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self { - lo.zero_widen() | hi.widen_hi() + (self >> <$X as MinInt>::BITS) as $X } } )* @@ -330,9 +350,6 @@ macro_rules! 
impl_h_int { fn zero_widen(self) -> Self::D { (self as $uH) as $X } - fn widen_hi(self) -> Self::D { - (self as $X) << <$H as Int>::BITS - } fn zero_widen_mul(self, rhs: Self) -> Self::D { self.zero_widen().wrapping_mul(rhs.zero_widen()) } diff --git a/src/int/mul.rs b/src/int/mul.rs index 2538e2f4..e0093a72 100644 --- a/src/int/mul.rs +++ b/src/int/mul.rs @@ -1,6 +1,6 @@ use crate::int::{DInt, HInt, Int}; -trait Mul: DInt +trait Mul: DInt + Int where Self::H: DInt, { @@ -30,7 +30,7 @@ where impl Mul for u64 {} impl Mul for i128 {} -pub(crate) trait UMulo: Int + DInt { +pub(crate) trait UMulo: DInt + Int { fn mulo(self, rhs: Self) -> (Self, bool) { match (self.hi().is_zero(), rhs.hi().is_zero()) { // overflow is guaranteed diff --git a/src/int/shift.rs b/src/int/shift.rs index dbd04018..31727298 100644 --- a/src/int/shift.rs +++ b/src/int/shift.rs @@ -1,4 +1,4 @@ -use crate::int::{DInt, HInt, Int}; +use crate::int::{DInt, HInt, Int, MinInt}; trait Ashl: DInt { /// Returns `a << b`, requires `b < Self::BITS` diff --git a/src/lib.rs b/src/lib.rs index ea376631..7c0b5072 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,6 +44,21 @@ extern crate core; #[macro_use] mod macros; +macro_rules! vdbg { + ($val:expr $(,)?) => { + // Use of `match` here is intentional because it affects the lifetimes + // of temporaries - https://stackoverflow.com/a/48732525/1063961 + match $val { + tmp => { + $crate::write_val( + tmp, + concat!("[", file!(), ":", line!(), "] ", stringify!($val), " = "), + ); + tmp + } + } + }; +} pub mod float; pub mod int; @@ -80,3 +95,45 @@ pub mod x86; pub mod x86_64; pub mod probestack; + +// Hacky way to print values since we don't have `std` for the crate +mod val_print { + extern "C" { + fn print_callback(val_ptr: *const u8, val_sz: usize, name_ptr: *const u8, name_len: usize); + } + + pub fn write_val(val: T, name: &str) { + unsafe { + print_callback( + core::ptr::addr_of!(val).cast(), + core::mem::size_of::(), + name.as_ptr(), + name.len(), + ) + }; + } +} + +pub use val_print::write_val; + +#[macro_export] +macro_rules! set_val_callback { + () => { + #[no_mangle] + unsafe extern "C" fn print_callback( + val_ptr: *const u8, + val_sz: usize, + name_ptr: *const u8, + name_len: usize, + ) { + let val = unsafe { core::slice::from_raw_parts(val_ptr, val_sz) }; + let name_slice = unsafe { core::slice::from_raw_parts(name_ptr, name_len) }; + let name = core::str::from_utf8(name_slice).unwrap(); + print!("{}: 0x", name); + for byte in val.iter().rev() { + print!("{:02x}", byte); + } + println!(); + } + }; +} diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml index 6ff3fde1..6f771181 100644 --- a/testcrate/Cargo.toml +++ b/testcrate/Cargo.toml @@ -33,3 +33,5 @@ no-asm = ["compiler_builtins/no-asm"] no-f16-f128 = ["compiler_builtins/no-f16-f128"] mem = ["compiler_builtins/mem"] mangled-names = ["compiler_builtins/mangled-names"] +# Skip tests that rely on f128 symbols being available on the system +no-sys-f128 = [] diff --git a/testcrate/benches/float.rs b/testcrate/benches/float.rs new file mode 100644 index 00000000..a12300b3 --- /dev/null +++ b/testcrate/benches/float.rs @@ -0,0 +1,90 @@ +#![feature(test, f16, f128)] + +extern crate test; +use core::hint::black_box; +use test::Bencher; + +extern crate compiler_builtins; + +macro_rules! 
test_values { + ($ty:ty) => { + &[ + <$ty>::MIN, + <$ty>::MAX, + <$ty>::NAN, + <$ty>::INFINITY, + <$ty>::NEG_INFINITY, + <$ty>::MIN_POSITIVE, + 0.0, + 1.0, + -1.0, + ] + }; +} + +fn combine2(vals: &[T]) -> Vec<(T, T)> { + let mut ret = Vec::new(); + for x in vals.iter().copied() { + for y in vals.iter().copied() { + ret.push((x, y)); + } + } + ret +} + +macro_rules! test_iter { + ($b:ident, $ty:ty, $fn:path) => {{ + let vals = combine2(test_values!($ty)); + let iter_loop = || { + for (a, b) in vals.iter().copied() { + black_box($fn(black_box(a), black_box(b))); + } + }; + + // Warmup + for _ in 0..1000 { + iter_loop(); + } + + $b.iter(iter_loop); + }}; +} + +macro_rules! foobar { + ($($ty:ty, $rust_fn:ident, $builtin_fn:ident, $mod:ident::$sym:ident);* $(;)?) => { + $( + #[bench] + fn $rust_fn(b: &mut Bencher) { + // Equalize with the builtin function which is called separately + #[inline(never)] + fn inline_wrapper(a: $ty, b: $ty) -> $ty { + compiler_builtins::float::$mod::$sym(black_box(a), black_box(b)) + } + + test_iter!(b, $ty, inline_wrapper); + } + + #[bench] + fn $builtin_fn(b: &mut Bencher) { + extern "C" { + fn $sym(a: $ty, b: $ty) -> $ty; + } + + unsafe { + test_iter!(b, $ty, $sym); + } + } + )* + }; +} + +foobar! { + f32, addsf3_rust, addsf3_builtin, add::__addsf3; + f32, subsf3_rust, subsf3_builtin, sub::__subsf3; + f32, mulsf3_rust, mulsf3_builtin, mul::__mulsf3; + f32, divsf3_rust, divsf3_builtin, div::__divsf3; + f64, adddf3_rust, adddf3_builtin, add::__adddf3; + f64, subdf3_rust, subdf3_builtin, sub::__subdf3; + f64, muldf3_rust, muldf3_builtin, mul::__muldf3; + f64, divdf3_rust, divdf3_builtin, div::__divdf3; +} diff --git a/testcrate/build.rs b/testcrate/build.rs new file mode 100644 index 00000000..86c97af1 --- /dev/null +++ b/testcrate/build.rs @@ -0,0 +1,15 @@ +use std::env; + +fn main() { + let target = env::var("TARGET").unwrap(); + + // These platforms do not have f128 symbols available in their system libraries, so + // skip related tests. + if target.starts_with("arm-") + || target.contains("apple-darwin") + || target.contains("windows-msvc") + { + println!("cargo:warning=skipping `f128` tests; system does not have relevant symbols"); + println!("cargo:rustc-cfg=feature=\"no-sys-f128\""); + } +} diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs index 9bd155f6..13abf459 100644 --- a/testcrate/src/lib.rs +++ b/testcrate/src/lib.rs @@ -15,7 +15,7 @@ #![no_std] use compiler_builtins::float::Float; -use compiler_builtins::int::Int; +use compiler_builtins::int::{Int, MinInt}; use rand_xoshiro::rand_core::{RngCore, SeedableRng}; use rand_xoshiro::Xoshiro128StarStar; @@ -101,7 +101,10 @@ macro_rules! edge_cases { /// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find /// edge cases, followed by a more random fuzzer that runs `n` times. -pub fn fuzz(n: u32, mut f: F) { +pub fn fuzz(n: u32, mut f: F) +where + ::UnsignedInt: Int, +{ // edge case tester. Calls `f` 210 times for u128. // zero gets skipped by the loop f(I::ZERO); @@ -111,7 +114,7 @@ pub fn fuzz(n: u32, mut f: F) { // random fuzzer let mut rng = Xoshiro128StarStar::seed_from_u64(0); - let mut x: I = Int::ZERO; + let mut x: I = MinInt::ZERO; for _ in 0..n { fuzz_step(&mut rng, &mut x); f(x) @@ -119,7 +122,10 @@ pub fn fuzz(n: u32, mut f: F) { } /// The same as `fuzz`, except `f` has two inputs. -pub fn fuzz_2(n: u32, f: F) { +pub fn fuzz_2(n: u32, f: F) +where + ::UnsignedInt: Int, +{ // Check cases where the first and second inputs are zero. 
Both call `f` 210 times for `u128`. edge_cases!(I, case, { f(I::ZERO, case); @@ -150,10 +156,10 @@ pub fn fuzz_shift(f: F) { // Shift functions are very simple and do not need anything other than shifting a small // set of random patterns for every fuzz length. let mut rng = Xoshiro128StarStar::seed_from_u64(0); - let mut x: I = Int::ZERO; + let mut x: I = MinInt::ZERO; for i in 0..I::FUZZ_NUM { fuzz_step(&mut rng, &mut x); - f(x, Int::ZERO); + f(x, MinInt::ZERO); f(x, I::FUZZ_LENGTHS[i] as u32); } } diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs index da7684ec..343e47ae 100644 --- a/testcrate/tests/addsub.rs +++ b/testcrate/tests/addsub.rs @@ -1,4 +1,6 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] use testcrate::*; @@ -80,13 +82,13 @@ macro_rules! float_sum { let sub1: $f = $fn_sub(x, y); if !Float::eq_repr(add0, add1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn_add), x, y, add0, add1 ); } if !Float::eq_repr(sub0, sub1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{:?}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn_sub), x, y, sub0, sub1 ); } @@ -110,6 +112,16 @@ fn float_addsub() { ); } +#[test] +#[cfg(not(feature = "no-sys-f128"))] +fn float_addsub_f128() { + use compiler_builtins::float::{add::__addtf3, sub::__subtf3, Float}; + + float_sum!( + f128, __addtf3, __subtf3; + ); +} + #[cfg(target_arch = "arm")] #[test] fn float_addsub_arm() { diff --git a/testcrate/tests/big.rs b/testcrate/tests/big.rs new file mode 100644 index 00000000..abf7d77c --- /dev/null +++ b/testcrate/tests/big.rs @@ -0,0 +1,104 @@ +use compiler_builtins::int::{i256, u256, HInt, Int, MinInt}; + +const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff; + +/// Print a `u256` as hex since we can't add format implementations +fn hexu(v: u256) -> String { + format!( + "0x{:016x}{:016x}{:016x}{:016x}", + v.0[3], v.0[2], v.0[1], v.0[0] + ) +} + +fn hexi(v: i256) -> String { + hexu(v.unsigned()) +} + +#[test] +fn widen_u128() { + assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0])); + assert_eq!( + LOHI_SPLIT.widen(), + u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0]) + ); +} + +#[test] +fn widen_i128() { + assert_eq!((-1i128).widen(), u256::MAX.signed()); + assert_eq!( + (LOHI_SPLIT as i128).widen(), + i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX]) + ); + assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen()); +} + +#[test] +fn widen_mul_u128() { + let tests = [ + (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])), + (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])), + // TODO: https://github.com/rust-lang/compiler-builtins/pull/587#issuecomment-2060543566 + // (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])), + (u128::MIN, u128::MIN, u256::ZERO), + (1234, 0, u256::ZERO), + (0, 1234, u256::ZERO), + ]; + + let mut errors = Vec::new(); + for (i, (a, b, exp)) in tests.iter().copied().enumerate() { + let res = a.widen_mul(b); + let res_z = a.zero_widen_mul(b); + assert_eq!(res, res_z); + if res != exp { + errors.push((i, a, b, exp, res)); + } + } + + for (i, a, b, exp, res) in &errors { + eprintln!( + "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", + hexu(*exp), + hexu(*res) + ); + } + assert!(errors.is_empty()); +} + +// #[test] +// fn widen_mul_i128() { +// let tests = [ +// ( +// i128::MAX / 2, +// 2_i128, +// i256([u64::MAX - 1, u64::MAX >> 1, 0, 0]), +// ), +// (i128::MAX, 2_i128, i256([u64::MAX - 1, u64::MAX, 0, 0])), +// (i128::MIN, 
2_i128, i256([0, 0, u64::MAX, u64::MAX])), +// ( +// i128::MAX, +// i128::MAX, +// i256([1, 0, u64::MAX - 1, u64::MAX >> 2]), +// ), +// (i128::MAX, i128::MIN, i256([0, 0, 0, 0b11 << 62])), +// (i128::MIN, i128::MIN, i256([0, 0, 0, 0])), +// (1234, 0, i256::ZERO), +// (0, 1234, i256::ZERO), +// (-1234, 0, i256::ZERO), +// (0, -1234, i256::ZERO), +// ]; + +// let mut errors = Vec::new(); +// for (i, (a, b, exp)) in tests.iter().copied().enumerate() { +// let res = a.widen_mul(b); +// // TODO check zero widen mul +// if res != exp { +// errors.push((i, a, b, exp, res)); +// } +// } + +// for (i, a, b, exp, res) in &errors { +// eprintln!("FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", hexi(*exp), hexi(*res)); +// } +// assert!(errors.is_empty()); +// } diff --git a/testcrate/tests/cmp.rs b/testcrate/tests/cmp.rs index 5c10a560..7ad90a7c 100644 --- a/testcrate/tests/cmp.rs +++ b/testcrate/tests/cmp.rs @@ -1,4 +1,6 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] use testcrate::*; @@ -16,7 +18,10 @@ macro_rules! cmp { }; let cmp1 = $fn($x, $y); if cmp0 != cmp1 { - panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1); + panic!( + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", + stringify!($fn_builtins), $x, $y, cmp0, cmp1 + ); } )* }; @@ -55,6 +60,26 @@ fn float_comparisons() { }); } +#[cfg(not(feature = "no-sys-f128"))] +#[test] +fn float_comparisons_f128() { + use compiler_builtins::float::cmp::{ + __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2, + }; + + fuzz_float_2(N, |x: f128, y: f128| { + assert_eq!(__unordtf2(x, y) != 0, x.is_nan() || y.is_nan()); + cmp!(x, y, + 1, __lttf2; + 1, __letf2; + 1, __eqtf2; + -1, __getf2; + -1, __gttf2; + 1, __netf2; + ); + }); +} + macro_rules! cmp2 { ($x:ident, $y:ident, $($unordered_val:expr, $fn_std:expr, $fn_builtins:ident);*;) => { $( diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs index de3bd9be..07bd233c 100644 --- a/testcrate/tests/div_rem.rs +++ b/testcrate/tests/div_rem.rs @@ -1,9 +1,13 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4}; use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc}; use testcrate::*; +compiler_builtins::set_val_callback!(); + // Division algorithms have by far the nastiest and largest number of edge cases, and experience shows // that sometimes 100_000 iterations of the random fuzzer is needed. @@ -107,12 +111,15 @@ macro_rules! float { ($($i:ty, $fn:ident);*;) => { $( fuzz_float_2(N, |x: $i, y: $i| { + dbg!(x, y); let quo0 = x / y; + dbg!(quo0); let quo1: $i = $fn(x, y); + dbg!(quo1); #[cfg(not(target_arch = "arm"))] if !Float::eq_repr(quo0, quo1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn), x, y, quo0, quo1 ); } @@ -122,7 +129,7 @@ macro_rules! 
float { if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) { if !Float::eq_repr(quo0, quo1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn), x, y, quo0, quo1 ); } @@ -146,6 +153,24 @@ fn float_div() { ); } +#[cfg(not(feature = "no-sys-f128"))] +#[test] +fn float_div_f128() { + use compiler_builtins::float::{div::__divtf3, Float}; + + float!( + f128, __divtf3; + ); +} + +#[test] +fn div_failures() { + use compiler_builtins::float::{div::__divtf3, Float}; + let a = f128::from_bits(0x1); + let b = f128::from_bits(0x1); + dbg!(__divtf3(a, b)); +} + #[cfg(target_arch = "arm")] #[test] fn float_div_arm() { diff --git a/testcrate/tests/mul.rs b/testcrate/tests/mul.rs index 819f06ca..446d5c46 100644 --- a/testcrate/tests/mul.rs +++ b/testcrate/tests/mul.rs @@ -1,4 +1,6 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] use testcrate::*; @@ -91,7 +93,7 @@ macro_rules! float_mul { if !(Float::is_subnormal(mul0) || Float::is_subnormal(mul1)) { if !Float::eq_repr(mul0, mul1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn), x, y, mul0, mul1 ); } @@ -115,6 +117,16 @@ fn float_mul() { ); } +#[test] +#[cfg(not(feature = "no-sys-f128"))] +fn float_mul_f128() { + use compiler_builtins::float::{mul::__multf3, Float}; + + float_mul!( + f128, __multf3; + ); +} + #[cfg(target_arch = "arm")] #[test] fn float_mul_arm() {