Skip to content

Commit

Permalink
Use inline asm! for x86 DIV
Browse files Browse the repository at this point in the history
  • Loading branch information
cuviper committed May 4, 2024
1 parent 636317c commit ecec8c4
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 7 deletions.
14 changes: 10 additions & 4 deletions src/biguint/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use super::{biguint_from_vec, BigUint, ToBigUint};

use super::addition::add2;
use super::division::div_rem_digit;
use super::division::{div_rem_digit, FAST_DIV_WIDE};
use super::multiplication::mac_with_carry;

use crate::big_digit::{self, BigDigit};
Expand Down Expand Up @@ -688,16 +688,22 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {

let mut digits = u.clone();

let (base, power) = get_half_radix_base(radix);
// X86 DIV can quickly divide by a full digit, otherwise we choose a divisor
// that's suitable for `div_half` to avoid slow `DoubleBigDigit` division.
let (base, power) = if FAST_DIV_WIDE {
get_radix_base(radix)
} else {
get_half_radix_base(radix)
};
let radix = radix as BigDigit;

// For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the
// performance. We can mitigate this by dividing into chunks of a larger base first.
// The threshold for this was chosen by anecdotal performance measurements to
// approximate where this starts to make a noticeable difference.
if digits.data.len() >= 64 {
let mut big_base = BigUint::from(base * base);
let mut big_power = 2usize;
let mut big_base = BigUint::from(base);
let mut big_power = 1usize;

// Choose a target base length near √n.
let target_len = digits.data.len().sqrt();
Expand Down
37 changes: 34 additions & 3 deletions src/biguint/division.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@ use core::ops::{Div, DivAssign, Rem, RemAssign};
use num_integer::Integer;
use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero};

/// Whether this target has a fast full-width `div_wide`.
///
/// Evaluates to `true` when compiling for x86 or x86_64, where `div_wide` is implemented with
/// the native `div` instruction. Callers use this flag to divide by a full digit directly
/// instead of taking the `div_half` path.
pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));

/// Divide a two-digit numerator by a one-digit divisor, returning the quotient and remainder.
///
/// Note: the caller must ensure that both the quotient and remainder will fit into a single digit.
/// This is _not_ true for an arbitrary numerator/denominator.
///
/// (This function also matches what the x86 divide instruction does).
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
#[inline]
fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
debug_assert!(hi < divisor);
Expand All @@ -25,6 +28,34 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi
((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
}

/// Divide the two-digit value `hi:lo` by `divisor` using the hardware `div` instruction
/// available on x86 and x86_64, returning `(quotient, remainder)`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline]
fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
    // `hi < divisor` rules out both causes of a #DE fault: a zero divisor, and a quotient that
    // does not fit in a single register. Without this check (i.e. in release builds) such inputs
    // would surface as a target-specific fault like SIGFPE. Our few `div_wide` callers should
    // never pass inputs that violate it.
    debug_assert!(hi < divisor);

    let quotient: BigDigit;
    let remainder: BigDigit;

    // NOTE(review): the divisor placeholder uses the default register formatting for the
    // target; confirm `BigDigit` always matches the native register width on these targets.

    // SAFETY: `div` touches registers only. It reads its explicit operand as the divisor and
    // implicitly reads the dividend from RDX:RAX (or EDX:EAX), then implicitly writes the
    // quotient to RAX (or EAX) and the remainder to RDX (or EDX). No memory is accessed, and
    // we do not claim that flags are preserved.
    unsafe {
        core::arch::asm!(
            "div {divisor}",
            divisor = in(reg) divisor,
            inout("ax") lo => quotient,
            inout("dx") hi => remainder,
            options(pure, nomem, nostack),
        );
    }

    (quotient, remainder)
}

/// For small divisors, we can divide without promoting to `DoubleBigDigit` by
/// using half-size pieces of digit, like long-division.
#[inline]
Expand All @@ -45,7 +76,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit)

let mut rem = 0;

if b <= big_digit::HALF {
if !FAST_DIV_WIDE && b <= big_digit::HALF {
for d in a.data.iter_mut().rev() {
let (q, r) = div_half(rem, *d, b);
*d = q;
Expand All @@ -70,7 +101,7 @@ fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {

let mut rem = 0;

if b <= big_digit::HALF {
if !FAST_DIV_WIDE && b <= big_digit::HALF {
for &digit in a.data.iter().rev() {
let (_, r) = div_half(rem, digit, b);
rem = r;
Expand Down Expand Up @@ -230,7 +261,7 @@ fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
let mut a0 = 0;

// [b1, b0] are the two most significant digits of the divisor. They never change.
let b0 = *b.last().unwrap();
let b0 = b[b.len() - 1];
let b1 = b[b.len() - 2];

let q_len = a.data.len() - b.len() + 1;
Expand Down

0 comments on commit ecec8c4

Please sign in to comment.