Merge pull request #193 from jpflori/powm

powmod functions from GMP
wbhart · Feb 13, 2017 · 8f7df12 · 8f7df12
2 parents 57c6928 + feb4d44
commit 8f7df12
Show file tree

Hide file tree

Showing 21 changed files with 3,360 additions and 547 deletions.
diff --git a/configure.ac b/configure.ac
@@ -1281,13 +1281,13 @@ case $host in
           penryn-*-*) 
             path_64="x86_64/core2/penryn x86_64/core2 x86_64" ;;
           nehalem-*-*) 
-            path_64="x86_64/nehalem x86_64" ;;
+            path_64="x86_64/nehalem x86_64/core2 x86_64" ;;
           westmere-*-*) 
-            path_64="x86_64/nehalem/westmere x86_64/nehalem x86_64" ;;
+            path_64="x86_64/nehalem/westmere x86_64/nehalem x86_64/core2 x86_64" ;;
           sandybridge-*-*) 
-            path_64="x86_64/sandybridge x86_64" ;;
+            path_64="x86_64/sandybridge x86_64/nehalem x86_64/core2 x86_64" ;;
           ivybridge-*-*)
-            path_64="x86_64/sandybridge/ivybridge x86_64/sandybridge x86_64" ;;
+            path_64="x86_64/sandybridge/ivybridge x86_64/sandybridge x86_64/nehalem x86_64/core2 x86_64" ;;
           haswell-*-*)
             path_64="x86_64/haswell/avx x86_64/haswell x86_64/sandybridge x86_64" ;;
           skylake-*-*)
@@ -2199,11 +2199,13 @@ gmp_mpn_functions="$extra_functions					   \
   mod_1_1 mod_1_2 mod_1_3 tdiv_q mp_bases fib_table			\
   mulmid_basecase mulmid mulmid_n toom42_mulmid	mulmod_bexpp1 mulmod_2expm1 \
   mulmod_2expp1_basecase mul_fft \
-  mul mul_n mul_basecase sqr_basecase random random2 pow_1	   \
+  mul mul_n mul_basecase sqr_basecase random random2 	   \
+  pow_1 powlo powm binvert \
   urandomb urandomm randomb rrandom invert \	
   rootrem sizeinbase sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfect_square_p \
   bdivmod gcd gcd_1 gcdext tdiv_qr jacobi_base jacobi jacobi_2 get_d  \
-  mullow_n mulhigh_n mullow_n_basecase mullow_basecase redc_1 redc_2	\
+  mullow_n mulhigh_n mullow_n_basecase mullow_basecase \
+  redc_1 redc_2 redc_n \
   sb_divappr_q toom3_mul toom3_mul_n toom4_mul toom4_mul_n \
   dc_div_q dc_divappr_q sb_div_q sb_div_qr dc_div_qr dc_div_qr_n inv_divappr_q_n \
   inv_divappr_q inv_div_q inv_div_qr inv_div_qr_n rootrem_basecase \

diff --git a/gmp-h.in b/gmp-h.in
@@ -1642,6 +1642,7 @@ __GMP_DECLSPEC void mpn_redc_1 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_
 
 #define mpn_redc_2 __MPN(redc_2)
 __GMP_DECLSPEC void mpn_redc_2 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr));
+
 #define mpn_redc_n __MPN(redc_n)
 __GMP_DECLSPEC void mpn_redc_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr));
 

diff --git a/gmp-impl.h b/gmp-impl.h
@@ -1080,6 +1080,16 @@ __GMP_DECLSPEC int mpn_is_invert __GMP_PROTO ((mp_srcptr, mp_srcptr, mp_size_t))
 #define mpn_invert_trunc __MPN(invert_trunc)
 __GMP_DECLSPEC void mpn_invert_trunc __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr));
 
+#define   mpn_binvert __MPN(binvert)
+__GMP_DECLSPEC void      mpn_binvert __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
+#define   mpn_binvert_itch __MPN(binvert_itch)
+__GMP_DECLSPEC mp_size_t mpn_binvert_itch __GMP_PROTO ((mp_size_t)) ATTRIBUTE_CONST;
+
+#define   mpn_powm __MPN(powm)
+__GMP_DECLSPEC void      mpn_powm __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr));
+#define   mpn_powlo __MPN(powlo)
+__GMP_DECLSPEC void      mpn_powlo __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_size_t, mp_ptr));
+
 #ifndef mpn_divrem_euclidean_qr_1    /* if not done with cpuvec in a fat binary */
 #define mpn_divrem_euclidean_qr_1 __MPN(divrem_euclidean_qr_1)
 __GMP_DECLSPEC mp_limb_t mpn_divrem_euclidean_qr_1 __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t,mp_limb_t));
@@ -2014,6 +2024,23 @@ __GMP_DECLSPEC mp_limb_t gmp_primesieve (mp_ptr, mp_limb_t);
 #define FFT_MULMOD_2EXPP1_CUTOFF 128
 #endif
 
+#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
+
+#ifndef REDC_1_TO_REDC_2_THRESHOLD
+#define REDC_1_TO_REDC_2_THRESHOLD       15
+#endif
+#ifndef REDC_2_TO_REDC_N_THRESHOLD
+#define REDC_2_TO_REDC_N_THRESHOLD      100
+#endif
+
+#else
+
+#ifndef REDC_1_TO_REDC_N_THRESHOLD
+#define REDC_1_TO_REDC_N_THRESHOLD      100
+#endif
+
+#endif /* HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 */
+
 #ifndef DC_DIV_QR_THRESHOLD
 #define DC_DIV_QR_THRESHOLD    (3 * MUL_KARATSUBA_THRESHOLD)
 #endif
@@ -2714,6 +2741,17 @@ __GMP_DECLSPEC void mpz_out_raw_m __GMP_PROTO((mpir_out_ptr, mpz_srcptr));
       }                                                                 \
   } while (0)
 
+#define MPN_SIZEINBASE_2EXP(result, ptr, size, base2exp)			\
+  do {										\
+    int          __cnt;								\
+    mp_bitcnt_t  __totbits;							\
+    ASSERT ((size) > 0);							\
+    ASSERT ((ptr)[(size)-1] != 0);						\
+    count_leading_zeros (__cnt, (ptr)[(size)-1]);				\
+    __totbits = (mp_bitcnt_t) (size) * GMP_NUMB_BITS - (__cnt - GMP_NAIL_BITS);	\
+    (result) = (__totbits + (base2exp)-1) / (base2exp);				\
+  } while (0)
+
 /* eliminate mp_bases lookups for base==16 */
 #define MPN_SIZEINBASE_16(result, ptr, size)                            \
   do {                                                                  \
@@ -4336,6 +4374,10 @@ extern mp_size_t                     mullow_dc_threshold;
 #define MULLOW_MUL_THRESHOLD         mullow_mul_threshold
 extern mp_size_t                     mullow_mul_threshold;
 
+#undef  MULMID_TOOM42_THRESHOLD
+#define MULMID_TOOM42_THRESHOLD      mulmid_toom42_threshold
+extern mp_size_t                     mulmid_toom42_threshold;
+
 #undef  MULHIGH_BASECASE_THRESHOLD
 #define MULHIGH_BASECASE_THRESHOLD   mulhigh_basecase_threshold
 extern mp_size_t                     mulhigh_basecase_threshold;
@@ -4394,11 +4436,6 @@ extern mp_size_t                     dc_divappr_q_threshold;
 #define INV_DIVAPPR_Q_THRESHOLD      inv_divappr_q_threshold
 extern mp_size_t                     inv_divappr_q_threshold;
 
-
-#undef  POWM_THRESHOLD
-#define POWM_THRESHOLD               powm_threshold
-extern mp_size_t                     powm_threshold;
-
 #undef  ROOTREM_THRESHOLD
 #define ROOTREM_THRESHOLD            rootrem_threshold
 extern mp_size_t                     rootrem_threshold;
@@ -4427,6 +4464,18 @@ extern mp_size_t                     mod_1_2_threshold;
 #define MOD_1_3_THRESHOLD            mod_1_3_threshold
 extern mp_size_t                     mod_1_3_threshold;
 
+#undef	REDC_1_TO_REDC_2_THRESHOLD
+#define REDC_1_TO_REDC_2_THRESHOLD	redc_1_to_redc_2_threshold
+extern mp_size_t			redc_1_to_redc_2_threshold;
+
+#undef	REDC_2_TO_REDC_N_THRESHOLD
+#define REDC_2_TO_REDC_N_THRESHOLD	redc_2_to_redc_n_threshold
+extern mp_size_t			redc_2_to_redc_n_threshold;
+
+#undef	REDC_1_TO_REDC_N_THRESHOLD
+#define REDC_1_TO_REDC_N_THRESHOLD	redc_1_to_redc_n_threshold
+extern mp_size_t			redc_1_to_redc_n_threshold;
+
 #undef	MATRIX22_STRASSEN_THRESHOLD
 #define MATRIX22_STRASSEN_THRESHOLD	matrix22_strassen_threshold
 extern mp_size_t			matrix22_strassen_threshold;

diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4
@@ -1359,6 +1359,8 @@ define_mpn(preinv_divrem_1)
 define_mpn(preinv_mod_1)
 define_mpn(nand_n)
 define_mpn(nior_n)
+define_mpn(powm)
+define_mpn(powlo)
 define_mpn(random)
 define_mpn(random2)
 define_mpn(redc_1)

diff --git a/mpn/generic/binvert.c b/mpn/generic/binvert.c
@@ -0,0 +1,125 @@
+/* Compute {up,n}^(-1) mod B^n.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright (C) 2004-2007, 2009, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "mpir.h"
+#include "gmp-impl.h"
+
+
+/*
+  r[k+1] = r[k] - r[k] * (u*r[k] - 1)
+  r[k+1] = r[k] + r[k] - r[k]*(u*r[k])
+*/
+
+#if TUNE_PROGRAM_BUILD
+#define NPOWS \
+ ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
+#else
+#define NPOWS \
+ ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (BINV_NEWTON_THRESHOLD))
+#endif
+
+mp_size_t
+mpn_binvert_itch (mp_size_t n)
+{
+  mp_size_t itch_local = mpn_mulmod_bnm1_next_size (n);
+  mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, n, (n + 1) >> 1);
+  return itch_local + itch_out;
+}
+
+void
+mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
+{
+  mp_ptr xp;
+  mp_size_t rn, newrn;
+  mp_size_t sizes[NPOWS], *sizp;
+  mp_limb_t di;
+
+  /* Compute the computation precisions from highest to lowest, leaving the
+     base case size in 'rn'.  */
+  sizp = sizes;
+  for (rn = n; ABOVE_THRESHOLD (rn, BINV_NEWTON_THRESHOLD); rn = (rn + 1) >> 1)
+    *sizp++ = rn;
+
+  xp = scratch;
+
+  /* Compute a base value of rn limbs.  */
+  MPN_ZERO (xp, rn);
+  xp[0] = 1;
+  /* JPF: GMP goes the other way around and has renamed to binvert_limb */
+  modlimb_invert (di, up[0]);
+  /* JPF: GMP vs MPIR diff; MPIR returns a two limbs overflow and use sub rather than add */
+  if (BELOW_THRESHOLD (rn, DC_BDIV_Q_THRESHOLD))
+    mpn_sb_bdiv_q (rp, xp+rn, xp, rn, up, rn, di);
+  else
+    mpn_dc_bdiv_q (rp, xp, rn, up, rn, di);
+
+  /* Use Newton iterations to get the desired precision.  */
+  if (rn == n)
+    return;
+  newrn = *--sizp;
+  for (; newrn < n;)
+    {
+      mp_size_t m;
+
+      /* X <- UR. */
+      m = mpn_mulmod_bnm1_next_size (newrn);
+      mpn_mulmod_bnm1 (xp, m, up, newrn, rp, rn, xp + m);
+      mpn_sub_1 (xp + m, xp, rn - (m - newrn), 1);
+
+      /* R = R(X/B^rn) */
+      mpn_mullow_n (rp + rn, rp, xp + rn, newrn - rn);
+      mpn_neg (rp + rn, rp + rn, newrn - rn);
+
+      rn = newrn;
+      newrn = *--sizp;
+    }
+  /* Last iteration would overflow in the mullow call */
+    {
+      mp_size_t m;
+
+      /* X <- UR. */
+      m = mpn_mulmod_bnm1_next_size (newrn);
+      mpn_mulmod_bnm1 (xp, m, up, newrn, rp, rn, xp + m);
+      mpn_sub_1 (xp + m, xp, rn - (m - newrn), 1);
+
+      /* R = R(X/B^rn) */
+      mpn_mullow_n (xp + newrn, rp, xp + rn, newrn - rn); /* JPF: would overflow */
+      /* At most we need 2*(newrn - rn) limbs at xp + newrn, so need 3*newrn - 2*rn */
+      /* As 2rn > newrn, and n == newrn, this gives at max 2*n for xp */
+      /* which we already ensure */
+      mpn_neg (rp + rn, xp + newrn, newrn - rn);
+    }
+}