Commit 4c684ff

WIP: Simulated int128 type.

roconnor-blockstream committed Oct 26, 2021
1 parent 10f9bd8 commit 4c684ff
Showing 2 changed files with 233 additions and 75 deletions.
131 changes: 131 additions & 0 deletions src/int128.h
@@ -0,0 +1,131 @@
#ifndef SECP256K1_INT128_H
#define SECP256K1_INT128_H

#include <stdint.h>
#include "util.h" /* for SECP256K1_INLINE */

/* WIP: uncomment this block, together with the matching commented block at
 * the bottom of this file, to force the simulated implementation even when
 * a native 128-bit type is available (useful for testing):
#ifdef UINT128_MAX
#undef UINT128_MAX
*/

#if defined(UINT128_MAX)
typedef uint128_t secp256k1_uint128;
typedef int128_t secp256k1_int128;
#else
typedef struct {
    uint64_t lo;
    uint64_t hi;
} secp256k1_uint128;

typedef secp256k1_uint128 secp256k1_int128;
#endif

/* Low 32 bits of a (u)int64_t as a uint64_t. */
#define LO32(x) ((uint64_t)(x) & 0xffffffff)
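/* High 32 bits of a (u)int64_t; note this is an arithmetic shift when x is signed. */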
#define HI32(x) ((x) >> 32)
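/* For example, LO32(0x123456789abcdef0) == 0x9abcdef0 and
 * HI32((uint64_t)0x123456789abcdef0) == 0x12345678. */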

#if defined(_M_X64) || defined(_M_ARM64) || defined(_WIN64) /* MSVC */
#include <intrin.h>
#define secp256k1_umulh __umulh
#define secp256k1_mulh __mulh
#else
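/* Simulated high half of the unsigned 64x64->128 bit product, built from four
 * 32x32->64 bit partial products. The sum t3 cannot overflow: it is at most
 * (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1. */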
static SECP256K1_INLINE uint64_t secp256k1_umulh(uint64_t a, uint64_t b) {
#if defined(UINT128_MAX)
    return (uint64_t)(((uint128_t)a * b) >> 64);
#else
    uint64_t t1 = LO32(a)*LO32(b);
    uint64_t t2 = HI32(a)*LO32(b);
    uint64_t t3 = LO32(a)*HI32(b) + HI32(t1) + LO32(t2);
    return HI32(a)*HI32(b) + HI32(t2) + HI32(t3);
#endif
}

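/* Simulated high half of the signed 64x64->128 bit product. Each mixed
 * 32x64-bit partial product fits in an int64_t, and a >> 32 relies on
 * arithmetic right shift of negative values, as the library does elsewhere. */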
static SECP256K1_INLINE int64_t secp256k1_mulh(int64_t a, int64_t b) {
#if defined(UINT128_MAX)
    return (int64_t)(((int128_t)a * b) >> 64);
#else
    uint64_t t1 = (uint64_t)(uint32_t)a * (uint32_t)b;
    int64_t t2 = (a >> 32) * (uint32_t)b;
    int64_t t3 = (uint32_t)a * (b >> 32);
    uint64_t t4 = (t1 >> 32) + (uint32_t)t2 + (uint32_t)t3;
    return (a >> 32) * (b >> 32) + (t2 >> 32) + (t3 >> 32) + (int64_t)(t4 >> 32);
#endif
}
#endif

static SECP256K1_INLINE void secp256k1_umul128(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
#if defined(UINT128_MAX)
    *r = (uint128_t)a * b;
#else
    r->hi = secp256k1_umulh(a, b);
    r->lo = a * b;
#endif
}

static SECP256K1_INLINE void secp256k1_i128_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
#if defined(UINT128_MAX)
    *r = (int128_t)a * b;
#else
    r->hi = (uint64_t)secp256k1_mulh(a, b);
    r->lo = (uint64_t)a * (uint64_t)b;
#endif
}

static SECP256K1_INLINE void secp256k1_i128_accum_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
#if defined(UINT128_MAX)
    *r += (int128_t)a * b;
#else
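/* A carry out of the low limb occurs exactly when r->lo + lo wraps past
 * 2^64, i.e. when r->lo > ~lo. */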
    uint64_t lo = (uint64_t)a * (uint64_t)b;
    r->hi += (uint64_t)secp256k1_mulh(a, b) + (r->lo > ~lo);
    r->lo += lo;
#endif
}

/* Signed (arithmetic) right shift.
 * Non-constant time in b; b must be strictly less than 128.
 */
static SECP256K1_INLINE void secp256k1_i128_rshift(secp256k1_int128 *r, unsigned int b) {
#if defined(UINT128_MAX)
    *r >>= b;
#else
    if (b >= 64) {
        r->lo = (uint64_t)((int64_t)(r->hi) >> (b-64));
        r->hi = (uint64_t)((int64_t)(r->hi) >> 63);
    } else if (b > 0) {
        r->lo = ((r->hi & (((uint64_t)1<<b)-1)) << (64-b)) | r->lo >> b;
        r->hi = (uint64_t)((int64_t)(r->hi) >> b);
    }
#endif
}

static SECP256K1_INLINE int64_t secp256k1_i128_to_i64(const secp256k1_int128 *a) {
#if defined(UINT128_MAX)
    return (int64_t)(uint64_t)(*a);
#else
    return (int64_t)a->lo;
#endif
}

static SECP256K1_INLINE void secp256k1_i128_from_i64(secp256k1_int128 *r, int64_t a) {
#if defined(UINT128_MAX)
    *r = a;
#else
    r->hi = (uint64_t)(a >> 63);
    r->lo = (uint64_t)a;
#endif
}

static SECP256K1_INLINE int secp256k1_i128_eq(const secp256k1_int128 *a, const secp256k1_int128 *b) {
#if defined(UINT128_MAX)
    return *a == *b;
#else
    return a->hi == b->hi && a->lo == b->lo;
#endif
}

/* WIP: second half of the testing toggle at the top of this file:
#endif
#define UINT128_MAX ((uint128_t)(int128_t)(-1))
*/

#endif
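
The schoolbook decomposition in the fallback path is easy to sanity-check in
isolation. The snippet below is not part of the commit; umulh_sim is a
verbatim copy of the fallback branch of secp256k1_umulh above, and the
reference value assumes a compiler with __int128 (e.g. 64-bit GCC or Clang):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LO32(x) ((uint64_t)(x) & 0xffffffff)
#define HI32(x) ((x) >> 32)

/* Verbatim copy of the fallback branch of secp256k1_umulh. */
static uint64_t umulh_sim(uint64_t a, uint64_t b) {
    uint64_t t1 = LO32(a)*LO32(b);
    uint64_t t2 = HI32(a)*LO32(b);
    uint64_t t3 = LO32(a)*HI32(b) + HI32(t1) + LO32(t2);
    return HI32(a)*HI32(b) + HI32(t2) + HI32(t3);
}

int main(void) {
    uint64_t a = 0xdeadbeefcafebabeULL, b = 0x0123456789abcdefULL;
    unsigned __int128 r = (unsigned __int128)a * b; /* native reference */
    assert(umulh_sim(a, b) == (uint64_t)(r >> 64));
    puts("ok");
    return 0;
}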
177 changes: 102 additions & 75 deletions src/modinv64_impl.h
@@ -10,6 +10,7 @@
 #include "modinv64.h"

 #include "util.h"
+#include "int128.h"

 /* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
  * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
@@ -32,15 +33,17 @@ static const secp256k1_modinv64_signed62 SECP256K1_SIGNED62_ONE = {{1}};
 /* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^62). */
 static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int alen, int64_t factor) {
     const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
-    int128_t c = 0;
+    secp256k1_int128 c, d;
     int i;
+    secp256k1_i128_from_i64(&c, 0);
     for (i = 0; i < 4; ++i) {
-        if (i < alen) c += (int128_t)a->v[i] * factor;
-        r->v[i] = (int64_t)c & M62; c >>= 62;
+        if (i < alen) secp256k1_i128_accum_mul(&c, a->v[i], factor);
+        r->v[i] = secp256k1_i128_to_i64(&c) & M62; secp256k1_i128_rshift(&c, 62);
     }
-    if (4 < alen) c += (int128_t)a->v[4] * factor;
-    VERIFY_CHECK(c == (int64_t)c);
-    r->v[4] = (int64_t)c;
+    if (4 < alen) secp256k1_i128_accum_mul(&c, a->v[4], factor);
+    secp256k1_i128_from_i64(&d, secp256k1_i128_to_i64(&c));
+    VERIFY_CHECK(secp256k1_i128_eq(&c, &d));
+    r->v[4] = secp256k1_i128_to_i64(&c);
 }

 /* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A has alen limbs; b has 5. */
@@ -307,7 +310,7 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
     const int64_t e0 = e->v[0], e1 = e->v[1], e2 = e->v[2], e3 = e->v[3], e4 = e->v[4];
     const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
     int64_t md, me, sd, se;
-    int128_t cd, ce;
+    secp256k1_int128 cd, ce;
 #ifdef VERIFY
     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */
@@ -324,54 +327,64 @@
     md = (u & sd) + (v & se);
     me = (q & sd) + (r & se);
     /* Begin computing t*[d,e]. */
-    cd = (int128_t)u * d0 + (int128_t)v * e0;
-    ce = (int128_t)q * d0 + (int128_t)r * e0;
+    secp256k1_i128_mul(&cd, u, d0);
+    secp256k1_i128_accum_mul(&cd, v, e0);
+    secp256k1_i128_mul(&ce, q, d0);
+    secp256k1_i128_accum_mul(&ce, r, e0);
     /* Correct md,me so that t*[d,e]+modulus*[md,me] has 62 zero bottom bits. */
-    md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62;
-    me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62;
+    md -= (modinfo->modulus_inv62 * (uint64_t)secp256k1_i128_to_i64(&cd) + md) & M62;
+    me -= (modinfo->modulus_inv62 * (uint64_t)secp256k1_i128_to_i64(&ce) + me) & M62;
     /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */
-    cd += (int128_t)modinfo->modulus.v[0] * md;
-    ce += (int128_t)modinfo->modulus.v[0] * me;
+    secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[0], md);
+    secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[0], me);
     /* Verify that the low 62 bits of the computation are indeed zero, and then throw them away. */
-    VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62;
-    VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62;
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cd) & M62) == 0); secp256k1_i128_rshift(&cd, 62);
+    VERIFY_CHECK((secp256k1_i128_to_i64(&ce) & M62) == 0); secp256k1_i128_rshift(&ce, 62);
     /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */
-    cd += (int128_t)u * d1 + (int128_t)v * e1;
-    ce += (int128_t)q * d1 + (int128_t)r * e1;
+    secp256k1_i128_accum_mul(&cd, u, d1);
+    secp256k1_i128_accum_mul(&cd, v, e1);
+    secp256k1_i128_accum_mul(&ce, q, d1);
+    secp256k1_i128_accum_mul(&ce, r, e1);
     if (modinfo->modulus.v[1]) { /* Optimize for the case where limb of modulus is zero. */
-        cd += (int128_t)modinfo->modulus.v[1] * md;
-        ce += (int128_t)modinfo->modulus.v[1] * me;
+        secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[1], md);
+        secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[1], me);
     }
-    d->v[0] = (int64_t)cd & M62; cd >>= 62;
-    e->v[0] = (int64_t)ce & M62; ce >>= 62;
+    d->v[0] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
+    e->v[0] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
     /* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. */
-    cd += (int128_t)u * d2 + (int128_t)v * e2;
-    ce += (int128_t)q * d2 + (int128_t)r * e2;
+    secp256k1_i128_accum_mul(&cd, u, d2);
+    secp256k1_i128_accum_mul(&cd, v, e2);
+    secp256k1_i128_accum_mul(&ce, q, d2);
+    secp256k1_i128_accum_mul(&ce, r, e2);
     if (modinfo->modulus.v[2]) { /* Optimize for the case where limb of modulus is zero. */
-        cd += (int128_t)modinfo->modulus.v[2] * md;
-        ce += (int128_t)modinfo->modulus.v[2] * me;
+        secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[2], md);
+        secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[2], me);
     }
-    d->v[1] = (int64_t)cd & M62; cd >>= 62;
-    e->v[1] = (int64_t)ce & M62; ce >>= 62;
+    d->v[1] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
+    e->v[1] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
     /* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */
-    cd += (int128_t)u * d3 + (int128_t)v * e3;
-    ce += (int128_t)q * d3 + (int128_t)r * e3;
+    secp256k1_i128_accum_mul(&cd, u, d3);
+    secp256k1_i128_accum_mul(&cd, v, e3);
+    secp256k1_i128_accum_mul(&ce, q, d3);
+    secp256k1_i128_accum_mul(&ce, r, e3);
     if (modinfo->modulus.v[3]) { /* Optimize for the case where limb of modulus is zero. */
-        cd += (int128_t)modinfo->modulus.v[3] * md;
-        ce += (int128_t)modinfo->modulus.v[3] * me;
+        secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[3], md);
+        secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[3], me);
     }
-    d->v[2] = (int64_t)cd & M62; cd >>= 62;
-    e->v[2] = (int64_t)ce & M62; ce >>= 62;
+    d->v[2] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
+    e->v[2] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
     /* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. */
-    cd += (int128_t)u * d4 + (int128_t)v * e4;
-    ce += (int128_t)q * d4 + (int128_t)r * e4;
-    cd += (int128_t)modinfo->modulus.v[4] * md;
-    ce += (int128_t)modinfo->modulus.v[4] * me;
-    d->v[3] = (int64_t)cd & M62; cd >>= 62;
-    e->v[3] = (int64_t)ce & M62; ce >>= 62;
+    secp256k1_i128_accum_mul(&cd, u, d4);
+    secp256k1_i128_accum_mul(&cd, v, e4);
+    secp256k1_i128_accum_mul(&ce, q, d4);
+    secp256k1_i128_accum_mul(&ce, r, e4);
+    secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[4], md);
+    secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[4], me);
+    d->v[3] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
+    e->v[3] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
     /* What remains is limb 5 of t*[d,e]+modulus*[md,me]; store it as output limb 4. */
-    d->v[4] = (int64_t)cd;
-    e->v[4] = (int64_t)ce;
+    d->v[4] = secp256k1_i128_to_i64(&cd);
+    e->v[4] = secp256k1_i128_to_i64(&ce);
 #ifdef VERIFY
     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */
@@ -389,36 +402,46 @@ static void secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp
     const int64_t f0 = f->v[0], f1 = f->v[1], f2 = f->v[2], f3 = f->v[3], f4 = f->v[4];
     const int64_t g0 = g->v[0], g1 = g->v[1], g2 = g->v[2], g3 = g->v[3], g4 = g->v[4];
     const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
-    int128_t cf, cg;
+    secp256k1_int128 cf, cg;
     /* Start computing t*[f,g]. */
-    cf = (int128_t)u * f0 + (int128_t)v * g0;
-    cg = (int128_t)q * f0 + (int128_t)r * g0;
+    secp256k1_i128_mul(&cf, u, f0);
+    secp256k1_i128_accum_mul(&cf, v, g0);
+    secp256k1_i128_mul(&cg, q, f0);
+    secp256k1_i128_accum_mul(&cg, r, g0);
     /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
-    VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
-    VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cf) & M62) == 0); secp256k1_i128_rshift(&cf, 62);
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cg) & M62) == 0); secp256k1_i128_rshift(&cg, 62);
     /* Compute limb 1 of t*[f,g], and store it as output limb 0 (= down shift). */
-    cf += (int128_t)u * f1 + (int128_t)v * g1;
-    cg += (int128_t)q * f1 + (int128_t)r * g1;
-    f->v[0] = (int64_t)cf & M62; cf >>= 62;
-    g->v[0] = (int64_t)cg & M62; cg >>= 62;
+    secp256k1_i128_accum_mul(&cf, u, f1);
+    secp256k1_i128_accum_mul(&cf, v, g1);
+    secp256k1_i128_accum_mul(&cg, q, f1);
+    secp256k1_i128_accum_mul(&cg, r, g1);
+    f->v[0] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+    g->v[0] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     /* Compute limb 2 of t*[f,g], and store it as output limb 1. */
-    cf += (int128_t)u * f2 + (int128_t)v * g2;
-    cg += (int128_t)q * f2 + (int128_t)r * g2;
-    f->v[1] = (int64_t)cf & M62; cf >>= 62;
-    g->v[1] = (int64_t)cg & M62; cg >>= 62;
+    secp256k1_i128_accum_mul(&cf, u, f2);
+    secp256k1_i128_accum_mul(&cf, v, g2);
+    secp256k1_i128_accum_mul(&cg, q, f2);
+    secp256k1_i128_accum_mul(&cg, r, g2);
+    f->v[1] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+    g->v[1] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     /* Compute limb 3 of t*[f,g], and store it as output limb 2. */
-    cf += (int128_t)u * f3 + (int128_t)v * g3;
-    cg += (int128_t)q * f3 + (int128_t)r * g3;
-    f->v[2] = (int64_t)cf & M62; cf >>= 62;
-    g->v[2] = (int64_t)cg & M62; cg >>= 62;
+    secp256k1_i128_accum_mul(&cf, u, f3);
+    secp256k1_i128_accum_mul(&cf, v, g3);
+    secp256k1_i128_accum_mul(&cg, q, f3);
+    secp256k1_i128_accum_mul(&cg, r, g3);
+    f->v[2] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+    g->v[2] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     /* Compute limb 4 of t*[f,g], and store it as output limb 3. */
-    cf += (int128_t)u * f4 + (int128_t)v * g4;
-    cg += (int128_t)q * f4 + (int128_t)r * g4;
-    f->v[3] = (int64_t)cf & M62; cf >>= 62;
-    g->v[3] = (int64_t)cg & M62; cg >>= 62;
+    secp256k1_i128_accum_mul(&cf, u, f4);
+    secp256k1_i128_accum_mul(&cf, v, g4);
+    secp256k1_i128_accum_mul(&cg, q, f4);
+    secp256k1_i128_accum_mul(&cg, r, g4);
+    f->v[3] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+    g->v[3] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     /* What remains is limb 5 of t*[f,g]; store it as output limb 4. */
-    f->v[4] = (int64_t)cf;
-    g->v[4] = (int64_t)cg;
+    f->v[4] = secp256k1_i128_to_i64(&cf);
+    g->v[4] = secp256k1_i128_to_i64(&cg);
 }

 /* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps.
@@ -431,30 +454,34 @@ static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_sign
     const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
     const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
     int64_t fi, gi;
-    int128_t cf, cg;
+    secp256k1_int128 cf, cg;
     int i;
     VERIFY_CHECK(len > 0);
     /* Start computing t*[f,g]. */
     fi = f->v[0];
     gi = g->v[0];
-    cf = (int128_t)u * fi + (int128_t)v * gi;
-    cg = (int128_t)q * fi + (int128_t)r * gi;
+    secp256k1_i128_mul(&cf, u, fi);
+    secp256k1_i128_accum_mul(&cf, v, gi);
+    secp256k1_i128_mul(&cg, q, fi);
+    secp256k1_i128_accum_mul(&cg, r, gi);
     /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
-    VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
-    VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cf) & M62) == 0); secp256k1_i128_rshift(&cf, 62);
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cg) & M62) == 0); secp256k1_i128_rshift(&cg, 62);
     /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting
      * down by 62 bits). */
     for (i = 1; i < len; ++i) {
         fi = f->v[i];
         gi = g->v[i];
-        cf += (int128_t)u * fi + (int128_t)v * gi;
-        cg += (int128_t)q * fi + (int128_t)r * gi;
-        f->v[i - 1] = (int64_t)cf & M62; cf >>= 62;
-        g->v[i - 1] = (int64_t)cg & M62; cg >>= 62;
+        secp256k1_i128_accum_mul(&cf, u, fi);
+        secp256k1_i128_accum_mul(&cf, v, gi);
+        secp256k1_i128_accum_mul(&cg, q, fi);
+        secp256k1_i128_accum_mul(&cg, r, gi);
+        f->v[i - 1] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+        g->v[i - 1] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     }
     /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */
-    f->v[len - 1] = (int64_t)cf;
-    g->v[len - 1] = (int64_t)cg;
+    f->v[len - 1] = secp256k1_i128_to_i64(&cf);
+    g->v[len - 1] = secp256k1_i128_to_i64(&cg);
 }

 /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */
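For reference, the rewritten functions above all follow the same pattern:
multiply-accumulate into a 128-bit accumulator, mask off a signed-62 limb, and
shift the carry down. A minimal sketch of one such step (not part of the
commit; assumes int128.h compiles in the library's usual build environment,
with SECP256K1_INLINE available from util.h):

#include <stdint.h>
#include "int128.h"

/* Compute the lowest signed-62 limb of u*f + v*g; the carry that the real
 * code would feed into the next limb is discarded in this sketch. */
static int64_t limb0_of_u_f_plus_v_g(int64_t u, int64_t f, int64_t v, int64_t g) {
    const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
    int64_t limb;
    secp256k1_int128 c;
    secp256k1_i128_mul(&c, u, f);           /* c = u*f */
    secp256k1_i128_accum_mul(&c, v, g);     /* c += v*g */
    limb = secp256k1_i128_to_i64(&c) & M62; /* low 62 bits form the limb */
    secp256k1_i128_rshift(&c, 62);          /* carry down for the next limb */
    return limb;
}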
