Skip to content

Commit

Permalink
Merge #810: Avoid overly-wide multiplications in 5x52 field mul/sqr
Browse files Browse the repository at this point in the history
b53e0cd Avoid overly-wide multiplications (Peter Dettman)

Pull request description:

  Speeds up bench_ecdh, bench_sign, bench_verify relative to master by 5+% at -O3, haswell.

ACKs for top commit:
  sipa:
    ACK b53e0cd
  real-or-random:
    ACK b53e0cd I've inspected the diff and run the tests without asm for a CPU day

Tree-SHA512: 4f79c98371a3dc9da013632210c8db979f910b222291999dfaa0c31849a77eb427361e4ab9206cbfee73c30a8933178784d6cb8e747e8dca6b227eb77fbea2a2
  • Loading branch information
real-or-random committed Oct 17, 2021
2 parents 920a0e5 + b53e0cd commit 9526874
Showing 1 changed file with 22 additions and 23 deletions.
45 changes: 22 additions & 23 deletions src/field_5x52_int128_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,23 +49,23 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
c = (uint128_t)a4 * b[4];
VERIFY_BITS(c, 112);
/* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (c & M) * R; c >>= 52;
d += (uint128_t)R * (uint64_t)c; c >>= 64;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 60);
/* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
VERIFY_BITS(c, 48);
/* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
t3 = d & M; d >>= 52;
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
/* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */

d += (uint128_t)a0 * b[4]
+ (uint128_t)a1 * b[3]
+ (uint128_t)a2 * b[2]
+ (uint128_t)a3 * b[1]
+ (uint128_t)a4 * b[0];
VERIFY_BITS(d, 115);
/* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += c * R;
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += (uint128_t)(R << 12) * (uint64_t)c;
VERIFY_BITS(d, 116);
/* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
t4 = d & M; d >>= 52;
Expand Down Expand Up @@ -129,17 +129,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
+ (uint128_t)a4 * b[3];
VERIFY_BITS(d, 114);
/* [d 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (d & M) * R; d >>= 52;
c += (uint128_t)R * (uint64_t)d; d >>= 64;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
/* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
VERIFY_BITS(d, 50);
/* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */

/* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[2] = c & M; c >>= 52;
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
/* [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += d * R + t3;
/* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)(R << 12) * (uint64_t)d + t3;
VERIFY_BITS(c, 100);
/* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[3] = c & M; c >>= 52;
Expand Down Expand Up @@ -178,22 +177,22 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
c = (uint128_t)a4 * a4;
VERIFY_BITS(c, 112);
/* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (c & M) * R; c >>= 52;
d += (uint128_t)R * (uint64_t)c; c >>= 64;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 60);
/* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
VERIFY_BITS(c, 48);
/* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
t3 = d & M; d >>= 52;
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
/* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */

a4 *= 2;
d += (uint128_t)a0 * a4
+ (uint128_t)(a1*2) * a3
+ (uint128_t)a2 * a2;
VERIFY_BITS(d, 115);
/* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += c * R;
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += (uint128_t)(R << 12) * (uint64_t)c;
VERIFY_BITS(d, 116);
/* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
t4 = d & M; d >>= 52;
Expand Down Expand Up @@ -252,16 +251,16 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
d += (uint128_t)a3 * a4;
VERIFY_BITS(d, 114);
/* [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (d & M) * R; d >>= 52;
c += (uint128_t)R * (uint64_t)d; d >>= 64;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
/* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
VERIFY_BITS(d, 50);
/* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[2] = c & M; c >>= 52;
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
/* [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
/* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */

c += d * R + t3;
c += (uint128_t)(R << 12) * (uint64_t)d + t3;
VERIFY_BITS(c, 100);
/* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[3] = c & M; c >>= 52;
Expand Down

0 comments on commit 9526874

Please sign in to comment.