Skip to content

Commit

Permalink
Merge pull request #1314 from torben-hansen/aws-lc-s2n-bignum-update-…
Browse files Browse the repository at this point in the history
…2023-11-19

Update s2n-bignum subtree 2023-11-19
  • Loading branch information
torben-hansen authored Nov 20, 2023
2 parents b15b9d3 + 4087f5d commit 056bce6
Show file tree
Hide file tree
Showing 14 changed files with 976 additions and 18 deletions.
284 changes: 284 additions & 0 deletions third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Multiply-add modulo the order of the curve25519/edwards25519 basepoint
// Inputs x[4], y[4], c[4]; output z[4]
//
// extern void bignum_madd_n25519
// (uint64_t z[static 4], uint64_t x[static 4],
// uint64_t y[static 4], uint64_t c[static 4]);
//
// Performs z := (x * y + c) mod n_25519, where the modulus is
// n_25519 = 2^252 + 27742317777372353535851937790883648493, the
// order of the curve25519/edwards25519 basepoint. The result z
// and the inputs x, y and c are all 4 digits (256 bits).
//
// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = c
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd_n25519)
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd_n25519)
.text
.balign 4

// Backup of the input pointer so we can modify x0

#define z x19

// Temporaries for reduction phase

#define q x2
#define n0 x3
#define n1 x4
#define t0 x5
#define t1 x6
#define t2 x7

// Loading large constants

#define movbig(nn,n3,n2,n1,n0) \
movz nn, n0; \
movk nn, n1, lsl #16; \
movk nn, n2, lsl #32; \
movk nn, n3, lsl #48

// Single round of modular reduction mod_n25519, mapping
// [m4;m3;m2;m1;m0] = m to [m3;m2;m1;m0] = m mod n_25519,
// *assuming* the input m < 2^64 * n_25519. This is very
// close to the loop body of the bignum_mod_n25519 function.

#define reduce(m4,m3,m2,m1,m0) \
extr q, m4, m3, #60; \
and m3, m3, #0x0FFFFFFFFFFFFFFF; \
sub q, q, m4, lsr #60; \
and t0, m4, #0xF000000000000000; \
add m3, m3, t0; \
mul t0, n0, q; \
mul t1, n1, q; \
umulh t2, n0, q; \
adds t1, t1, t2; \
umulh t2, n1, q; \
adc t2, t2, xzr; \
subs m0, m0, t0; \
sbcs m1, m1, t1; \
sbcs m2, m2, t2; \
sbcs m3, m3, xzr; \
csel t0, n0, xzr, cc; \
csel t1, n1, xzr, cc; \
adds m0, m0, t0; \
and t2, t0, #0x1000000000000000; \
adcs m1, m1, t1; \
adcs m2, m2, xzr; \
adc m3, m3, t2

// Special case of "reduce" with m4 = 0. As well as not using m4,
// the quotient selection is slightly simpler, just floor(m/2^252)
// versus min (floor(m/2^252)) (2^63-1).

#define reduce0(m3,m2,m1,m0) \
lsr q, m3, #60; \
and m3, m3, #0x0FFFFFFFFFFFFFFF; \
mul t0, n0, q; \
mul t1, n1, q; \
umulh t2, n0, q; \
adds t1, t1, t2; \
umulh t2, n1, q; \
adc t2, t2, xzr; \
subs m0, m0, t0; \
sbcs m1, m1, t1; \
sbcs m2, m2, t2; \
sbcs m3, m3, xzr; \
csel t0, n0, xzr, cc; \
csel t1, n1, xzr, cc; \
adds m0, m0, t0; \
and t2, t0, #0x1000000000000000; \
adcs m1, m1, t1; \
adcs m2, m2, xzr; \
adc m3, m3, t2

S2N_BN_SYMBOL(bignum_madd_n25519):

stp x19, x20, [sp, -16]!

// Back up the result pointer so we can overwrite x0 in intermediate steps

mov z, x0

// First compute [x15;x14;x13;x12;x11;x10;x9;x8] = x * y. This is
// a basic 2-level Karatsuba multiplier, similar to the start of
// bignum_mul_p25519, but with changes to the register allocation,
// which in particular preserve x3/w3 for the next step.

ldp x0, x4, [x1]
ldp x5, x6, [x2]
umull x8, w0, w5
lsr x17, x0, #32
umull x7, w17, w5
lsr x16, x5, #32
umull x9, w16, w17
umull x16, w0, w16
adds x8, x8, x7, lsl #32
lsr x7, x7, #32
adc x9, x9, x7
adds x8, x8, x16, lsl #32
lsr x16, x16, #32
adc x9, x9, x16
mul x10, x4, x6
umulh x11, x4, x6
subs x4, x4, x0
cneg x4, x4, cc
csetm x16, cc
adds x10, x10, x9
adc x11, x11, xzr
subs x0, x5, x6
cneg x0, x0, cc
cinv x16, x16, cc
mul x7, x4, x0
umulh x0, x4, x0
adds x9, x8, x10
adcs x10, x10, x11
adc x11, x11, xzr
cmn x16, #0x1
eor x7, x7, x16
adcs x9, x7, x9
eor x0, x0, x16
adcs x10, x0, x10
adc x11, x11, x16
ldp x0, x4, [x1, #16]
ldp x5, x6, [x2, #16]
umull x12, w0, w5
lsr x17, x0, #32
umull x7, w17, w5
lsr x16, x5, #32
umull x13, w16, w17
umull x16, w0, w16
adds x12, x12, x7, lsl #32
lsr x7, x7, #32
adc x13, x13, x7
adds x12, x12, x16, lsl #32
lsr x16, x16, #32
adc x13, x13, x16
mul x14, x4, x6
umulh x15, x4, x6
subs x4, x4, x0
cneg x4, x4, cc
csetm x16, cc
adds x14, x14, x13
adc x15, x15, xzr
subs x0, x5, x6
cneg x0, x0, cc
cinv x16, x16, cc
mul x7, x4, x0
umulh x0, x4, x0
adds x13, x12, x14
adcs x14, x14, x15
adc x15, x15, xzr
cmn x16, #0x1
eor x7, x7, x16
adcs x13, x7, x13
eor x0, x0, x16
adcs x14, x0, x14
adc x15, x15, x16
ldp x0, x4, [x1, #16]
ldp x7, x16, [x1]
subs x0, x0, x7
sbcs x4, x4, x16
csetm x16, cc
ldp x7, x17, [x2]
subs x5, x7, x5
sbcs x6, x17, x6
csetm x17, cc
eor x0, x0, x16
subs x0, x0, x16
eor x4, x4, x16
sbc x4, x4, x16
eor x5, x5, x17
subs x5, x5, x17
eor x6, x6, x17
sbc x6, x6, x17
eor x16, x17, x16
adds x12, x12, x10
adcs x13, x13, x11
adcs x14, x14, xzr
adc x15, x15, xzr
mul x2, x0, x5
umulh x17, x0, x5
mul x7, x4, x6
umulh x1, x4, x6
subs x4, x4, x0
cneg x4, x4, cc
csetm x10, cc
adds x7, x7, x17
adc x1, x1, xzr
subs x6, x5, x6
cneg x6, x6, cc
cinv x10, x10, cc
mul x5, x4, x6
umulh x6, x4, x6
adds x17, x2, x7
adcs x7, x7, x1
adc x1, x1, xzr
cmn x10, #0x1
eor x5, x5, x10
adcs x17, x5, x17
eor x6, x6, x10
adcs x7, x6, x7
adc x1, x1, x10
adds x10, x12, x8
adcs x11, x13, x9
adcs x12, x14, x12
adcs x13, x15, x13
adcs x14, x14, xzr
adc x15, x15, xzr
cmn x16, #0x1
eor x2, x2, x16
adcs x10, x2, x10
eor x17, x17, x16
adcs x11, x17, x11
eor x7, x7, x16
adcs x12, x7, x12
eor x1, x1, x16
adcs x13, x1, x13
adcs x14, x14, x16
adc x15, x15, x16

// Add the constant term, so [x15;x14;x13;x12;x11;x10;x9;x8] = x * y + c
// It's easier to just do this now versus incorporating it into the
// Karatsuba steps above or deferring it until partway through the
// reduction, though it does result in a long carry propagation here.

ldp x0, x1, [x3]
adds x8, x8, x0
adcs x9, x9, x1
ldp x0, x1, [x3, #16]
adcs x10, x10, x0
adcs x11, x11, x1
adcs x12, x12, xzr
adcs x13, x13, xzr
adcs x14, x14, xzr
adc x15, x15, xzr

// Now do the modular reduction and write back

movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed)
movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6)

reduce0(x15,x14,x13,x12)
reduce(x15,x14,x13,x12,x11)
reduce(x14,x13,x12,x11,x10)
reduce(x13,x12,x11,x10,x9)
reduce(x12,x11,x10,x9,x8)

stp x8, x9, [z]
stp x10, x11, [z, #16]

// Restore registers and return

ldp x19, x20, [sp], 16
ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stacz,"",%progbits
#endif
Loading

0 comments on commit 056bce6

Please sign in to comment.