-
Notifications
You must be signed in to change notification settings - Fork 121
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1314 from torben-hansen/aws-lc-s2n-bignum-update-…
…2023-11-19 Update s2n-bignum subtree 2023-11-19
- Loading branch information
Showing
14 changed files
with
976 additions
and
18 deletions.
There are no files selected for viewing
284 changes: 284 additions & 0 deletions
284
third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519.S
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,284 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 OR ISC | ||
|
||
// ---------------------------------------------------------------------------- | ||
// Multiply-add modulo the order of the curve25519/edwards25519 basepoint | ||
// Inputs x[4], y[4], c[4]; output z[4] | ||
// | ||
// extern void bignum_madd_n25519 | ||
// (uint64_t z[static 4], uint64_t x[static 4], | ||
// uint64_t y[static 4], uint64_t c[static 4]); | ||
// | ||
// Performs z := (x * y + c) mod n_25519, where the modulus is | ||
// n_25519 = 2^252 + 27742317777372353535851937790883648493, the | ||
// order of the curve25519/edwards25519 basepoint. The result z | ||
// and the inputs x, y and c are all 4 digits (256 bits). | ||
// | ||
// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = c | ||
// ---------------------------------------------------------------------------- | ||
#include "_internal_s2n_bignum.h" | ||
|
||
// Symbol visibility/privacy directives for the entry point defined below | ||
// (these macros are presumably supplied by _internal_s2n_bignum.h - confirm), | ||
// followed by selection of the .text section with 4-byte alignment. | ||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd_n25519) | ||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd_n25519) | ||
.text | ||
.balign 4 | ||
|
||
// Backup of the result pointer (x0 on entry) so we can modify x0. | ||
// x19 is callee-saved, so the function saves and restores it. | ||
|
||
#define z x19 | ||
|
||
// Temporaries for reduction phase: q is the quotient estimate, n0 and n1 | ||
// hold the two low digits of n_25519, and t0-t2 are scratch. These names | ||
// alias registers that hold the y and c pointers and multiplier scratch | ||
// earlier in the function, so they only apply once the reduction starts. | ||
|
||
#define q x2 | ||
#define n0 x3 | ||
#define n1 x4 | ||
#define t0 x5 | ||
#define t1 x6 | ||
#define t2 x7 | ||
|
||
// Build a 64-bit constant in register dst from four 16-bit immediates, | ||
// listed most significant first: movz sets bits 15..0 (clearing the rest) | ||
// and each movk then fills in one further 16-bit field. | ||
|
||
#define movbig(dst,hw3,hw2,hw1,hw0) \ | ||
movz dst, hw0; \ | ||
movk dst, hw1, lsl #16; \ | ||
movk dst, hw2, lsl #32; \ | ||
movk dst, hw3, lsl #48 | ||
|
||
// Single round of modular reduction modulo n_25519, mapping | ||
// [m4;m3;m2;m1;m0] = m to [m3;m2;m1;m0] = m mod n_25519, | ||
// *assuming* the input m < 2^64 * n_25519. This is very | ||
// close to the loop body of the bignum_mod_n25519 function. | ||
// | ||
// Steps: the quotient estimate q is [m4;m3] shifted right by 60 bits, | ||
// less (m4 >> 60); m3 is rewritten as its low 60 bits plus the top 4 bits | ||
// of m4. Then q * [n1;n0] is subtracted from [m3;m2;m1;m0], and if that | ||
// borrows, n_25519 is added back once. The top digit 2^60 of that | ||
// correction is obtained by masking the selected n0 value with | ||
// 0x1000000000000000, which relies on bit 60 of n0 being set. | ||
// | ||
// Requires n0 and n1 to be loaded beforehand. Reads m4 without writing it; | ||
// overwrites m3, m2, m1, m0, q, t0, t1, t2 and the condition flags. | ||
|
||
#define reduce(m4,m3,m2,m1,m0) \ | ||
extr q, m4, m3, #60; \ | ||
and m3, m3, #0x0FFFFFFFFFFFFFFF; \ | ||
sub q, q, m4, lsr #60; \ | ||
and t0, m4, #0xF000000000000000; \ | ||
add m3, m3, t0; \ | ||
mul t0, n0, q; \ | ||
mul t1, n1, q; \ | ||
umulh t2, n0, q; \ | ||
adds t1, t1, t2; \ | ||
umulh t2, n1, q; \ | ||
adc t2, t2, xzr; \ | ||
subs m0, m0, t0; \ | ||
sbcs m1, m1, t1; \ | ||
sbcs m2, m2, t2; \ | ||
sbcs m3, m3, xzr; \ | ||
csel t0, n0, xzr, cc; \ | ||
csel t1, n1, xzr, cc; \ | ||
adds m0, m0, t0; \ | ||
and t2, t0, #0x1000000000000000; \ | ||
adcs m1, m1, t1; \ | ||
adcs m2, m2, xzr; \ | ||
adc m3, m3, t2 | ||
|
||
// Special case of "reduce" with m4 = 0. As well as not using m4, | ||
// the quotient selection is slightly simpler, just floor(m/2^252) | ||
// versus min (floor(m/2^252)) (2^63-1). | ||
// NOTE(review): the instruction sequence in "reduce" appears to cap the | ||
// quotient at 2^64-1 rather than 2^63-1 - confirm and adjust the bound. | ||
// | ||
// Here q is simply m3 >> 60 and m3 keeps its low 60 bits. The rest is the | ||
// same as "reduce": subtract q * [n1;n0] from [m3;m2;m1;m0] and, on borrow, | ||
// add n_25519 back once (top digit 2^60 derived from bit 60 of n0). | ||
// | ||
// Requires n0 and n1 to be loaded beforehand; overwrites m3, m2, m1, m0, | ||
// q, t0, t1, t2 and the condition flags. | ||
|
||
#define reduce0(m3,m2,m1,m0) \ | ||
lsr q, m3, #60; \ | ||
and m3, m3, #0x0FFFFFFFFFFFFFFF; \ | ||
mul t0, n0, q; \ | ||
mul t1, n1, q; \ | ||
umulh t2, n0, q; \ | ||
adds t1, t1, t2; \ | ||
umulh t2, n1, q; \ | ||
adc t2, t2, xzr; \ | ||
subs m0, m0, t0; \ | ||
sbcs m1, m1, t1; \ | ||
sbcs m2, m2, t2; \ | ||
sbcs m3, m3, xzr; \ | ||
csel t0, n0, xzr, cc; \ | ||
csel t1, n1, xzr, cc; \ | ||
adds m0, m0, t0; \ | ||
and t2, t0, #0x1000000000000000; \ | ||
adcs m1, m1, t1; \ | ||
adcs m2, m2, xzr; \ | ||
adc m3, m3, t2 | ||
|
||
// Entry point: x0 = z, x1 = x, x2 = y, x3 = c. | ||
// Register use in the body: x0-x2 and x4-x17 are overwritten during the | ||
// multiplication (x1 and x2 only after their last use as pointers), x3 | ||
// stays the c pointer until c has been added and is then reused as n0, | ||
// and x19 holds the result pointer until the final stores. | ||
S2N_BN_SYMBOL(bignum_madd_n25519): | ||
|
||
// Save x19 (used as z below). x20 is stored and reloaded alongside it but | ||
// is not otherwise touched here; the pair makes the stack adjustment 16 bytes. | ||
||
stp x19, x20, [sp, -16]! | ||
|
||
||
// Back up the result pointer so we can overwrite x0 in intermediate steps | ||
|
||
||
mov z, x0 | ||
|
||
||
// First compute [x15;x14;x13;x12;x11;x10;x9;x8] = x * y. This is | ||
// a basic 2-level Karatsuba multiplier, similar to the start of | ||
// bignum_mul_p25519, but with changes to the register allocation, | ||
// which in particular preserve x3/w3 for the next step. | ||
|
||
// Low product: [x11;x10;x9;x8] = x[1..0] * y[1..0]. The x[0] * y[0] term | ||
// is assembled from 32-bit umull partial products; the cross term is the | ||
// product of |x[1] - x[0]| and |y[0] - y[1]|, applied with sign mask x16. | ||
||
ldp x0, x4, [x1] | ||
ldp x5, x6, [x2] | ||
umull x8, w0, w5 | ||
lsr x17, x0, #32 | ||
umull x7, w17, w5 | ||
lsr x16, x5, #32 | ||
umull x9, w16, w17 | ||
umull x16, w0, w16 | ||
adds x8, x8, x7, lsl #32 | ||
lsr x7, x7, #32 | ||
adc x9, x9, x7 | ||
adds x8, x8, x16, lsl #32 | ||
lsr x16, x16, #32 | ||
adc x9, x9, x16 | ||
mul x10, x4, x6 | ||
umulh x11, x4, x6 | ||
subs x4, x4, x0 | ||
cneg x4, x4, cc | ||
csetm x16, cc | ||
adds x10, x10, x9 | ||
adc x11, x11, xzr | ||
subs x0, x5, x6 | ||
cneg x0, x0, cc | ||
cinv x16, x16, cc | ||
mul x7, x4, x0 | ||
umulh x0, x4, x0 | ||
adds x9, x8, x10 | ||
adcs x10, x10, x11 | ||
adc x11, x11, xzr | ||
cmn x16, #0x1 | ||
eor x7, x7, x16 | ||
adcs x9, x7, x9 | ||
eor x0, x0, x16 | ||
adcs x10, x0, x10 | ||
adc x11, x11, x16 | ||
// High product: [x15;x14;x13;x12] = x[3..2] * y[3..2], by the same | ||
// sequence. x5 and x6 are left holding y[2] and y[3] for use below. | ||
ldp x0, x4, [x1, #16] | ||
ldp x5, x6, [x2, #16] | ||
umull x12, w0, w5 | ||
lsr x17, x0, #32 | ||
umull x7, w17, w5 | ||
lsr x16, x5, #32 | ||
umull x13, w16, w17 | ||
umull x16, w0, w16 | ||
adds x12, x12, x7, lsl #32 | ||
lsr x7, x7, #32 | ||
adc x13, x13, x7 | ||
adds x12, x12, x16, lsl #32 | ||
lsr x16, x16, #32 | ||
adc x13, x13, x16 | ||
mul x14, x4, x6 | ||
umulh x15, x4, x6 | ||
subs x4, x4, x0 | ||
cneg x4, x4, cc | ||
csetm x16, cc | ||
adds x14, x14, x13 | ||
adc x15, x15, xzr | ||
subs x0, x5, x6 | ||
cneg x0, x0, cc | ||
cinv x16, x16, cc | ||
mul x7, x4, x0 | ||
umulh x0, x4, x0 | ||
adds x13, x12, x14 | ||
adcs x14, x14, x15 | ||
adc x15, x15, xzr | ||
cmn x16, #0x1 | ||
eor x7, x7, x16 | ||
adcs x13, x7, x13 | ||
eor x0, x0, x16 | ||
adcs x14, x0, x14 | ||
adc x15, x15, x16 | ||
// Operand differences: [x4;x0] = |x[3..2] - x[1..0]| and | ||
// [x6;x5] = |y[1..0] - y[3..2]|; x16 becomes the combined sign mask. | ||
ldp x0, x4, [x1, #16] | ||
ldp x7, x16, [x1] | ||
subs x0, x0, x7 | ||
sbcs x4, x4, x16 | ||
csetm x16, cc | ||
ldp x7, x17, [x2] | ||
subs x5, x7, x5 | ||
sbcs x6, x17, x6 | ||
csetm x17, cc | ||
eor x0, x0, x16 | ||
subs x0, x0, x16 | ||
eor x4, x4, x16 | ||
sbc x4, x4, x16 | ||
eor x5, x5, x17 | ||
subs x5, x5, x17 | ||
eor x6, x6, x17 | ||
sbc x6, x6, x17 | ||
eor x16, x17, x16 | ||
// Add the top half [x11;x10] of the low product into the high product | ||
adds x12, x12, x10 | ||
adcs x13, x13, x11 | ||
adcs x14, x14, xzr | ||
adc x15, x15, xzr | ||
// Middle product: [x1;x7;x17;x2] = [x4;x0] * [x6;x5], using the same | ||
// one-level scheme with x10 as the inner sign mask. x1 and x2 are no | ||
// longer needed as pointers at this point. | ||
mul x2, x0, x5 | ||
umulh x17, x0, x5 | ||
mul x7, x4, x6 | ||
umulh x1, x4, x6 | ||
subs x4, x4, x0 | ||
cneg x4, x4, cc | ||
csetm x10, cc | ||
adds x7, x7, x17 | ||
adc x1, x1, xzr | ||
subs x6, x5, x6 | ||
cneg x6, x6, cc | ||
cinv x10, x10, cc | ||
mul x5, x4, x6 | ||
umulh x6, x4, x6 | ||
adds x17, x2, x7 | ||
adcs x7, x7, x1 | ||
adc x1, x1, xzr | ||
cmn x10, #0x1 | ||
eor x5, x5, x10 | ||
adcs x17, x5, x17 | ||
eor x6, x6, x10 | ||
adcs x7, x6, x7 | ||
adc x1, x1, x10 | ||
// Combine the low and high products in [x15;...;x10]; x8 and x9 remain | ||
// the bottom two digits of the result. | ||
adds x10, x12, x8 | ||
adcs x11, x13, x9 | ||
adcs x12, x14, x12 | ||
adcs x13, x15, x13 | ||
adcs x14, x14, xzr | ||
adc x15, x15, xzr | ||
// Add the middle product with sign x16: XOR with the mask plus the | ||
// carry-in produced by cmn negates it when x16 is all ones, and the | ||
// mask itself is added into x14 and x15 as the sign extension. | ||
cmn x16, #0x1 | ||
eor x2, x2, x16 | ||
adcs x10, x2, x10 | ||
eor x17, x17, x16 | ||
adcs x11, x17, x11 | ||
eor x7, x7, x16 | ||
adcs x12, x7, x12 | ||
eor x1, x1, x16 | ||
adcs x13, x1, x13 | ||
adcs x14, x14, x16 | ||
adc x15, x15, x16 | ||
|
||
// Add the constant term, so [x15;x14;x13;x12;x11;x10;x9;x8] = x * y + c | ||
// It's easier to just do this now versus incorporating it into the | ||
// Karatsuba steps above or deferring it until partway through the | ||
// reduction, though it does result in a long carry propagation here. | ||
|
||
||
ldp x0, x1, [x3] | ||
adds x8, x8, x0 | ||
adcs x9, x9, x1 | ||
ldp x0, x1, [x3, #16] | ||
adcs x10, x10, x0 | ||
adcs x11, x11, x1 | ||
adcs x12, x12, xzr | ||
adcs x13, x13, xzr | ||
adcs x14, x14, xzr | ||
adc x15, x15, xzr | ||
|
||
||
// Now do the modular reduction and write back | ||
|
||
// Load the two low digits of n_25519 into n0 and n1. This overwrites x3 | ||
// (the c pointer) and x4, which are no longer needed. | ||
||
movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed) | ||
movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6) | ||
|
||
// Reduce the top four digits first, then bring in one further digit per | ||
// step. Each step leaves its result in the low four registers it is | ||
// given, so the final value ends up in [x11;x10;x9;x8]. | ||
||
reduce0(x15,x14,x13,x12) | ||
reduce(x15,x14,x13,x12,x11) | ||
reduce(x14,x13,x12,x11,x10) | ||
reduce(x13,x12,x11,x10,x9) | ||
reduce(x12,x11,x10,x9,x8) | ||
|
||
||
stp x8, x9, [z] | ||
stp x10, x11, [z, #16] | ||
|
||
||
// Restore registers and return | ||
|
||
||
ldp x19, x20, [sp], 16 | ||
ret | ||
|
||
// On Linux/ELF, emit an empty .note.GNU-stack section so the GNU toolchain | ||
// records that this object does not need an executable stack. The section | ||
// name must be spelled exactly ".note.GNU-stack" to be recognized. | ||
#if defined(__linux__) && defined(__ELF__) | ||
.section .note.GNU-stack,"",%progbits | ||
#endif |
Oops, something went wrong.