From de00f966e8e77daf77cbc5bc030a9e84681979e7 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 1 Jul 2022 20:50:08 -0500 Subject: [PATCH 1/3] Add basic NIST P-384 point operations A point doubling function, point addition function, and point mixed addition function for the P-384 curve, all using Jacobian coordinates in a Montgomery representation, with input nondegeneracy assumed. Once again, the addition and mixed addition functions offer only marginal efficiency gains over just calling a sequence of basic field operations, but the doubling has some beneficial mathematically equivalent short-cutting of the intermediate modular reductions. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/df8e913c542e5392a9f9cb6cd42fc90c5a02f72e --- arm/p384/Makefile | 5 +- arm/p384/p384_montjadd.S | 893 +++++++++++++++++++++++++++ arm/p384/p384_montjdouble.S | 963 +++++++++++++++++++++++++++++ arm/p384/p384_montjmixadd.S | 884 +++++++++++++++++++++++++++ x86_att/p384/p384_montjadd.S | 955 +++++++++++++++++++++++++++++ x86_att/p384/p384_montjdouble.S | 1014 +++++++++++++++++++++++++++++++ x86_att/p384/p384_montjmixadd.S | 941 ++++++++++++++++++++++++++++ 7 files changed, 5654 insertions(+), 1 deletion(-) create mode 100644 arm/p384/p384_montjadd.S create mode 100644 arm/p384/p384_montjdouble.S create mode 100644 arm/p384/p384_montjmixadd.S create mode 100644 x86_att/p384/p384_montjadd.S create mode 100644 x86_att/p384/p384_montjdouble.S create mode 100644 x86_att/p384/p384_montjmixadd.S diff --git a/arm/p384/Makefile b/arm/p384/Makefile index 469a20ff12..11a5605504 100644 --- a/arm/p384/Makefile +++ b/arm/p384/Makefile @@ -53,7 +53,10 @@ OBJ = bignum_add_p384.o \ bignum_optneg_p384.o \ bignum_sub_p384.o \ bignum_tomont_p384.o \ - bignum_triple_p384.o + bignum_triple_p384.o \ + p384_montjadd.o \ + p384_montjdouble.o \ + p384_montjmixadd.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/arm/p384/p384_montjadd.S 
b/arm/p384/p384_montjadd.S new file mode 100644 index 0000000000..138afa9dc3 --- /dev/null +++ b/arm/p384/p384_montjadd.S @@ -0,0 +1,893 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjadd +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, 
x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; \ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul x11, x4, x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + 
adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, 
xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ + subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ + sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, #0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ 
+ mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; 
\ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, 
x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when the result is an input +// to other muls with the other argument fully reduced (which is always +// safe). The result is congruent to the Montgomery square mod p_384 +// and fits in 6 words, but it is not necessarily below p_384. 
+ +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs 
x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + 
mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + mov x14, #-4294967295; \ + mov x15, #4294967295; \ + csel x14, x14, xzr, cs; \ + csel x15, x15, xzr, cs; \ + cset x16, cs; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, xzr; \ + adcs x12, x12, xzr; \ + adc x13, x13, xzr; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_p384(z1sq,z_1) + amontsqr_p384(z2sq,z_2) + + montmul_p384(y1a,z_2,y_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,z1sq,x_2) + montmul_p384(x1a,z2sq,x_1) + montmul_p384(y2a,z1sq,y2a) + montmul_p384(y1a,z2sq,y1a) + + sub_p384(xd,x2a,x1a) + sub_p384(yd,y2a,y1a) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x1a) + montmul_p384(zzx2,zz,x2a) + + sub_p384(x_3,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(xd,xd,z_1) + + sub_p384(x_3,x_3,zzx2) + + sub_p384(t2,zzx1,x_3) + + montmul_p384(t1,t1,y1a) + montmul_p384(z_3,xd,z_2) + montmul_p384(t2,yd,t2) + + sub_p384(y_3,t2,t1) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p384/p384_montjdouble.S b/arm/p384/p384_montjdouble.S new file mode 100644 index 0000000000..8fa2ad3234 --- /dev/null +++ b/arm/p384/p384_montjdouble.S @@ -0,0 +1,963 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjdouble +// (uint64_t p3[static 18],uint64_t p1[static 18]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x23 +#define input_x x24 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, 
x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; \ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul x11, x4, x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, 
x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, 
x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ + subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ + sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, #0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, 
x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul 
x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add 
x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// Corresponds exactly to bignum_add_p384 + +#define add_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adds x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, x3; \ + adc x3, xzr, xzr; 
\ + mov x4, #0xffffffff; \ + cmp x5, x4; \ + mov x4, #0xffffffff00000000; \ + sbcs xzr, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + sbcs xzr, x7, x4; \ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adcs xzr, x10, xzr; \ + adcs x3, x3, xzr; \ + csetm x3, ne; \ + mov x4, #0xffffffff; \ + and x4, x4, x3; \ + subs x5, x5, x4; \ + eor x4, x4, x3; \ + sbcs x6, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + and x4, x4, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + sbcs x9, x9, x3; \ + sbc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// P0 = 4 * P1 - P2 + +#define cmsub41_p384(P0,P1,P2) \ + ldp x1, x2, [P1]; \ + ldp x3, x4, [P1+16]; \ + ldp x5, x6, [P1+32]; \ + lsl x0, x1, #2; \ + ldp x7, x8, [P2]; \ + subs x0, x0, x7; \ + extr x1, x2, x1, #62; \ + sbcs x1, x1, x8; \ + ldp x7, x8, [P2+16]; \ + extr x2, x3, x2, #62; \ + sbcs x2, x2, x7; \ + extr x3, x4, x3, #62; \ + sbcs x3, x3, x8; \ + extr x4, x5, x4, #62; \ + ldp x7, x8, [P2+32]; \ + sbcs x4, x4, x7; \ + extr x5, x6, x5, #62; \ + sbcs x5, x5, x8; \ + lsr x6, x6, #62; \ + adc x6, x6, xzr; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x8, cc; \ + mov x9, #0xffffffff; \ + and x9, x9, x8; \ + adds x0, x0, x9; \ + eor x9, x9, x8; \ + adcs x1, x1, x9; \ + mov x9, #0xfffffffffffffffe; \ + and x9, x9, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x8; \ + adcs x4, x4, x8; \ + adc x5, x5, x8; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +// P0 = C * P1 - D * P2 + +#define cmsub_p384(P0,C,P1,D,P2) \ + ldp x0, x1, [P2]; \ + mov x6, #0x00000000ffffffff; \ + subs x6, x6, x0; \ + mov x7, #0xffffffff00000000; \ + sbcs x7, x7, x1; \ + ldp x0, x1, [P2+16]; \ + mov x8, #0xfffffffffffffffe; \ + sbcs x8, x8, x0; \ + mov x13, #0xffffffffffffffff; \ + sbcs x9, x13, x1; \ + ldp x0, x1, [P2+32]; \ + sbcs x10, x13, x0; \ 
+ sbc x11, x13, x1; \ + mov x12, D; \ + mul x0, x12, x6; \ + mul x1, x12, x7; \ + mul x2, x12, x8; \ + mul x3, x12, x9; \ + mul x4, x12, x10; \ + mul x5, x12, x11; \ + umulh x6, x12, x6; \ + umulh x7, x12, x7; \ + umulh x8, x12, x8; \ + umulh x9, x12, x9; \ + umulh x10, x12, x10; \ + umulh x12, x12, x11; \ + adds x1, x1, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x8; \ + adcs x4, x4, x9; \ + adcs x5, x5, x10; \ + mov x6, #1; \ + adc x6, x12, x6; \ + ldp x8, x9, [P1]; \ + ldp x10, x11, [P1+16]; \ + ldp x12, x13, [P1+32]; \ + mov x14, C; \ + mul x15, x14, x8; \ + umulh x8, x14, x8; \ + adds x0, x0, x15; \ + mul x15, x14, x9; \ + umulh x9, x14, x9; \ + adcs x1, x1, x15; \ + mul x15, x14, x10; \ + umulh x10, x14, x10; \ + adcs x2, x2, x15; \ + mul x15, x14, x11; \ + umulh x11, x14, x11; \ + adcs x3, x3, x15; \ + mul x15, x14, x12; \ + umulh x12, x14, x12; \ + adcs x4, x4, x15; \ + mul x15, x14, x13; \ + umulh x13, x14, x13; \ + adcs x5, x5, x15; \ + adc x6, x6, xzr; \ + adds x1, x1, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x10; \ + adcs x4, x4, x11; \ + adcs x5, x5, x12; \ + adcs x6, x6, x13; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x6, cc; \ + mov x7, #0xffffffff; \ + and x7, x7, x6; \ + adds x0, x0, x7; \ + eor x7, x7, x6; \ + adcs x1, x1, x7; \ + mov x7, #0xfffffffffffffffe; \ + and x7, x7, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x6; \ + adcs x4, x4, x6; \ + adc x5, x5, x6; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +// A weak version of add that only guarantees sum in 6 digits + +#define weakadd_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adds x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, 
x3; \ + csetm x3, cs; \ + mov x4, #0xffffffff; \ + and x4, x4, x3; \ + subs x5, x5, x4; \ + eor x4, x4, x3; \ + sbcs x6, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + and x4, x4, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + sbcs x9, x9, x3; \ + sbc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// P0 = 3 * P1 - 8 * P2 + +#define cmsub38_p384(P0,P1,P2) \ + ldp x0, x1, [P2]; \ + mov x6, #0x00000000ffffffff; \ + subs x6, x6, x0; \ + mov x7, #0xffffffff00000000; \ + sbcs x7, x7, x1; \ + ldp x0, x1, [P2+16]; \ + mov x8, #0xfffffffffffffffe; \ + sbcs x8, x8, x0; \ + mov x13, #0xffffffffffffffff; \ + sbcs x9, x13, x1; \ + ldp x0, x1, [P2+32]; \ + sbcs x10, x13, x0; \ + sbc x11, x13, x1; \ + lsl x0, x6, #3; \ + extr x1, x7, x6, #61; \ + extr x2, x8, x7, #61; \ + extr x3, x9, x8, #61; \ + extr x4, x10, x9, #61; \ + extr x5, x11, x10, #61; \ + lsr x6, x11, #61; \ + add x6, x6, #1; \ + ldp x8, x9, [P1]; \ + ldp x10, x11, [P1+16]; \ + ldp x12, x13, [P1+32]; \ + mov x14, 3; \ + mul x15, x14, x8; \ + umulh x8, x14, x8; \ + adds x0, x0, x15; \ + mul x15, x14, x9; \ + umulh x9, x14, x9; \ + adcs x1, x1, x15; \ + mul x15, x14, x10; \ + umulh x10, x14, x10; \ + adcs x2, x2, x15; \ + mul x15, x14, x11; \ + umulh x11, x14, x11; \ + adcs x3, x3, x15; \ + mul x15, x14, x12; \ + umulh x12, x14, x12; \ + adcs x4, x4, x15; \ + mul x15, x14, x13; \ + umulh x13, x14, x13; \ + adcs x5, x5, x15; \ + adc x6, x6, xzr; \ + adds x1, x1, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x10; \ + adcs x4, x4, x11; \ + adcs x5, x5, x12; \ + adcs x6, x6, x13; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x6, cc; \ + mov x7, #0xffffffff; \ + and x7, x7, x6; \ + adds x0, x0, x7; \ + eor x7, x7, x6; \ + adcs x1, x1, x7; \ + mov x7, #0xfffffffffffffffe; \ + and x7, x7, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x6; 
\ + adcs x4, x4, x6; \ + adc x5, x5, x6; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +S2N_BN_SYMBOL(p384_montjdouble): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p384(z2,z_1) + montsqr_p384(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + weakadd_p384(t1,x_1,z2) + sub_p384(t2,x_1,z2) + montmul_p384(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p384(t1,y_1,z_1) + montsqr_p384(x4p,x2p) + montmul_p384(xy2,x_1,y2) + +// t2 = (y + z)^2 + + montsqr_p384(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p384(d,12,xy2,9,x4p) + sub_p384(t1,t2,z2) + +// y4 = y^4 + + montsqr_p384(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p384(z_3,t1,y2) + montmul_p384(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p384(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p384(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p384/p384_montjmixadd.S b/arm/p384/p384_montjmixadd.S new file mode 100644 index 0000000000..f7467be289 --- /dev/null +++ b/arm/p384/p384_montjmixadd.S @@ -0,0 +1,884 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. 
This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be 1 (2^384 mod p_384 in Montgomery form). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1
sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; \ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul 
x11, x4, x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, 
xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ + subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ + sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + 
adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, #0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + 
adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, 
x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant, used when the result only feeds other muls +// whose other argument is fully reduced (which is always safe). With a +// Karatsuba-based Montgomery mul we would not even *need* that restriction; +// NOTE(review): montmul_p384 here follows the _alt form - confirm it applies.
+ +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs 
x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + 
mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + mov x14, #-4294967295; \ + mov x15, #4294967295; \ + csel x14, x14, xzr, cs; \ + csel x15, x15, xzr, cs; \ + cset x16, cs; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, xzr; \ + adcs x12, x12, xzr; \ + adc x13, x13, xzr; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjmixadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(zp2,z_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,zp2,x_2) + montmul_p384(y2a,zp2,y2a) + + sub_p384(xd,x2a,x_1) + sub_p384(yd,y2a,y_1) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(x_3,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(z_3,xd,z_1) + + sub_p384(x_3,x_3,zzx2) + + sub_p384(t2,zzx1,x_3) + + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + sub_p384(y_3,t2,t1) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p384/p384_montjadd.S b/x86_att/p384/p384_montjadd.S new file mode 100644 index 0000000000..e550f38609 --- /dev/null +++ b/x86_att/p384/p384_montjadd.S @@ -0,0 +1,955 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjadd +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rcx = p2, +// which needs to be set up explicitly before use + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rcx) +#define y_2 NUMSIZE(%rcx) +#define z_2 (2*NUMSIZE)(%rcx) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// In one place it's convenient to use another register +// since the squaring function overwrites %rcx + +#define z_2_alt (2*NUMSIZE)(%rsi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) + +#define 
y1a (NUMSIZE*6)(%rsp) + +// Temporaries for the actual input pointers + +#define input_x (NUMSIZE*7)(%rsp) +#define input_y (NUMSIZE*7+8)(%rsp) +#define input_z (NUMSIZE*7+16)(%rsp) + +#define NSPACE (NUMSIZE*7+24) + +// Corresponds exactly to bignum_montmul_p384 + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rdx ; \ + xorl %r15d, %r15d ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + addq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + mulxq 0x20+P1, %rbx, %r13 ; \ + adcq %rbx, %r12 ; \ + mulxq 0x28+P1, %rbx, %r14 ; \ + adcq %rbx, %r13 ; \ + adcq %r15, %r14 ; \ + movq %r8, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r8, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r8, %rbx ; \ + adcq %r8, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r8, %r15 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r8, %r8 ; \ + movq %r9, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r9, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r9, %rbx ; \ + adcq %r9, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbp, %r12 ; \ + 
sbbq $0x0, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adoxq %r9, %r8 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r15 ; \ + adcq %rbx, %r8 ; \ + adcq %r9, %r9 ; \ + movq %r10, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r10, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r10, %rbx ; \ + adcq %r10, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rbx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r10d, %r10d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + adoxq %r10, %r9 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %r10, %r10 ; \ + movq %r11, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r11, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r11, %rbx ; \ + adcq %r11, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq %rbp, %r14 ; 
\ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rdx ; \ + xorl %r11d, %r11d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + adoxq %r11, %r10 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r12, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r12, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r12, %rbx ; \ + adcq %r12, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + sbbq %rbx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rdx ; \ + xorl %r12d, %r12d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + adoxq %r12, %r11 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %r12, %r12 ; \ + movq %r13, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r13, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r13, %rbx ; \ + adcq %r13, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rbx, %r15 ; \ + sbbq %rbp, 
%r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, %edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovne %rax, %r14 ; \ + cmovne %rbx, %r15 ; \ + cmovne %rcx, %r8 ; \ + cmovne %rdx, %r9 ; \ + cmovne %rbp, %r10 ; \ + cmovne %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384 + +#define montsqr_p384(P0,P1) \ + movq P1, %rdx ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + mulxq 0x28+P1, %r13, %r14 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x20+P1, %r15, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x10+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x8+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %rcx ; \ + adcq %rbp, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x20+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rdx ; \ + adcxq %rax, %r15 ; \ + adoxq %rdx, %rcx ; \ + movq 0x28+P1, %rdx ; \ + mulxq 0x20+P1, %rbx, %rbp ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %rcx ; \ + adoxq %rdx, %rbx ; \ + 
movl $0x0, %eax ; \ + adcxq %rax, %rbx ; \ + adoxq %rax, %rbp ; \ + adcq %rax, %rbp ; \ + xorq %rax, %rax ; \ + movq P1, %rdx ; \ + mulxq P1, %r8, %rax ; \ + adcxq %r9, %r9 ; \ + adoxq %rax, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %r15, %r15 ; \ + adoxq %rdx, %r15 ; \ + movq 0x20+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %rcx, %rcx ; \ + adoxq %rax, %rcx ; \ + adcxq %rbx, %rbx ; \ + adoxq %rdx, %rbx ; \ + movq 0x28+P1, %rdx ; \ + mulxq %rdx, %rax, %rsi ; \ + adcxq %rbp, %rbp ; \ + adoxq %rax, %rbp ; \ + movl $0x0, %eax ; \ + adcxq %rax, %rsi ; \ + adoxq %rax, %rsi ; \ + movq %rbx, P0 ; \ + movq %r8, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r8, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r8, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r8 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r8 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r9 ; \ + sbbq %r8, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rdx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r9, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r9, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r9 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r9 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r10 ; \ + sbbq %r9, %r11 ; \ + sbbq %rbx, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rdx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r10, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r10, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, 
%r10 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r10 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r11 ; \ + sbbq %r10, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rdx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r11, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r11, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r11 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r11 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r12 ; \ + sbbq %r11, %r13 ; \ + sbbq %rbx, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rdx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r12, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r12, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r12 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r12 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r13 ; \ + sbbq %r12, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rdx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r13, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r13, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r13 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r13 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r8 ; \ + sbbq %r13, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rdx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rsi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + 
adcq %rsi, %r13 ; \ + adcq $0x0, %r8 ; \ + cmovne %rax, %r14 ; \ + cmovne %r9, %r15 ; \ + cmovne %r10, %rcx ; \ + cmovne %r11, %rbx ; \ + cmovne %r12, %rbp ; \ + cmovne %r13, %rsi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rsi, 0x28+P0 + +// Almost-Montgomery variant, used when the result is an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_p384(P0,P1) \ + movq P1, %rdx ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + mulxq 0x28+P1, %r13, %r14 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x20+P1, %r15, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x10+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x8+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %rcx ; \ + adcq %rbp, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x20+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rdx ; \ + adcxq %rax, %r15 ; \ + adoxq %rdx, %rcx ; \ + movq 0x28+P1, %rdx ; \ + mulxq 0x20+P1, %rbx, %rbp ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %rcx ; \ + adoxq %rdx, %rbx ; \ + movl $0x0, %eax ; \ + adcxq %rax, %rbx ; \ + adoxq %rax, %rbp ; \ + adcq %rax, %rbp ; \ + xorq %rax, %rax ; \ + movq P1, %rdx ; \ + mulxq P1, %r8, %rax ; \ + adcxq %r9, %r9 ; \ + adoxq %rax, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, 
%r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %r15, %r15 ; \ + adoxq %rdx, %r15 ; \ + movq 0x20+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %rcx, %rcx ; \ + adoxq %rax, %rcx ; \ + adcxq %rbx, %rbx ; \ + adoxq %rdx, %rbx ; \ + movq 0x28+P1, %rdx ; \ + mulxq %rdx, %rax, %rsi ; \ + adcxq %rbp, %rbp ; \ + adoxq %rax, %rbp ; \ + movl $0x0, %eax ; \ + adcxq %rax, %rsi ; \ + adoxq %rax, %rsi ; \ + movq %rbx, P0 ; \ + movq %r8, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r8, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r8, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r8 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r8 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r9 ; \ + sbbq %r8, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rdx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r9, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r9, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r9 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r9 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r10 ; \ + sbbq %r9, %r11 ; \ + sbbq %rbx, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rdx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r10, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r10, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r10 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r10 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r11 ; \ + sbbq %r10, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rdx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rdx ; \ + shlq $0x20, 
%rdx ; \ + addq %r11, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r11, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r11 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r11 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r12 ; \ + sbbq %r11, %r13 ; \ + sbbq %rbx, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rdx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r12, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r12, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r12 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r12 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r13 ; \ + sbbq %r12, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rdx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r13, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r13, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r13 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r13 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r8 ; \ + sbbq %r13, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rdx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rsi ; \ + movl $0x0, %r8d ; \ + movq $0xffffffff00000001, %rax ; \ + movl $0xffffffff, %r9d ; \ + movl $0x1, %r10d ; \ + cmovnc %r8, %rax ; \ + cmovnc %r8, %r9 ; \ + cmovnc %r8, %r10 ; \ + addq %rax, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r8, %rbx ; \ + adcq %r8, %rbp ; \ + adcq %r8, %rsi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rsi, 0x28+P0 + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + 
sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %esi ; \ + andq %rsi, %rcx ; \ + xorq %rsi, %rsi ; \ + subq %rcx, %rsi ; \ + subq %rsi, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq %rsi, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +S2N_BN_SYMBOL(p384_montjadd): + +#if WINDOWS_ABI + pushq %rdi // callee-saved in the Microsoft x64 ABI + pushq %rsi // callee-saved in the Microsoft x64 ABI + movq %rcx, %rdi // p3 + movq %rdx, %rsi // p1 + movq %r8, %rdx // p2 +#endif + +// Save registers and make room on stack for temporary variables +// Put the input arguments in non-volatile places on the stack + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, input_z // p3 (output point) + movq %rsi, input_x // p1 + movq %rdx, input_y // p2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_p384(z1sq,z_1) // z1sq = z_1^2 + movq input_y, %rsi // %rsi = p2 so that z_2_alt addresses z_2 + amontsqr_p384(z2sq,z_2_alt) // z2sq = z_2^2 + + movq input_x, %rsi // %rsi = p1 + movq input_y, %rcx // %rcx = p2 + montmul_p384(y1a,z_2,y_1) // y1a = z_2 * y_1 + movq input_x, %rsi + movq input_y, %rcx + montmul_p384(y2a,z_1,y_2) // y2a = z_1 * y_2 + + movq input_y, %rcx + montmul_p384(x2a,z1sq,x_2) // x2a = z_1^2 * x_2 + movq input_x, %rsi + montmul_p384(x1a,z2sq,x_1) // x1a = z_2^2 * x_1 + montmul_p384(y2a,z1sq,y2a) // y2a = z_1^3 * y_2 + montmul_p384(y1a,z2sq,y1a) // y1a = z_2^3 * y_1 + + sub_p384(xd,x2a,x1a) // xd = x2a - x1a + sub_p384(yd,y2a,y1a) // yd = y2a - y1a + + amontsqr_p384(zz,xd) // zz = xd^2 + montsqr_p384(ww,yd) // ww = yd^2 + + montmul_p384(zzx1,zz,x1a) // zzx1 = zz * x1a + montmul_p384(zzx2,zz,x2a) // zzx2 = zz * x2a + + movq input_z, %rdi // %rdi = p3 + sub_p384(x_3,ww,zzx1) // x_3 = ww - zzx1 + sub_p384(t1,zzx2,zzx1) // t1 = zzx2 - zzx1 + + movq input_x, %rsi + montmul_p384(xd,xd,z_1) // xd = xd * z_1 + + movq input_z, %rdi + sub_p384(x_3,x_3,zzx2) // x_3 = ww - zzx1 - zzx2 + + movq input_z, %rdi + sub_p384(t2,zzx1,x_3) // t2 = zzx1 - x_3 + + montmul_p384(t1,t1,y1a) // t1 = t1 * y1a + + movq 
input_z, %rdi + movq input_y, %rcx + montmul_p384(z_3,xd,z_2) // z_3 = xd * z_2 + montmul_p384(t2,yd,t2) // t2 = yd * t2 + + movq input_z, %rdi + sub_p384(y_3,t2,t1) // y_3 = t2 - t1 + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p384/p384_montjdouble.S b/x86_att/p384/p384_montjdouble.S new file mode 100644 index 0000000000..d7de785797 --- /dev/null +++ b/x86_att/p384/p384_montjdouble.S @@ -0,0 +1,1014 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjdouble +// (uint64_t p3[static 18],uint64_t p1[static 18]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1. The latter stays true +// but montsqr below modifies %rdi as well. Thus, we need +// to save %rdi and restore it before the writes to outputs. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y2 (NUMSIZE*1)(%rsp) +#define x2p (NUMSIZE*2)(%rsp) +#define xy2 (NUMSIZE*3)(%rsp) + +#define y4 (NUMSIZE*4)(%rsp) +#define t2 (NUMSIZE*4)(%rsp) + +#define dx2 (NUMSIZE*5)(%rsp) +#define t1 (NUMSIZE*5)(%rsp) + +#define d (NUMSIZE*6)(%rsp) +#define x4p (NUMSIZE*6)(%rsp) + +// Safe place for pointer to the output + +#define input_z (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_montmul_p384 + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rdx ; \ + xorl %r15d, %r15d ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + addq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + mulxq 0x20+P1, %rbx, %r13 ; \ + adcq %rbx, %r12 ; \ + mulxq 0x28+P1, %rbx, %r14 ; \ + adcq %rbx, %r13 ; \ + adcq %r15, %r14 ; \ + movq %r8, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r8, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r8, %rbx ; \ + adcq %r8, 
%rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r8, %r15 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r8, %r8 ; \ + movq %r9, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r9, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r9, %rbx ; \ + adcq %r9, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbp, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adoxq %r9, %r8 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r15 ; \ + adcq %rbx, %r8 ; \ + adcq %r9, %r9 ; \ + movq %r10, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r10, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r10, %rbx ; \ + adcq %r10, 
%rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rbx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r10d, %r10d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + adoxq %r10, %r9 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %r10, %r10 ; \ + movq %r11, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r11, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r11, %rbx ; \ + adcq %r11, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq %rbp, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rdx ; \ + xorl %r11d, %r11d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + adoxq %r11, %r10 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r12, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r12, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r12, %rbx ; \ + adcq 
%r12, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + sbbq %rbx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rdx ; \ + xorl %r12d, %r12d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + adoxq %r12, %r11 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %r12, %r12 ; \ + movq %r13, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r13, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r13, %rbx ; \ + adcq %r13, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rbx, %r15 ; \ + sbbq %rbp, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, %edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovne %rax, %r14 ; \ + cmovne %rbx, %r15 ; \ + cmovne %rcx, %r8 ; \ + cmovne %rdx, %r9 ; \ + cmovne %rbp, %r10 ; \ + cmovne %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384 + +#define montsqr_p384(P0,P1) \ + movq P1, %rdx ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + mulxq 
0x28+P1, %r13, %r14 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x20+P1, %r15, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x10+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x8+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %rcx ; \ + adcq %rbp, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x20+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rdx ; \ + adcxq %rax, %r15 ; \ + adoxq %rdx, %rcx ; \ + movq 0x28+P1, %rdx ; \ + mulxq 0x20+P1, %rbx, %rbp ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %rcx ; \ + adoxq %rdx, %rbx ; \ + movl $0x0, %eax ; \ + adcxq %rax, %rbx ; \ + adoxq %rax, %rbp ; \ + adcq %rax, %rbp ; \ + xorq %rax, %rax ; \ + movq P1, %rdx ; \ + mulxq P1, %r8, %rax ; \ + adcxq %r9, %r9 ; \ + adoxq %rax, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %r15, %r15 ; \ + adoxq %rdx, %r15 ; \ + movq 0x20+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %rcx, %rcx ; \ + adoxq %rax, %rcx ; \ + adcxq %rbx, %rbx ; \ + adoxq %rdx, %rbx ; \ + movq 0x28+P1, %rdx ; \ + mulxq %rdx, %rax, %rdi ; \ + adcxq %rbp, %rbp ; \ + adoxq %rax, %rbp ; 
\ + movl $0x0, %eax ; \ + adcxq %rax, %rdi ; \ + adoxq %rax, %rdi ; \ + movq %rbx, P0 ; \ + movq %r8, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r8, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r8, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r8 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r8 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r9 ; \ + sbbq %r8, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rdx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r9, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r9, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r9 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r9 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r10 ; \ + sbbq %r9, %r11 ; \ + sbbq %rbx, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rdx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r10, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r10, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r10 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r10 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r11 ; \ + sbbq %r10, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rdx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r11, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r11, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r11 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r11 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r12 ; \ + sbbq %r11, %r13 ; \ + sbbq %rbx, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rdx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r12, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r12, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r12 ; \ + addq 
%rbx, %rax ; \ + adcq %rdx, %r12 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r13 ; \ + sbbq %r12, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rdx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r13, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r13, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r13 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r13 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r8 ; \ + sbbq %r13, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rdx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rdi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq %rdi, %r13 ; \ + adcq $0x0, %r8 ; \ + cmovne %rax, %r14 ; \ + cmovne %r9, %r15 ; \ + cmovne %r10, %rcx ; \ + cmovne %r11, %rbx ; \ + cmovne %r12, %rbp ; \ + cmovne %r13, %rdi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rdi, 0x28+P0 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %ebx ; \ + andq %rbx, %rcx ; \ + xorq %rbx, %rbx ; \ + subq %rcx, %rbx ; \ + subq %rbx, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq 
%rbx, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// Simplified bignum_add_p384, without carry chain suspension + +#define add_p384(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + adcq 0x28+P2, %r11 ; \ + movl $0x0, %edx ; \ + adcq %rdx, %rdx ; \ + movq $0xffffffff00000001, %rbp ; \ + addq %rbp, %rax ; \ + movl $0xffffffff, %ebp ; \ + adcq %rbp, %rcx ; \ + adcq $0x1, %r8 ; \ + adcq $0x0, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq $0x0, %r11 ; \ + adcq $0xffffffffffffffff, %rdx ; \ + movl $1, %ebx ; \ + andq %rdx, %rbx ; \ + andq %rbp, %rdx ; \ + xorq %rbp, %rbp ; \ + subq %rdx, %rbp ; \ + subq %rbp, %rax ; \ + movq %rax, P0 ; \ + sbbq %rdx, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq %rbx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// P0 = 4 * P1 - P2 + +#define cmsub41_p384(P0,P1,P2) \ + movq 40+P1, %rdx ; \ + movq %rdx, %r13 ; \ + shrq $62, %rdx ; \ + movq 32+P1, %r12 ; \ + shldq $2, %r12, %r13 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + addq $1, %rdx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq 0x20+P2, %r12 ; \ + sbbq 0x28+P2, %r13 ; \ + sbbq $0, %rdx ; \ + xorq %rcx, %rcx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + movl $0xffffffff, %eax ; \ + mulxq %rax, %rax, %rcx ; 
\ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + adcxq %rdx, %r10 ; \ + movl $0x0, %eax ; \ + movl $0x0, %ecx ; \ + adoxq %rax, %rax ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + adcq %rcx, %r13 ; \ + adcq %rcx, %rcx ; \ + subq $0x1, %rcx ; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +// P0 = C * P1 - D * P2 + +#define cmsub_p384(P0,C,P1,D,P2) \ + movq $0x00000000ffffffff, %r8 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xfffffffffffffffe, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0xffffffffffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + movq $0xffffffffffffffff, %r13 ; \ + sbbq 40+P2, %r13 ; \ + movq $D, %rdx ; \ + mulxq %r8, %r8, %rax ; \ + mulxq %r9, %r9, %rcx ; \ + addq %rax, %r9 ; \ + mulxq %r10, %r10, %rax ; \ + adcq %rcx, %r10 ; \ + mulxq %r11, %r11, %rcx ; \ + adcq %rax, %r11 ; \ + mulxq %r12, %r12, %rax ; \ + adcq %rcx, %r12 ; \ + mulxq %r13, %r13, %r14 ; \ + adcq %rax, %r13 ; \ + adcq $1, %r14 ; \ + xorl %ecx, %ecx ; \ + movq $C, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 16+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 24+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 32+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 40+P1, %rax, %rdx ; \ + adcxq %rax, %r13 ; \ + adoxq %r14, %rdx ; \ + adcxq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq 
%rcx, %r9 ; \ + movl $0xffffffff, %eax ; \ + mulxq %rax, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + adcxq %rdx, %r10 ; \ + movl $0x0, %eax ; \ + movl $0x0, %ecx ; \ + adoxq %rax, %rax ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + adcq %rcx, %r13 ; \ + adcq %rcx, %rcx ; \ + subq $0x1, %rcx ; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +// A weak version of add that only guarantees sum in 6 digits + +#define weakadd_p384(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + adcq 0x28+P2, %r11 ; \ + sbbq %rdx, %rdx ; \ + movl $1, %ebx ; \ + andq %rdx, %rbx ; \ + movl $0xffffffff, %ebp ; \ + andq %rbp, %rdx ; \ + xorq %rbp, %rbp ; \ + subq %rdx, %rbp ; \ + addq %rbp, %rax ; \ + movq %rax, P0 ; \ + adcq %rdx, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %rbx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + adcq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// P0 = 3 * P1 - 8 * P2 + +#define cmsub38_p384(P0,P1,P2) \ + movq $0x00000000ffffffff, %r8 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xfffffffffffffffe, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0xffffffffffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + movq $0xffffffffffffffff, %r13 ; \ + sbbq 40+P2, %r13 ; \ + movq %r13, %r14 ; \ + shrq $61, %r14 ; \ + shldq $3, %r12, %r13 ; \ 
+ shldq $3, %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + addq $1, %r14 ; \ + xorl %ecx, %ecx ; \ + movq $3, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 16+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 24+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 32+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 40+P1, %rax, %rdx ; \ + adcxq %rax, %r13 ; \ + adoxq %r14, %rdx ; \ + adcxq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + movl $0xffffffff, %eax ; \ + mulxq %rax, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + adcxq %rdx, %r10 ; \ + movl $0x0, %eax ; \ + movl $0x0, %ecx ; \ + adoxq %rax, %rax ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + adcq %rcx, %r13 ; \ + adcq %rcx, %rcx ; \ + subq $0x1, %rcx ; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +S2N_BN_SYMBOL(p384_montjdouble): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables +// Save the output pointer %rdi which gets overwritten in earlier +// operations before it is used.
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, input_z + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p384(z2,z_1) + montsqr_p384(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + weakadd_p384(t1,x_1,z2) + sub_p384(t2,x_1,z2) + montmul_p384(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p384(t1,y_1,z_1) + montsqr_p384(x4p,x2p) + montmul_p384(xy2,x_1,y2) + +// t2 = (y + z)^2 + + montsqr_p384(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p384(d,12,xy2,9,x4p) + sub_p384(t1,t2,z2) + +// y4 = y^4 + + montsqr_p384(y4,y2) + +// Restore the output pointer to write to x_3, y_3 and z_3. + + movq input_z, %rdi + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p384(z_3,t1,y2) + montmul_p384(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p384(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p384(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p384/p384_montjmixadd.S b/x86_att/p384/p384_montjmixadd.S new file mode 100644 index 0000000000..6749209eb4 --- /dev/null +++ b/x86_att/p384/p384_montjmixadd.S @@ -0,0 +1,941 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. 
See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rcx = p2, +// which needs to be set up explicitly before use + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rcx) +#define y_2 NUMSIZE(%rcx) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) + +// 
Temporaries for the actual input pointers + +#define input_x (NUMSIZE*6)(%rsp) +#define input_y (NUMSIZE*6+8)(%rsp) +#define input_z (NUMSIZE*6+16)(%rsp) + +#define NSPACE (NUMSIZE*6+24) + +// Corresponds exactly to bignum_montmul_p384 + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rdx ; \ + xorl %r15d, %r15d ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + addq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + mulxq 0x20+P1, %rbx, %r13 ; \ + adcq %rbx, %r12 ; \ + mulxq 0x28+P1, %rbx, %r14 ; \ + adcq %rbx, %r13 ; \ + adcq %r15, %r14 ; \ + movq %r8, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r8, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r8, %rbx ; \ + adcq %r8, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r8, %r15 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r8, %r8 ; \ + movq %r9, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r9, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r9, %rbx ; \ + adcq %r9, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbp, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq 
$0x0, %r14 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adoxq %r9, %r8 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r15 ; \ + adcq %rbx, %r8 ; \ + adcq %r9, %r9 ; \ + movq %r10, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r10, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r10, %rbx ; \ + adcq %r10, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rbx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r10d, %r10d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + adoxq %r10, %r9 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %r10, %r10 ; \ + movq %r11, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r11, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r11, %rbx ; \ + adcq %r11, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq %rbp, %r14 ; \ + sbbq $0x0, %r15 ; \ + 
sbbq $0x0, %r8 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rdx ; \ + xorl %r11d, %r11d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + adoxq %r11, %r10 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r12, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r12, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r12, %rbx ; \ + adcq %r12, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + sbbq %rbx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rdx ; \ + xorl %r12d, %r12d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + adoxq %r12, %r11 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcq %rax, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %r12, %r12 ; \ + movq %r13, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r13, %rdx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rbx, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %r13, %rbx ; \ + adcq %r13, %rax ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rbx, %r15 ; \ + sbbq %rbp, %r8 ; \ + sbbq $0x0, %r9 ; 
\ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rdx ; \ + addq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, %edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovne %rax, %r14 ; \ + cmovne %rbx, %r15 ; \ + cmovne %rcx, %r8 ; \ + cmovne %rdx, %r9 ; \ + cmovne %rbp, %r10 ; \ + cmovne %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384 + +#define montsqr_p384(P0,P1) \ + movq P1, %rdx ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + mulxq 0x28+P1, %r13, %r14 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x20+P1, %r15, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x10+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x8+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %rcx ; \ + adcq %rbp, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x20+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rdx ; \ + adcxq %rax, %r15 ; \ + adoxq %rdx, %rcx ; \ + movq 0x28+P1, %rdx ; \ + mulxq 0x20+P1, %rbx, %rbp ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %rcx ; \ + adoxq %rdx, %rbx ; \ + movl $0x0, %eax ; \ + adcxq 
%rax, %rbx ; \ + adoxq %rax, %rbp ; \ + adcq %rax, %rbp ; \ + xorq %rax, %rax ; \ + movq P1, %rdx ; \ + mulxq P1, %r8, %rax ; \ + adcxq %r9, %r9 ; \ + adoxq %rax, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %r15, %r15 ; \ + adoxq %rdx, %r15 ; \ + movq 0x20+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %rcx, %rcx ; \ + adoxq %rax, %rcx ; \ + adcxq %rbx, %rbx ; \ + adoxq %rdx, %rbx ; \ + movq 0x28+P1, %rdx ; \ + mulxq %rdx, %rax, %rsi ; \ + adcxq %rbp, %rbp ; \ + adoxq %rax, %rbp ; \ + movl $0x0, %eax ; \ + adcxq %rax, %rsi ; \ + adoxq %rax, %rsi ; \ + movq %rbx, P0 ; \ + movq %r8, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r8, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r8, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r8 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r8 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r9 ; \ + sbbq %r8, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rdx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r9, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r9, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r9 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r9 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r10 ; \ + sbbq %r9, %r11 ; \ + sbbq %rbx, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rdx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r10, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r10, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r10 ; \ + addq %rbx, %rax 
; \ + adcq %rdx, %r10 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r11 ; \ + sbbq %r10, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rdx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r11, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r11, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r11 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r11 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r12 ; \ + sbbq %r11, %r13 ; \ + sbbq %rbx, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rdx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r12, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r12, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r12 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r12 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r13 ; \ + sbbq %r12, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rdx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r13, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r13, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r13 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r13 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r8 ; \ + sbbq %r13, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rdx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rsi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq %rsi, %r13 ; \ + adcq 
$0x0, %r8 ; \ + cmovne %rax, %r14 ; \ + cmovne %r9, %r15 ; \ + cmovne %r10, %rcx ; \ + cmovne %r11, %rbx ; \ + cmovne %r12, %rbp ; \ + cmovne %r13, %rsi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rsi, 0x28+P0 + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_p384(P0,P1) \ + movq P1, %rdx ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + mulxq 0x28+P1, %r13, %r14 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x20+P1, %r15, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x10+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x8+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %rcx ; \ + adcq %rbp, %rcx ; \ + xorl %ebp, %ebp ; \ + movq 0x20+P1, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rdx ; \ + adcxq %rax, %r15 ; \ + adoxq %rdx, %rcx ; \ + movq 0x28+P1, %rdx ; \ + mulxq 0x20+P1, %rbx, %rbp ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %rcx ; \ + adoxq %rdx, %rbx ; \ + movl $0x0, %eax ; \ + adcxq %rax, %rbx ; \ + adoxq %rax, %rbp ; \ + adcq %rax, %rbp ; \ + xorq %rax, %rax ; \ + movq P1, %rdx ; \ + mulxq P1, %r8, %rax ; \ + adcxq %r9, %r9 ; \ + adoxq %rax, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; 
\ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %r15, %r15 ; \ + adoxq %rdx, %r15 ; \ + movq 0x20+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %rcx, %rcx ; \ + adoxq %rax, %rcx ; \ + adcxq %rbx, %rbx ; \ + adoxq %rdx, %rbx ; \ + movq 0x28+P1, %rdx ; \ + mulxq %rdx, %rax, %rsi ; \ + adcxq %rbp, %rbp ; \ + adoxq %rax, %rbp ; \ + movl $0x0, %eax ; \ + adcxq %rax, %rsi ; \ + adoxq %rax, %rsi ; \ + movq %rbx, P0 ; \ + movq %r8, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r8, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r8, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r8 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r8 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r9 ; \ + sbbq %r8, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rdx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r9, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r9, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r9 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r9 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r10 ; \ + sbbq %r9, %r11 ; \ + sbbq %rbx, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rdx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r10, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r10, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r10 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r10 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r11 ; \ + sbbq %r10, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rdx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r11, %rdx ; \ + 
movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r11, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r11 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r11 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r12 ; \ + sbbq %r11, %r13 ; \ + sbbq %rbx, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rdx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r12, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r12, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r12 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r12 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r13 ; \ + sbbq %r12, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rdx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rdx ; \ + shlq $0x20, %rdx ; \ + addq %r13, %rdx ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %r13, %rax ; \ + movl $0xffffffff, %ebx ; \ + mulxq %rbx, %rbx, %r13 ; \ + addq %rbx, %rax ; \ + adcq %rdx, %r13 ; \ + movl $0x0, %ebx ; \ + adcq %rbx, %rbx ; \ + subq %rax, %r8 ; \ + sbbq %r13, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rdx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rsi ; \ + movl $0x0, %r8d ; \ + movq $0xffffffff00000001, %rax ; \ + movl $0xffffffff, %r9d ; \ + movl $0x1, %r10d ; \ + cmovnc %r8, %rax ; \ + cmovnc %r8, %r9 ; \ + cmovnc %r8, %r10 ; \ + addq %rax, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r8, %rbx ; \ + adcq %r8, %rbp ; \ + adcq %r8, %rsi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rsi, 0x28+P0 + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 
0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %esi ; \ + andq %rsi, %rcx ; \ + xorq %rsi, %rsi ; \ + subq %rcx, %rsi ; \ + subq %rsi, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq %rsi, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +S2N_BN_SYMBOL(p384_montjmixadd): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input arguments in non-volatile places on the stack + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, input_z + movq %rsi, input_x + movq %rdx, input_y + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(zp2,z_1) + + movq input_x, %rsi + movq input_y, %rcx + montmul_p384(y2a,z_1,y_2) + + movq input_y, %rcx + montmul_p384(x2a,zp2,x_2) + + montmul_p384(y2a,zp2,y2a) + + movq input_x, %rsi + sub_p384(xd,x2a,x_1) + movq input_x, %rsi + sub_p384(yd,y2a,y_1) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + movq input_x, %rsi + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + movq input_z, %rdi + sub_p384(x_3,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + movq input_z, %rdi + movq input_x, %rsi + montmul_p384(z_3,xd,z_1) + + movq input_z, %rdi + sub_p384(x_3,x_3,zzx2) + + movq input_z, %rdi + sub_p384(t2,zzx1,x_3) + + movq input_x, %rsi + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + movq input_z, %rdi + sub_p384(y_3,t2,t1) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq 
%r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif From 6fec512d151fcb73f20593881bfc9ba543b6f162 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 15 Jul 2022 21:36:04 -0700 Subject: [PATCH 2/3] Tweak ARM bignum_sqr_p521_alt to use fewer registers s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/57a43a3c6f4d29c822b1c226557ced539be575ef --- arm/p521/bignum_sqr_p521_alt.S | 68 ++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/arm/p521/bignum_sqr_p521_alt.S b/arm/p521/bignum_sqr_p521_alt.S index 6ba447f390..fe2201c00e 100644 --- a/arm/p521/bignum_sqr_p521_alt.S +++ b/arm/p521/bignum_sqr_p521_alt.S @@ -43,23 +43,23 @@ #define l x10 -#define u0 x11 -#define u1 x12 -#define u2 x13 -#define u3 x14 -#define u4 x15 -#define u5 x16 -#define u6 x17 -#define u7 x19 -#define u8 x20 -#define u9 x21 -#define u10 x22 -#define u11 x23 -#define u12 x24 -#define u13 x25 -#define u14 x26 -#define u15 x27 -#define u16 x29 +#define u0 x2 // The same as a0 +#define u1 x11 +#define u2 x12 +#define u3 x13 +#define u4 x14 +#define u5 x15 +#define u6 x16 +#define u7 x17 +#define u8 x19 +#define u9 x20 +#define u10 x21 +#define u11 x22 +#define u12 x23 +#define u13 x24 +#define u14 x25 +#define u15 x26 +#define u16 x4 // The same as a2 S2N_BN_SYMBOL(bignum_sqr_p521_alt): @@ -69,7 +69,6 @@ S2N_BN_SYMBOL(bignum_sqr_p521_alt): stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! - stp x27, x29, [sp, #-16]! 
// Load low 8 elements as [a7;a6;a5;a4;a3;a2;a1;a0], set up an initial // window [u8;u7;u6;u5;u4;u3;u2;u1] = 10 + 20 + 30 + 40 + 50 + 60 + 70 @@ -231,7 +230,6 @@ S2N_BN_SYMBOL(bignum_sqr_p521_alt): // Add the homogeneous terms 00 + 11 + 22 + 33 + 44 + 55 + 66 + 77 umulh l, a0, a0 - mul u0, a0, a0 adds u1, u1, l mul l, a1, a1 @@ -269,49 +267,58 @@ S2N_BN_SYMBOL(bignum_sqr_p521_alt): umulh l, a7, a7 adc u15, u15, l -// Now load in the top digit a8, and also set up its double and square +// Now load in the top digit a8, and immediately double the register ldr a8, [x, #64] - mul u16, a8, a8 add a8, a8, a8 -// Add a8 * [a7;...;a0] into the top of the buffer +// Add (2 * a8) * [a7;...;a0] into the top of the buffer +// At the end of the first chain we form u16 = a8 ^ 2. +// This needs us to shift right the modified a8 again but it saves a +// register, and the overall performance impact seems slightly positive. mul l, a8, a0 adds u8, u8, l - mul l, a8, a1 + umulh l, a8, a0 adcs u9, u9, l mul l, a8, a2 adcs u10, u10, l - mul l, a8, a3 + umulh l, a8, a2 adcs u11, u11, l mul l, a8, a4 adcs u12, u12, l - mul l, a8, a5 + umulh l, a8, a4 adcs u13, u13, l mul l, a8, a6 adcs u14, u14, l - mul l, a8, a7 + umulh l, a8, a6 adcs u15, u15, l + lsr u16, a8, #1 + mul u16, u16, u16 adc u16, u16, xzr - umulh l, a8, a0 + mul l, a8, a1 adds u9, u9, l umulh l, a8, a1 adcs u10, u10, l - umulh l, a8, a2 + mul l, a8, a3 adcs u11, u11, l umulh l, a8, a3 adcs u12, u12, l - umulh l, a8, a4 + mul l, a8, a5 adcs u13, u13, l umulh l, a8, a5 adcs u14, u14, l - umulh l, a8, a6 + mul l, a8, a7 adcs u15, u15, l umulh l, a8, a7 adc u16, u16, l +// Finally squeeze in the lowest mul. This didn't need to be involved +// in the addition chains and moreover lets us re-use u0 == a0 + + mul u0, a0, a0 + // Now we have the full product, which we consider as // 2^521 * h + l. 
Form h + l + 1 @@ -361,7 +368,6 @@ S2N_BN_SYMBOL(bignum_sqr_p521_alt): // Restore registers and return - ldp x27, x29, [sp], #16 ldp x25, x26, [sp], #16 ldp x23, x24, [sp], #16 ldp x21, x22, [sp], #16 From 7b51bdfd435c31d1c00b3100c567c0f912ebd509 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 21 Jul 2022 06:35:02 -0700 Subject: [PATCH 3/3] Add basic NIST P-521 point operations The same trio of a point doubling function, point addition function and point mixed addition function, this time for the P-521 curve, all using Jacobian coordinates, with input nondegeneracy assumed. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/047c0b1401610f9933a60ce0836143f9217ffa34 --- arm/p521/Makefile | 5 +- arm/p521/p521_jadd.S | 808 +++++++++++++++++++ arm/p521/p521_jdouble.S | 1470 +++++++++++++++++++++++++++++++++++ arm/p521/p521_jmixadd.S | 797 +++++++++++++++++++ x86_att/p521/p521_jadd.S | 765 ++++++++++++++++++ x86_att/p521/p521_jdouble.S | 1386 +++++++++++++++++++++++++++++++++ x86_att/p521/p521_jmixadd.S | 756 ++++++++++++++++++ 7 files changed, 5986 insertions(+), 1 deletion(-) create mode 100644 arm/p521/p521_jadd.S create mode 100644 arm/p521/p521_jdouble.S create mode 100644 arm/p521/p521_jmixadd.S create mode 100644 x86_att/p521/p521_jadd.S create mode 100644 x86_att/p521/p521_jdouble.S create mode 100644 x86_att/p521/p521_jmixadd.S diff --git a/arm/p521/Makefile b/arm/p521/Makefile index 921016115f..9ea36beb96 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -53,7 +53,10 @@ OBJ = bignum_add_p521.o \ bignum_sub_p521.o \ bignum_tolebytes_p521.o \ bignum_tomont_p521.o \ - bignum_triple_p521.o + bignum_triple_p521.o \ + p521_jadd.o \ + p521_jdouble.o \ + p521_jmixadd.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/arm/p521/p521_jadd.S b/arm/p521/p521_jadd.S new file mode 100644 index 0000000000..85e62e01e7 --- /dev/null +++ b/arm/p521/p521_jadd.S @@ -0,0 +1,808 @@ +/* + * Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jadd +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_521, that both z coordinates are nonzero and +// that neither p1 =~= p2 nor p1 =~= -p2, where "=~=" means "represents +// the same affine point as". 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ 
+ ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + 
adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, 
x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; 
\ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and 
x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, x2, x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ + adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ + umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + 
adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh 
x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; \ + adds x20, x20, x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, 
[P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(z1sq,z_1) + sqr_p521(z2sq,z_2) + + mul_p521(y1a,z_2,y_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,z1sq,x_2) + mul_p521(x1a,z2sq,x_1) + mul_p521(y2a,z1sq,y2a) + mul_p521(y1a,z2sq,y1a) + + sub_p521(xd,x2a,x1a) + sub_p521(yd,y2a,y1a) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x1a) + mul_p521(zzx2,zz,x2a) + + sub_p521(x_3,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(xd,xd,z_1) + + sub_p521(x_3,x_3,zzx2) + + sub_p521(t2,zzx1,x_3) + + mul_p521(t1,t1,y1a) + mul_p521(z_3,xd,z_2) + mul_p521(t2,yd,t2) + + sub_p521(y_3,t2,t1) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p521/p521_jdouble.S b/arm/p521/p521_jdouble.S new file mode 100644 index 0000000000..242b492a81 --- /dev/null +++ b/arm/p521/p521_jdouble.S @@ -0,0 +1,1470 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). 
+ * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jdouble +// (uint64_t p3[static 27],uint64_t p1[static 27]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_521 and that the z coordinate is not zero. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 
16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + 
adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, 
x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, 
x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs 
x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, x2, x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ + adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ 
+ umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + 
adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; \ + adds x20, x20, x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, 
x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_add_p521 + +#define add_p521(P0,P1,P2) \ + cmp xzr, xzr; \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adcs x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + adcs x11, x11, x4; \ + adcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + adc x13, x13, x4; \ + subs x4, x13, #512; \ + csetm x4, hs; \ + sbcs x5, x5, xzr; \ + and x4, x4, #0x200; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, x4; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Weak multiplication not fully reducing + 
+#define weakmul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, 
x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + 
umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + 
adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adds x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, 
x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + and x13, x24, #0x1ff; \ + lsr x14, x21, #9; \ + adc x13, x13, x14; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) + +#define cmsub_p521(P0,C,P1,D,P2) \ + ldp x6, x7, [P1]; \ + mov x1, #(C); \ + mul x3, x1, x6; \ + mul x4, x1, x7; \ + umulh x6, x1, x6; \ + adds x4, x4, x6; \ + umulh x7, x1, x7; \ + ldp x8, x9, [P1+16]; \ + mul x5, x1, x8; \ + mul x6, x1, x9; \ + umulh x8, x1, x8; \ + adcs x5, x5, x7; \ + umulh x9, x1, x9; \ + adcs x6, x6, x8; \ + ldp x10, x11, [P1+32]; \ + mul x7, x1, x10; \ + mul x8, x1, x11; \ + umulh x10, x1, x10; \ + adcs x7, x7, x9; \ + umulh x11, x1, x11; \ + adcs x8, x8, x10; \ + ldp x12, x13, [P1+48]; \ + mul x9, x1, x12; \ + mul x10, x1, x13; \ + umulh x12, x1, x12; \ + adcs x9, x9, x11; \ + umulh x13, x1, x13; \ + adcs x10, x10, x12; \ + ldr x14, [P1+64]; \ + mul x11, x1, x14; \ + adc x11, x11, x13; \ + mov x1, #(D); \ + ldp x20, x21, [P2]; \ + mvn x20, x20; \ + mul x0, x1, x20; \ + umulh x20, x1, x20; \ + adds x3, x3, x0; \ + mvn x21, x21; \ + mul x0, x1, x21; \ + umulh x21, x1, x21; \ + adcs x4, x4, x0; \ + ldp x22, x23, [P2+16]; \ + mvn x22, x22; \ + mul x0, x1, x22; \ + umulh x22, x1, x22; \ + adcs x5, x5, x0; \ + mvn x23, x23; \ + mul x0, x1, x23; \ + umulh x23, x1, x23; \ + adcs x6, x6, x0; \ + ldp x17, x19, [P2+32]; \ + mvn x17, x17; \ + mul x0, x1, x17; \ + umulh x17, x1, x17; \ + adcs x7, x7, x0; \ + mvn x19, x19; \ + mul x0, x1, x19; \ + umulh x19, x1, x19; \ + adcs x8, x8, x0; \ + ldp x2, x16, [P2+48]; \ + mvn x2, x2; \ + mul x0, x1, x2; \ + umulh x2, x1, x2; \ + adcs x9, x9, x0; \ + mvn x16, x16; \ + mul x0, x1, x16; \ + umulh x16, x1, x16; \ + adcs x10, x10, x0; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; 
\ + mul x0, x1, x0; \ + adc x11, x11, x0; \ + adds x4, x4, x20; \ + adcs x5, x5, x21; \ + and x15, x4, x5; \ + adcs x6, x6, x22; \ + and x15, x15, x6; \ + adcs x7, x7, x23; \ + and x15, x15, x7; \ + adcs x8, x8, x17; \ + and x15, x15, x8; \ + adcs x9, x9, x19; \ + and x15, x15, x9; \ + adcs x10, x10, x2; \ + and x15, x15, x10; \ + adc x11, x11, x16; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) + +#define cmsub38_p521(P0,P1,P2) \ + ldp x6, x7, [P1]; \ + lsl x3, x6, #1; \ + adds x3, x3, x6; \ + extr x4, x7, x6, #63; \ + adcs x4, x4, x7; \ + ldp x8, x9, [P1+16]; \ + extr x5, x8, x7, #63; \ + adcs x5, x5, x8; \ + extr x6, x9, x8, #63; \ + adcs x6, x6, x9; \ + ldp x10, x11, [P1+32]; \ + extr x7, x10, x9, #63; \ + adcs x7, x7, x10; \ + extr x8, x11, x10, #63; \ + adcs x8, x8, x11; \ + ldp x12, x13, [P1+48]; \ + extr x9, x12, x11, #63; \ + adcs x9, x9, x12; \ + extr x10, x13, x12, #63; \ + adcs x10, x10, x13; \ + ldr x14, [P1+64]; \ + extr x11, x14, x13, #63; \ + adc x11, x11, x14; \ + ldp x20, x21, [P2]; \ + mvn x20, x20; \ + lsl x0, x20, #3; \ + adds x3, x3, x0; \ + mvn x21, x21; \ + extr x0, x21, x20, #61; \ + adcs x4, x4, x0; \ + ldp x22, x23, [P2+16]; \ + mvn x22, x22; \ + extr x0, x22, x21, #61; \ + adcs x5, x5, x0; \ + and x15, x4, x5; \ + mvn x23, x23; \ + extr x0, x23, x22, #61; \ + adcs x6, x6, x0; \ + and x15, x15, x6; \ + ldp x20, x21, [P2+32]; \ + mvn x20, x20; \ + extr x0, x20, x23, #61; \ + adcs x7, x7, x0; \ + and x15, x15, x7; \ + mvn x21, x21; \ + extr x0, 
x21, x20, #61; \ + adcs x8, x8, x0; \ + and x15, x15, x8; \ + ldp x22, x23, [P2+48]; \ + mvn x22, x22; \ + extr x0, x22, x21, #61; \ + adcs x9, x9, x0; \ + and x15, x15, x9; \ + mvn x23, x23; \ + extr x0, x23, x22, #61; \ + adcs x10, x10, x0; \ + and x15, x15, x10; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; \ + extr x0, x0, x23, #61; \ + adc x11, x11, x0; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) + +#define cmsub41_p521(P0,P1,P2) \ + ldp x6, x7, [P1]; \ + lsl x3, x6, #2; \ + extr x4, x7, x6, #62; \ + ldp x8, x9, [P1+16]; \ + extr x5, x8, x7, #62; \ + extr x6, x9, x8, #62; \ + ldp x10, x11, [P1+32]; \ + extr x7, x10, x9, #62; \ + extr x8, x11, x10, #62; \ + ldp x12, x13, [P1+48]; \ + extr x9, x12, x11, #62; \ + extr x10, x13, x12, #62; \ + ldr x14, [P1+64]; \ + extr x11, x14, x13, #62; \ + ldp x0, x1, [P2]; \ + mvn x0, x0; \ + adds x3, x3, x0; \ + sbcs x4, x4, x1; \ + ldp x0, x1, [P2+16]; \ + sbcs x5, x5, x0; \ + and x15, x4, x5; \ + sbcs x6, x6, x1; \ + and x15, x15, x6; \ + ldp x0, x1, [P2+32]; \ + sbcs x7, x7, x0; \ + and x15, x15, x7; \ + sbcs x8, x8, x1; \ + and x15, x15, x8; \ + ldp x0, x1, [P2+48]; \ + sbcs x9, x9, x0; \ + and x15, x15, x9; \ + sbcs x10, x10, x1; \ + and x15, x15, x10; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; \ + adc x11, x11, x0; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ 
+ adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +S2N_BN_SYMBOL(p521_jdouble): + +// Save callee-saved registers x19-x28 and reserve NSPACE bytes of stack for temporaries + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Copy the argument pointers out of x0 and x1, which the field macros use as scratch + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + sqr_p521(z2,z_1) + sqr_p521(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + add_p521(t1,x_1,z2) + sub_p521(t2,x_1,z2) + mul_p521(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p521(t1,y_1,z_1) + sqr_p521(x4p,x2p) + weakmul_p521(xy2,x_1,y2) + +// t2 = (y + z)^2 + + sqr_p521(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p521(d,12,xy2,9,x4p) + sub_p521(t1,t2,z2) + +// y4 = y^4 + + sqr_p521(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p521(z_3,t1,y2) + weakmul_p521(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p521(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p521(y_3,dx2,y4) + +// Release the temporaries and restore x19-x28 in reverse order of saving + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p521/p521_jmixadd.S b/arm/p521/p521_jmixadd.S new file mode 100644 index 0000000000..6e8b46b00c --- /dev/null +++ b/arm/p521/p521_jmixadd.S @@ -0,0 +1,797 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). 
+ * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jmixadd +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_521, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 nor p1 =~= -p2, where "=~=" means "represents the same affine +// point as". 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, 
x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + 
adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset 
x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, 
x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to 
bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, x2, x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ + adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ + umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, 
x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, 
x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; \ + adds x20, x20, x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521: 9-word subtraction P1 - P2 (subs/sbcs chain), then a second sbcs chain that subtracts 1 if the first chain borrowed, with the top word masked to 9 bits (0x1ff) + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ +
sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jmixadd): + +// Save callee-saved regs x19-x28 and make room on stack (NSPACE bytes) for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments (x0, x1, x2) to stable places, since the field macros overwrite x0-x2 + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations writing x_3, y_3 and z_3 + + sqr_p521(zp2,z_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,zp2,x_2) + mul_p521(y2a,zp2,y2a) + + sub_p521(xd,x2a,x_1) + sub_p521(yd,y2a,y_1) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x_1) + mul_p521(zzx2,zz,x2a) + + sub_p521(x_3,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(z_3,xd,z_1) + + sub_p521(x_3,x_3,zzx2) + + sub_p521(t2,zzx1,x_3) + + mul_p521(t1,t1,y_1) + mul_p521(t2,yd,t2) + + sub_p521(y_3,t2,t1) + +// Restore stack and registers, in the reverse order of the saves above + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p521/p521_jadd.S b/x86_att/p521/p521_jadd.S new file mode 100644 index 0000000000..c1ac9a235f --- /dev/null +++ b/x86_att/p521/p521_jadd.S @@ -0,0 +1,765 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file.
This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jadd +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_521, that both z coordinates are nonzero and +// that neither p1 =~= p2 nor p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd) + .text + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// These are where they arrive except for input_y, initially in %rdx + +#define input_z %rdi +#define input_x %rsi +#define input_y %rcx + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_2 0(input_y) +#define y_2 NUMSIZE(input_y) +#define z_2 (2*NUMSIZE)(input_y) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr.
+// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define tmp (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*7+64) + +// Corresponds exactly to bignum_mul_p521 + +#define mul_p521(P0,P1,P2) \ + xorl %ebp, %ebp ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + movq %r8, 504(%rsp) ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + mulxq 0x20+P1, %rbx, %r13 ; \ + adcq %rbx, %r12 ; \ + mulxq 0x28+P1, %rbx, %r14 ; \ + adcq %rbx, %r13 ; \ + mulxq 0x30+P1, %rbx, %r15 ; \ + adcq %rbx, %r14 ; \ + mulxq 0x38+P1, %rbx, %r8 ; \ + adcq %rbx, %r15 ; \ + adcq %rbp, %r8 ; \ + movq 0x8+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + movq %r9, 512(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x38+P1, %rax, %r9 ; \ + adcxq %rax, %r8 ; \ + adoxq %rbp, %r9 ; \ + adcq %rbp, %r9 ; \ + movq 0x10+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movq %r10, 520(%rsp) ; \ + mulxq 
0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x38+P1, %rax, %r10 ; \ + adcxq %rax, %r9 ; \ + adoxq %rbp, %r10 ; \ + adcq %rbp, %r10 ; \ + movq 0x18+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq %r11, 528(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x38+P1, %rax, %r11 ; \ + adcxq %rax, %r10 ; \ + adoxq %rbp, %r11 ; \ + adcq %rbp, %r11 ; \ + movq 0x20+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq %r12, 536(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x38+P1, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbp, %r12 ; \ + adcq %rbp, %r12 ; \ + movq 0x28+P2, %rdx ; \ 
+ xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + movq %r13, 544(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x38+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rbp, %r13 ; \ + adcq %rbp, %r13 ; \ + movq 0x30+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + movq %r14, 552(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x38+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rbp, %r14 ; \ + adcq %rbp, %r14 ; \ + movq 0x38+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + movq %r15, 560(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, 
%r14 ; \ + mulxq 0x38+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rbp, %r15 ; \ + adcq %rbp, %r15 ; \ + movq 0x40+P1, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P2, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P2, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P2, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P2, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P2, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P2, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P2, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P2, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbp, %rbx ; \ + adcq %rbx, %rbp ; \ + movq 0x40+P2, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %rbp ; \ + mulxq 0x40+P1, %rax, %rbx ; \ + adcq %rax, %rbp ; \ + movq %r8, %rax ; \ + andq $0x1ff, %rax ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rbp, %r15 ; \ + shrq $0x9, %rbp ; \ + addq %rax, %rbp ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + 
adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rbp ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rbp ; \ + andq $0x1ff, %rbp ; \ + movq %rbp, 0x40+P0 + +// Corresponds exactly to bignum_sqr_p521 + +#define sqr_p521(P0,P1) \ + xorl %ebp, %ebp ; \ + movq P1, %rdx ; \ + mulxq 0x8+P1, %r9, %rax ; \ + movq %r9, 512(%rsp) ; \ + mulxq 0x10+P1, %r10, %rbx ; \ + adcxq %rax, %r10 ; \ + movq %r10, 520(%rsp) ; \ + mulxq 0x18+P1, %r11, %rax ; \ + adcxq %rbx, %r11 ; \ + mulxq 0x20+P1, %r12, %rbx ; \ + adcxq %rax, %r12 ; \ + mulxq 0x28+P1, %r13, %rax ; \ + adcxq %rbx, %r13 ; \ + mulxq 0x30+P1, %r14, %rbx ; \ + adcxq %rax, %r14 ; \ + mulxq 0x38+P1, %r15, %r8 ; \ + adcxq %rbx, %r15 ; \ + adcxq %rbp, %r8 ; \ + xorl %ebp, %ebp ; \ + movq 0x8+P1, %rdx ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq %r11, 528(%rsp) ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq %r12, 536(%rsp) ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x38+P1, %rax, %r9 ; \ + adcxq %rax, %r8 ; \ + adoxq %rbp, %r9 ; \ + movq 0x20+P1, %rdx ; \ + mulxq 0x28+P1, %rax, %r10 ; \ + adcxq %rax, %r9 ; \ + adoxq %rbp, %r10 ; \ + adcxq %rbp, %r10 ; \ + xorl %ebp, %ebp ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + movq %r13, 544(%rsp) ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ 
+ movq %r14, 552(%rsp) ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + movq 0x30+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %r11 ; \ + adcxq %rax, %r10 ; \ + adoxq %rbp, %r11 ; \ + mulxq 0x28+P1, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbp, %r12 ; \ + adcxq %rbp, %r12 ; \ + xorl %ebp, %ebp ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + movq %r15, 560(%rsp) ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movq 0x38+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x28+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rbp, %r13 ; \ + mulxq 0x30+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rbp, %r14 ; \ + adcxq %rbp, %r14 ; \ + xorl %ebp, %ebp ; \ + movq P1, %rdx ; \ + mulxq %rdx, %rax, %rbx ; \ + movq %rax, 504(%rsp) ; \ + movq 512(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 512(%rsp) ; \ + movq 520(%rsp), %rax ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 520(%rsp) ; \ + movq 528(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 528(%rsp) ; \ + movq 536(%rsp), %rax ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 536(%rsp) ; \ + movq 544(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 544(%rsp) ; \ + movq 552(%rsp), %rax ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 552(%rsp) ; \ + movq 
560(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 560(%rsp) ; \ + movq 0x20+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r8, %r8 ; \ + adoxq %rdx, %r8 ; \ + adcxq %r9, %r9 ; \ + adoxq %rbx, %r9 ; \ + movq 0x28+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r10, %r10 ; \ + adoxq %rdx, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rbx, %r11 ; \ + movq 0x30+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r12, %r12 ; \ + adoxq %rdx, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rbx, %r13 ; \ + movq 0x38+P1, %rdx ; \ + mulxq %rdx, %rdx, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rdx, %r14 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %r15 ; \ + movq 0x40+P1, %rdx ; \ + movq %rdx, %rbp ; \ + imulq %rbp, %rbp ; \ + addq %rdx, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %rbp ; \ + adcq $0x0, %rbp ; \ + movq %r8, %rax ; \ + andq $0x1ff, %rax ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rbp, %r15 ; \ + shrq $0x9, %rbp ; \ + addq %rax, %rbp ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rbp ; \ + cmc; \ + sbbq $0x0, 
%r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rbp ; \ + andq $0x1ff, %rbp ; \ + movq %rbp, 0x40+P0 + +// Corresponds exactly to bignum_sub_p521: 9-word subtraction P1 - P2 (subq/sbbq chain), then a second sbbq $0 chain that propagates the final borrow through the result, with the top word masked to 9 bits (0x1ff) + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +S2N_BN_SYMBOL(p521_jadd): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save callee-saved registers (rbx, rbp, r12-r15) and make room on stack (NSPACE bytes) for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Move the input arguments to stable places (two are already there); the third leaves %rdx because the mul and sqr macros reload %rdx for mulx + + movq %rdx, input_y + +// Main code, just a sequence of basic field operations + + sqr_p521(z1sq,z_1) + sqr_p521(z2sq,z_2) + + mul_p521(y1a,z_2,y_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,z1sq,x_2) + mul_p521(x1a,z2sq,x_1) + mul_p521(y2a,z1sq,y2a) +
mul_p521(y1a,z2sq,y1a) + + sub_p521(xd,x2a,x1a) + sub_p521(yd,y2a,y1a) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x1a) + mul_p521(zzx2,zz,x2a) + + sub_p521(x_3,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(xd,xd,z_1) + + sub_p521(x_3,x_3,zzx2) + + sub_p521(t2,zzx1,x_3) + + mul_p521(t1,t1,y1a) + mul_p521(z_3,xd,z_2) + mul_p521(t2,yd,t2) + + sub_p521(y_3,t2,t1) + +// Restore stack and registers, popping in the reverse order of the pushes above + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p521/p521_jdouble.S b/x86_att/p521/p521_jdouble.S new file mode 100644 index 0000000000..16a5deeb04 --- /dev/null +++ b/x86_att/p521/p521_jdouble.S @@ -0,0 +1,1386 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jdouble +// (uint64_t p3[static 27],uint64_t p1[static 27]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_521 and that the z coordinate is not zero.
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble) + .text + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// This is actually where they come in anyway and they stay there. + +#define input_z %rdi +#define input_x %rsi + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr. +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y2 (NUMSIZE*1)(%rsp) +#define x2p (NUMSIZE*2)(%rsp) +#define xy2 (NUMSIZE*3)(%rsp) + +#define y4 (NUMSIZE*4)(%rsp) +#define t2 (NUMSIZE*4)(%rsp) + +#define dx2 (NUMSIZE*5)(%rsp) +#define t1 (NUMSIZE*5)(%rsp) + +#define d (NUMSIZE*6)(%rsp) +#define x4p (NUMSIZE*6)(%rsp) + +#define tmp (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*7+64) + +// Corresponds exactly to bignum_mul_p521 + +#define mul_p521(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + movq %r8, 504(%rsp) ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + mulxq 0x20+P1, %rbx, %r13 ; \ + adcq %rbx, %r12 ; \ + mulxq 0x28+P1, %rbx, %r14 ; \ + adcq %rbx, %r13 ; \ + mulxq 0x30+P1, %rbx, %r15 ; \ + adcq %rbx, %r14 ; \ + mulxq 0x38+P1, %rbx, %r8 ; \ + adcq %rbx, %r15 ; \ + adcq %rcx, %r8 ; \ + movq 0x8+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 
; \ + adoxq %rbx, %r10 ; \ + movq %r9, 512(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x38+P1, %rax, %r9 ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + adcq %rcx, %r9 ; \ + movq 0x10+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movq %r10, 520(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x38+P1, %rax, %r10 ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + adcq %rcx, %r10 ; \ + movq 0x18+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq %r11, 528(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x38+P1, %rax, %r11 ; \ + adcxq %rax, %r10 ; \ + adoxq 
%rcx, %r11 ; \ + adcq %rcx, %r11 ; \ + movq 0x20+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq %r12, 536(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x38+P1, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq 0x28+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + movq %r13, 544(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x38+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcq %rcx, %r13 ; \ + movq 0x30+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + movq %r14, 552(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 
0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x38+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + movq 0x38+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + movq %r15, 560(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x38+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcq %rcx, %r15 ; \ + movq 0x40+P1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P2, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P2, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P2, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P2, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P2, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P2, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P2, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P2, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rcx, %rbx ; \ + adcq %rbx, %rcx ; \ + movq 0x40+P2, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 
0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %rcx ; \ + mulxq 0x40+P1, %rax, %rbx ; \ + adcq %rax, %rcx ; \ + movq %r8, %rax ; \ + andq $0x1ff, %rax ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rcx, %r15 ; \ + shrq $0x9, %rcx ; \ + addq %rax, %rcx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rcx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rcx ; \ + andq $0x1ff, %rcx ; \ + movq %rcx, 0x40+P0 + +// Corresponds exactly to bignum_sqr_p521 + +#define sqr_p521(P0,P1) \ + xorl %ecx, %ecx ; \ + movq P1, %rdx ; \ + mulxq 0x8+P1, %r9, %rax ; \ + movq %r9, 512(%rsp) ; \ + mulxq 0x10+P1, %r10, %rbx ; \ + adcxq %rax, %r10 ; \ + movq %r10, 520(%rsp) ; \ + mulxq 0x18+P1, %r11, %rax ; \ + adcxq %rbx, %r11 ; \ + mulxq 0x20+P1, %r12, %rbx ; \ + adcxq %rax, %r12 ; \ + mulxq 0x28+P1, %r13, %rax ; \ + adcxq %rbx, %r13 ; \ + mulxq 0x30+P1, %r14, %rbx ; \ + adcxq %rax, %r14 ; \ + mulxq 0x38+P1, %r15, %r8 ; \ + adcxq %rbx, %r15 ; \ + adcxq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P1, %rdx ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq %r11, 528(%rsp) ; 
\ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq %r12, 536(%rsp) ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x38+P1, %rax, %r9 ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + movq 0x20+P1, %rdx ; \ + mulxq 0x28+P1, %rax, %r10 ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + adcxq %rcx, %r10 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + movq %r13, 544(%rsp) ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + movq %r14, 552(%rsp) ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + movq 0x30+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %r11 ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x28+P1, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + movq %r15, 560(%rsp) ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movq 0x38+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x28+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + mulxq 0x30+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq P1, %rdx ; \ + mulxq %rdx, %rax, %rbx ; \ + movq %rax, 504(%rsp) ; \ + movq 
512(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 512(%rsp) ; \ + movq 520(%rsp), %rax ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 520(%rsp) ; \ + movq 528(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 528(%rsp) ; \ + movq 536(%rsp), %rax ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 536(%rsp) ; \ + movq 544(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 544(%rsp) ; \ + movq 552(%rsp), %rax ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 552(%rsp) ; \ + movq 560(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 560(%rsp) ; \ + movq 0x20+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r8, %r8 ; \ + adoxq %rdx, %r8 ; \ + adcxq %r9, %r9 ; \ + adoxq %rbx, %r9 ; \ + movq 0x28+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r10, %r10 ; \ + adoxq %rdx, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rbx, %r11 ; \ + movq 0x30+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r12, %r12 ; \ + adoxq %rdx, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rbx, %r13 ; \ + movq 0x38+P1, %rdx ; \ + mulxq %rdx, %rdx, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rdx, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq 0x40+P1, %rdx ; \ + movq %rdx, %rcx ; \ + imulq %rcx, %rcx ; \ + addq %rdx, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P1, 
%rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %rcx ; \ + adcq $0x0, %rcx ; \ + movq %r8, %rax ; \ + andq $0x1ff, %rax ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rcx, %r15 ; \ + shrq $0x9, %rcx ; \ + addq %rax, %rcx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rcx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rcx ; \ + andq $0x1ff, %rcx ; \ + movq %rcx, 0x40+P0 + +// Corresponds exactly to bignum_add_p521 + +#define add_p521(P0,P1,P2) \ + stc; \ + movq P1, %rax ; \ + adcq P2, %rax ; \ + movq 0x8+P1, %rbx ; \ + adcq 0x8+P2, %rbx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + adcq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + adcq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + adcq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + adcq 0x40+P2, %r14 ; \ + movq $0x200, %rdx ; \ + andq %r14, %rdx ; \ + cmpq $0x200, %rdx ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rbx ; \ + movq %rbx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + 
sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq %rdx, %r14 ; \ + movq %r14, 0x40+P0 + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +// Weak multiplication not fully reducing + +#define weakmul_p521(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + movq %r8, 504(%rsp) ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + mulxq 0x20+P1, %rbx, %r13 ; \ + adcq %rbx, %r12 ; \ + mulxq 0x28+P1, %rbx, %r14 ; \ + adcq %rbx, %r13 ; \ + mulxq 0x30+P1, %rbx, %r15 ; \ + adcq %rbx, %r14 ; \ + mulxq 0x38+P1, %rbx, %r8 ; \ + adcq %rbx, %r15 ; \ + adcq %rcx, %r8 ; \ + movq 0x8+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + movq %r9, 512(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + 
mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x38+P1, %rax, %r9 ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + adcq %rcx, %r9 ; \ + movq 0x10+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movq %r10, 520(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x38+P1, %rax, %r10 ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + adcq %rcx, %r10 ; \ + movq 0x18+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq %r11, 528(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x38+P1, %rax, %r11 ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + adcq %rcx, %r11 ; \ + movq 0x20+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq %r12, 536(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx 
; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x38+P1, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq 0x28+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + movq %r13, 544(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x38+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcq %rcx, %r13 ; \ + movq 0x30+P2, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + movq %r14, 552(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x38+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + movq 0x38+P2, %rdx ; \ + xorl %ecx, %ecx 
; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + movq %r15, 560(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x38+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcq %rcx, %r15 ; \ + movq 0x40+P1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq P2, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P2, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P2, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P2, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P2, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P2, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P2, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P2, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rcx, %rbx ; \ + adcq %rbx, %rcx ; \ + movq 0x40+P2, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq 
%rbx, %rcx ; \ + mulxq 0x40+P1, %rax, %rbx ; \ + adcq %rax, %rcx ; \ + movq %r8, %rax ; \ + andq $0x1ff, %rax ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rcx, %r15 ; \ + shrq $0x9, %rcx ; \ + addq %rax, %rcx ; \ + addq 504(%rsp), %r8 ; \ + movq %r8, P0 ; \ + adcq 512(%rsp), %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq 520(%rsp), %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq 528(%rsp), %r11 ; \ + movq %r11, 0x18+P0 ; \ + adcq 536(%rsp), %r12 ; \ + movq %r12, 0x20+P0 ; \ + adcq 544(%rsp), %r13 ; \ + movq %r13, 0x28+P0 ; \ + adcq 552(%rsp), %r14 ; \ + movq %r14, 0x30+P0 ; \ + adcq 560(%rsp), %r15 ; \ + movq %r15, 0x38+P0 ; \ + adcq $0, %rcx ; \ + movq %rcx, 0x40+P0 + +// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) + +#define cmsub_p521(P0,C,P1,D,P2) \ + movq $D, %rdx ; \ + movq 64+P2, %rbx ; \ + xorq $0x1FF, %rbx ; \ + movq P2, %rax ; \ + notq %rax; \ + mulxq %rax, %r8, %r9 ; \ + movq 8+P2, %rax ; \ + notq %rax; \ + mulxq %rax, %rax, %r10 ; \ + addq %rax, %r9 ; \ + movq 16+P2, %rax ; \ + notq %rax; \ + mulxq %rax, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + movq 24+P2, %rax ; \ + notq %rax; \ + mulxq %rax, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + movq 32+P2, %rax ; \ + notq %rax; \ + mulxq %rax, %rax, %r13 ; \ + adcq %rax, %r12 ; \ + movq 40+P2, %rax ; \ + notq %rax; \ + mulxq %rax, %rax, %r14 ; \ + adcq %rax, %r13 ; \ + movq 48+P2, %rax ; \ + notq %rax; \ + mulxq %rax, %rax, %r15 ; \ + adcq %rax, %r14 ; \ + movq 56+P2, %rax ; \ + notq %rax; \ + mulxq %rax, %rax, %rcx ; \ + adcq %rax, %r15 ; \ + mulxq %rbx, %rbx, %rax ; \ + adcq %rcx, %rbx ; \ + xorl %eax, %eax ; \ + movq $C, %rdx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 16+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 
24+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + mulxq 32+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + mulxq 40+P1, %rax, %rcx ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + mulxq 48+P1, %rax, %rcx ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + mulxq 56+P1, %rax, %rcx ; \ + adcxq %rax, %r15 ; \ + adoxq %rcx, %rbx ; \ + mulxq 64+P1, %rax, %rcx ; \ + adcxq %rax, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq %r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) + +#define cmsub38_p521(P0,P1,P2) \ + movq 64+P2, %rbx ; \ + xorq $0x1FF, %rbx ; \ + movq 56+P2, %r15 ; \ + notq %r15; \ + shldq $3, %r15, %rbx ; \ + movq 48+P2, %r14 ; \ + notq %r14; \ + shldq $3, %r14, %r15 ; \ + movq 40+P2, %r13 ; \ + notq %r13; \ + shldq $3, %r13, %r14 ; \ + movq 32+P2, %r12 ; \ + notq %r12; \ + shldq $3, %r12, %r13 ; \ + movq 24+P2, %r11 ; \ + notq %r11; \ + shldq $3, %r11, %r12 ; \ + movq 16+P2, %r10 ; \ + notq %r10; \ + shldq $3, %r10, %r11 ; \ + movq 8+P2, %r9 ; \ + notq %r9; \ + shldq $3, %r9, %r10 ; \ + movq P2, %r8 ; \ + notq %r8; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + movq $3, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 8+P1, %rax, %rcx ; 
\ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 16+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 24+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + mulxq 32+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + mulxq 40+P1, %rax, %rcx ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + mulxq 48+P1, %rax, %rcx ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + mulxq 56+P1, %rax, %rcx ; \ + adcxq %rax, %r15 ; \ + adoxq %rcx, %rbx ; \ + mulxq 64+P1, %rax, %rcx ; \ + adcxq %rax, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq %r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) + +#define cmsub41_p521(P0,P1,P2) \ + movq 64+P1, %rbx ; \ + movq 56+P1, %r15 ; \ + shldq $2, %r15, %rbx ; \ + movq 48+P1, %r14 ; \ + shldq $2, %r14, %r15 ; \ + movq 40+P1, %r13 ; \ + shldq $2, %r13, %r14 ; \ + movq 32+P1, %r12 ; \ + shldq $2, %r12, %r13 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + movq 64+P2, %rcx ; \ + xorq $0x1FF, %rcx ; \ + movq P2, %rax ; \ + notq %rax; \ + addq %rax, %r8 ; \ + movq 8+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r9 ; 
\ + movq 16+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r10 ; \ + movq 24+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r11 ; \ + movq 32+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r12 ; \ + movq 40+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r13 ; \ + movq 48+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r14 ; \ + movq 56+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r15 ; \ + adcq %rcx, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq %r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +S2N_BN_SYMBOL(p521_jdouble): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + sqr_p521(z2,z_1) + sqr_p521(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + add_p521(t1,x_1,z2) + sub_p521(t2,x_1,z2) + mul_p521(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p521(t1,y_1,z_1) + sqr_p521(x4p,x2p) + weakmul_p521(xy2,x_1,y2) + +// t2 = (y + z)^2 + + sqr_p521(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p521(d,12,xy2,9,x4p) + sub_p521(t1,t2,z2) + +// y4 = y^4 + + sqr_p521(y4,y2) + +// z_3' = 2 * y * z 
+// dx2 = d * x2p + + sub_p521(z_3,t1,y2) + weakmul_p521(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p521(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p521(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p521/p521_jmixadd.S b/x86_att/p521/p521_jmixadd.S new file mode 100644 index 0000000000..52e1568b56 --- /dev/null +++ b/x86_att/p521/p521_jmixadd.S @@ -0,0 +1,756 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "LICENSE" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jmixadd +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_521, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 nor p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd) + .text + +// Size in bytes of an individual field element (9 words of 64 bits) + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// These are where they arrive except for input_y, initially in %rdx (which the field macros overwrite) + +#define input_z %rdi +#define input_x %rsi +#define input_y %rcx + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_2 0(input_y) +#define y_2 NUMSIZE(input_y) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr.
+// NSPACE is the total stack needed for these temporaries (six field-element slots plus the 64-byte tmp area) + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) + +#define tmp (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*6+64) + +// Corresponds exactly to bignum_mul_p521; parks the low 8 product words at 432(%rsp)..488(%rsp) (the tmp area, NUMSIZE*6), overwrites %rax, %rbx, %rdx, %rbp, %r8-%r15, and stores P0 only after all reads of P1/P2 so P0 may alias an input + +#define mul_p521(P0,P1,P2) \ + xorl %ebp, %ebp ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + movq %r8, 432(%rsp) ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + mulxq 0x20+P1, %rbx, %r13 ; \ + adcq %rbx, %r12 ; \ + mulxq 0x28+P1, %rbx, %r14 ; \ + adcq %rbx, %r13 ; \ + mulxq 0x30+P1, %rbx, %r15 ; \ + adcq %rbx, %r14 ; \ + mulxq 0x38+P1, %rbx, %r8 ; \ + adcq %rbx, %r15 ; \ + adcq %rbp, %r8 ; \ + movq 0x8+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + movq %r9, 440(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x38+P1, %rax, %r9 ; \ + adcxq %rax, %r8 ; \ + adoxq %rbp, %r9 ; \ + adcq %rbp, %r9 ; \ + movq 0x10+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movq %r10, 448(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ;
\ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x38+P1, %rax, %r10 ; \ + adcxq %rax, %r9 ; \ + adoxq %rbp, %r10 ; \ + adcq %rbp, %r10 ; \ + movq 0x18+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq %r11, 456(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x38+P1, %rax, %r11 ; \ + adcxq %rax, %r10 ; \ + adoxq %rbp, %r11 ; \ + adcq %rbp, %r11 ; \ + movq 0x20+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq %r12, 464(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x38+P1, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbp, %r12 ; \ + adcq %rbp, %r12 ; \ + movq 0x28+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ +
movq %r13, 472(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x38+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rbp, %r13 ; \ + adcq %rbp, %r13 ; \ + movq 0x30+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + movq %r14, 480(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x38+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rbp, %r14 ; \ + adcq %rbp, %r14 ; \ + movq 0x38+P2, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + movq %r15, 488(%rsp) ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x38+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rbp, %r15 ; \ + adcq %rbp,
%r15 ; \ + movq 0x40+P1, %rdx ; \ + xorl %ebp, %ebp ; \ + mulxq P2, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P2, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P2, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P2, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P2, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P2, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P2, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P2, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbp, %rbx ; \ + adcq %rbx, %rbp ; \ + movq 0x40+P2, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %rbp ; \ + mulxq 0x40+P1, %rax, %rbx ; \ + adcq %rax, %rbp ; \ + movq %r8, %rax ; \ + andq $0x1ff, %rax ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rbp, %r15 ; \ + shrq $0x9, %rbp ; \ + addq %rax, %rbp ; \ + stc; \ + adcq 432(%rsp), %r8 ; \ + adcq 440(%rsp), %r9 ; \ + adcq 448(%rsp), %r10 ; \ + adcq 456(%rsp), %r11 ; \ + adcq 464(%rsp), %r12 ; \ + adcq 472(%rsp), %r13 ; \ + adcq 480(%rsp), %r14 ; \ + adcq 488(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rbp ; \ + cmc; \ +
sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rbp ; \ + andq $0x1ff, %rbp ; \ + movq %rbp, 0x40+P0 + +// Corresponds exactly to bignum_sqr_p521; uses 432(%rsp)..488(%rsp) (the tmp area, NUMSIZE*6) as scratch, overwrites %rax, %rbx, %rdx, %rbp, %r8-%r15, and stores P0 only after all reads of P1 + +#define sqr_p521(P0,P1) \ + xorl %ebp, %ebp ; \ + movq P1, %rdx ; \ + mulxq 0x8+P1, %r9, %rax ; \ + movq %r9, 440(%rsp) ; \ + mulxq 0x10+P1, %r10, %rbx ; \ + adcxq %rax, %r10 ; \ + movq %r10, 448(%rsp) ; \ + mulxq 0x18+P1, %r11, %rax ; \ + adcxq %rbx, %r11 ; \ + mulxq 0x20+P1, %r12, %rbx ; \ + adcxq %rax, %r12 ; \ + mulxq 0x28+P1, %r13, %rax ; \ + adcxq %rbx, %r13 ; \ + mulxq 0x30+P1, %r14, %rbx ; \ + adcxq %rax, %r14 ; \ + mulxq 0x38+P1, %r15, %r8 ; \ + adcxq %rbx, %r15 ; \ + adcxq %rbp, %r8 ; \ + xorl %ebp, %ebp ; \ + movq 0x8+P1, %rdx ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq %r11, 456(%rsp) ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movq %r12, 464(%rsp) ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + mulxq 0x38+P1, %rax, %r9 ; \ + adcxq %rax, %r8 ; \ + adoxq %rbp, %r9 ; \ + movq 0x20+P1, %rdx ; \ + mulxq 0x28+P1, %rax, %r10 ; \ + adcxq %rax, %r9 ; \ + adoxq %rbp, %r10 ; \ + adcxq %rbp, %r10 ; \ + xorl %ebp, %ebp ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + movq %r13, 472(%rsp) ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + movq %r14, 480(%rsp) ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8
; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + movq 0x30+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %r11 ; \ + adcxq %rax, %r10 ; \ + adoxq %rbp, %r11 ; \ + mulxq 0x28+P1, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbp, %r12 ; \ + adcxq %rbp, %r12 ; \ + xorl %ebp, %ebp ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %r8 ; \ + movq %r15, 488(%rsp) ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movq 0x38+P1, %rdx ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x28+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rbp, %r13 ; \ + mulxq 0x30+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rbp, %r14 ; \ + adcxq %rbp, %r14 ; \ + xorl %ebp, %ebp ; \ + movq P1, %rdx ; \ + mulxq %rdx, %rax, %rbx ; \ + movq %rax, 432(%rsp) ; \ + movq 440(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 440(%rsp) ; \ + movq 448(%rsp), %rax ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 448(%rsp) ; \ + movq 456(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 456(%rsp) ; \ + movq 464(%rsp), %rax ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 464(%rsp) ; \ + movq 472(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 472(%rsp) ; \ + movq 480(%rsp), %rax ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %rax, %rax ; \ + adoxq %rdx, %rax ; \ + movq %rax, 480(%rsp) ; \ + movq 488(%rsp), %rax ; \ + adcxq %rax, %rax ; \ + adoxq %rbx, %rax ; \ + movq %rax, 488(%rsp) ; \ + movq 0x20+P1,
%rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r8, %r8 ; \ + adoxq %rdx, %r8 ; \ + adcxq %r9, %r9 ; \ + adoxq %rbx, %r9 ; \ + movq 0x28+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r10, %r10 ; \ + adoxq %rdx, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rbx, %r11 ; \ + movq 0x30+P1, %rdx ; \ + mulxq %rdx, %rdx, %rbx ; \ + adcxq %r12, %r12 ; \ + adoxq %rdx, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rbx, %r13 ; \ + movq 0x38+P1, %rdx ; \ + mulxq %rdx, %rdx, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rdx, %r14 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %r15 ; \ + movq 0x40+P1, %rdx ; \ + movq %rdx, %rbp ; \ + imulq %rbp, %rbp ; \ + addq %rdx, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x20+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x28+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x30+P1, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + mulxq 0x38+P1, %rax, %rbx ; \ + adcxq %rax, %r15 ; \ + adoxq %rbx, %rbp ; \ + adcq $0x0, %rbp ; \ + movq %r8, %rax ; \ + andq $0x1ff, %rax ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rbp, %r15 ; \ + shrq $0x9, %rbp ; \ + addq %rax, %rbp ; \ + stc; \ + adcq 432(%rsp), %r8 ; \ + adcq 440(%rsp), %r9 ; \ + adcq 448(%rsp), %r10 ; \ + adcq 456(%rsp), %r11 ; \ + adcq 464(%rsp), %r12 ; \ + adcq 472(%rsp), %r13 ; \ + adcq 480(%rsp), %r14 ; \ + adcq 488(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rbp ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10,
0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rbp ; \ + andq $0x1ff, %rbp ; \ + movq %rbp, 0x40+P0 + +// Corresponds exactly to bignum_sub_p521; needs no stack scratch, overwrites %rax, %rdx and %r8-%r14, and stores P0 only after all reads of P1/P2 + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +S2N_BN_SYMBOL(p521_jmixadd): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Move the input arguments to stable places (p3 in %rdi and p1 in %rsi are already there; p2 goes from %rdx to input_y) + + movq %rdx, input_y + +// Main code, just a sequence of basic field operations + + sqr_p521(zp2,z_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,zp2,x_2) + mul_p521(y2a,zp2,y2a) + + sub_p521(xd,x2a,x_1) + sub_p521(yd,y2a,y_1) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x_1) + mul_p521(zzx2,zz,x2a) + + sub_p521(x_3,ww,zzx1) +
sub_p521(t1,zzx2,zzx1) + + mul_p521(z_3,xd,z_1) + + sub_p521(x_3,x_3,zzx2) + + sub_p521(t2,zzx1,x_3) + + mul_p521(t1,t1,y_1) + mul_p521(t2,yd,t2) + + sub_p521(y_3,t2,t1) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif