diff --git a/.gitignore b/.gitignore index 1cfb0188..12c76538 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.correct *.o *.obj +.vscode tests/test tests/ctCheck benchmarks/benchmark diff --git a/arm/Makefile b/arm/Makefile index 4ceaf99c..bd0e409e 100644 --- a/arm/Makefile +++ b/arm/Makefile @@ -308,10 +308,12 @@ BIGNUM_OBJ = curve25519/bignum_add_p25519.o \ p521/bignum_montsqr_p521_neon.o \ p521/bignum_mul_p521.o \ p521/bignum_mul_p521_alt.o \ + p521/bignum_mul_p521_neon.o \ p521/bignum_neg_p521.o \ p521/bignum_optneg_p521.o \ p521/bignum_sqr_p521.o \ p521/bignum_sqr_p521_alt.o \ + p521/bignum_sqr_p521_neon.o \ p521/bignum_sub_p521.o \ p521/bignum_tolebytes_p521.o \ p521/bignum_tomont_p521.o \ diff --git a/arm/p521/Makefile b/arm/p521/Makefile index 64db0725..3e5e0e85 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -38,10 +38,12 @@ OBJ = bignum_add_p521.o \ bignum_montsqr_p521_neon.o \ bignum_mul_p521.o \ bignum_mul_p521_alt.o \ + bignum_mul_p521_neon.o \ bignum_neg_p521.o \ bignum_optneg_p521.o \ bignum_sqr_p521.o \ bignum_sqr_p521_alt.o \ + bignum_sqr_p521_neon.o \ bignum_sub_p521.o \ bignum_tolebytes_p521.o \ bignum_tomont_p521.o \ diff --git a/arm/p521/bignum_montsqr_p521_neon.S b/arm/p521/bignum_montsqr_p521_neon.S index c4d11731..57cf9116 100644 --- a/arm/p521/bignum_montsqr_p521_neon.S +++ b/arm/p521/bignum_montsqr_p521_neon.S @@ -562,7 +562,7 @@ // The bash script used for step 2 is as follows: // // # Store the assembly instructions except the last 'ret', -// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// # callee-register store/loads as, say, 'input.S'. // export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" // export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" // /tools/external/slothy.sh input.S my_out_dir diff --git a/arm/p521/bignum_mul_p521_neon.S b/arm/p521/bignum_mul_p521_neon.S new file mode 100644 index 00000000..c9d34151 --- /dev/null +++ b/arm/p521/bignum_mul_p521_neon.S @@ -0,0 +1,1402 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +// +// extern void bignum_mul_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_mul_p521_neon is functionally equivalent to bignum_mul_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// stp x25, x26, [sp, #-16]! +// sub sp, sp, #80 +// ldp x15, x21, [x1] +// ldp x10, x17, [x1, #16] +// ldp x13, x16, [x2] +// ldr q18, [x1] +// ldr q28, [x2] +// ldp x5, x20, [x2, #16] +// movi v16.2D, #0x00000000ffffffff +// uzp2 v7.4S, v28.4S, v28.4S +// xtn v4.2S, v18.2D +// xtn v1.2S, v28.2D +// rev64 v27.4S, v28.4S +// umull v21.2D, v4.2S, v1.2S +// umull v28.2D, v4.2S, v7.2S +// uzp2 v5.4S, v18.4S, v18.4S +// mul v18.4S, v27.4S, v18.4S +// usra v28.2D, v21.2D, #32 +// umull v29.2D, v5.2S, v7.2S +// uaddlp v18.2D, v18.4S +// and v16.16B, v28.16B, v16.16B +// umlal v16.2D, v5.2S, v1.2S +// shl v18.2D, v18.2D, #32 +// usra v29.2D, v28.2D, #32 +// umlal v18.2D, v4.2S, v1.2S +// usra v29.2D, v16.2D, #32 +// mov x8, v18.d[0] +// mov x9, v18.d[1] +// mul x6, x10, x5 +// mul x19, x17, x20 +// mov x14, v29.d[0] +// adds x9, x9, x14 +// mov x14, v29.d[1] +// adcs x6, x6, x14 +// umulh x14, x10, x5 +// adcs x19, x19, x14 +// umulh x14, x17, x20 +// adc x14, x14, xzr +// adds x11, x9, x8 +// adcs x9, x6, x9 +// adcs x6, x19, x6 +// adcs x19, x14, x19 +// adc x14, xzr, x14 +// adds x3, x9, x8 +// adcs x24, x6, x11 +// adcs x9, x19, x9 +// adcs x6, x14, x6 +// adcs x19, xzr, x19 +// adc x14, xzr, x14 +// subs x4, x10, x17 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x20, x5 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x6, x6, x23 +// eor x4, x4, x7 +// adcs x19, x19, x4 +// adc x14, x14, x7 +// subs x4, x15, x21 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x16, x13 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x11, x11, x23 +// eor x4, x4, x7 +// adcs x3, x3, x4 +// adcs x24, x24, x7 +// adcs x9, x9, x7 +// adcs x6, x6, x7 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x4, x21, x17 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x20, x16 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x9, x9, x23 +// eor x4, x4, x7 +// adcs x6, x6, x4 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x4, x15, x10 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x5, x13 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x3, x3, x23 +// eor x4, x4, x7 +// adcs x24, x24, x4 +// adcs x9, x9, x7 +// adcs x6, x6, x7 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x17, x15, x17 +// cneg x17, x17, cc +// csetm x4, cc +// subs x13, x20, x13 +// cneg x13, x13, cc +// mul x20, x17, x13 +// umulh x17, x17, x13 +// cinv x13, x4, cc +// cmn x13, #0x1 +// eor x20, x20, x13 +// adcs x20, x24, x20 +// eor x17, x17, x13 +// adcs x17, x9, x17 +// adcs x9, x6, x13 +// adcs x6, x19, x13 +// adc x13, x14, x13 +// subs x21, x21, x10 +// cneg x21, x21, cc +// csetm x10, cc +// subs x16, x5, x16 +// cneg x16, x16, cc +// mul x5, x21, x16 +// umulh x21, x21, x16 +// cinv x10, x10, cc +// cmn x10, #0x1 +// eor x16, x5, x10 +// adcs x16, x20, x16 +// eor x21, x21, x10 +// adcs x21, x17, x21 +// adcs x17, x9, x10 +// adcs x5, x6, x10 +// adc x10, x13, x10 +// lsl x13, x8, #9 +// extr x20, x11, x8, #55 +// extr x8, x3, x11, #55 +// extr x9, x16, x3, #55 +// lsr x16, x16, #55 +// stp x21, x17, [sp] // @slothy:writes=stack0 +// stp x5, x10, [sp, #16] // @slothy:writes=stack16 +// stp x13, x20, [sp, #32] // @slothy:writes=stack32 +// stp x8, x9, [sp, #48] // @slothy:writes=stack48 +// str x16, [sp, #64] // @slothy:writes=stack64 +// ldp x21, x10, [x1, #32] +// ldp x17, x13, [x1, #48] +// ldp x16, x5, [x2, #32] +// ldr q18, [x1, #32] +// ldr q28, [x2, #32] +// ldp x20, x8, [x2, #48] +// movi v16.2D, #0x00000000ffffffff +// uzp2 v7.4S, v28.4S, v28.4S +// xtn v4.2S, v18.2D +// xtn v1.2S, v28.2D +// rev64 v28.4S, v28.4S +// umull v27.2D, v4.2S, v1.2S +// umull v29.2D, v4.2S, v7.2S +// uzp2 v21.4S, v18.4S, v18.4S +// mul v28.4S, v28.4S, v18.4S +// usra v29.2D, v27.2D, #32 +// umull v18.2D, v21.2S, v7.2S +// uaddlp v28.2D, v28.4S +// and v16.16B, v29.16B, v16.16B +// umlal v16.2D, v21.2S, v1.2S +// shl v28.2D, v28.2D, #32 +// usra v18.2D, v29.2D, #32 +// umlal v28.2D, v4.2S, v1.2S +// usra v18.2D, v16.2D, #32 +// mov x9, v28.d[0] +// mov x6, v28.d[1] +// mul x19, x17, x20 +// mul x14, x13, x8 +// mov x11, v18.d[0] +// adds x6, x6, x11 +// mov x11, v18.d[1] +// adcs x19, x19, x11 +// umulh x11, x17, x20 +// adcs x14, x14, x11 +// umulh x11, x13, x8 +// adc x11, x11, xzr +// adds x3, x6, x9 +// adcs x6, x19, x6 +// adcs x19, x14, x19 +// adcs x14, x11, x14 +// adc x11, xzr, x11 +// adds x24, x6, x9 +// adcs x4, x19, x3 +// adcs x6, x14, x6 +// adcs x19, x11, x19 +// adcs x14, xzr, x14 +// adc x11, xzr, x11 +// subs x7, x17, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x20 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x19, x19, x22 +// eor x7, x7, x23 +// adcs x14, x14, x7 +// adc x11, x11, x23 +// subs x7, x21, x10 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x5, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x3, x3, x22 +// eor x7, x7, x23 +// adcs x24, x24, x7 +// adcs x4, x4, x23 +// adcs x6, x6, x23 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x10, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x5 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x6, x6, x22 +// eor x7, x7, x23 +// adcs x19, x19, x7 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x21, x17 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x20, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x24, x24, x22 +// eor x7, x7, x23 +// adcs x4, x4, x7 +// adcs x6, x6, x23 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x21, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x4, x4, x22 +// eor x7, x7, x23 +// adcs x6, x6, x7 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x10, x17 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x20, x5 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x4, x4, x22 +// eor x7, x7, x23 +// adcs x6, x6, x7 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// ldp x7, x23, [sp] // @slothy:reads=stack0 +// adds x9, x9, x7 +// adcs x3, x3, x23 +// stp x9, x3, [sp] // @slothy:writes=stack0 +// ldp x9, x3, [sp, #16] // @slothy:reads=stack16 +// adcs x9, x24, x9 +// adcs x3, x4, x3 +// stp x9, x3, [sp, #16] // @slothy:writes=stack16 +// ldp x9, x3, [sp, #32] // @slothy:reads=stack32 +// adcs x9, x6, x9 +// adcs x6, x19, x3 +// stp x9, x6, [sp, #32] // @slothy:writes=stack32 +// ldp x9, x6, [sp, #48] // @slothy:reads=stack48 +// adcs x9, x14, x9 +// adcs x6, x11, x6 +// stp x9, x6, [sp, #48] // @slothy:writes=stack48 +// ldr x9, [sp, #64] // @slothy:reads=stack64 +// adc x9, x9, xzr +// str x9, [sp, #64] // @slothy:writes=stack64 +// ldp x9, x6, [x1] +// subs x21, x21, x9 +// sbcs x10, x10, x6 +// ldp x9, x6, [x1, #16] +// sbcs x17, x17, x9 +// sbcs x13, x13, x6 +// csetm x9, cc +// ldp x6, x19, [x2] +// subs x16, x6, x16 +// sbcs x5, x19, x5 +// ldp x6, x19, [x2, #16] +// sbcs x20, x6, x20 +// sbcs x8, x19, x8 +// csetm x6, cc +// eor x21, x21, x9 +// subs x21, x21, x9 +// eor x10, x10, x9 +// sbcs x10, x10, x9 +// eor x17, x17, x9 +// sbcs x17, x17, x9 +// eor x13, x13, x9 +// sbc x13, x13, x9 +// eor x16, x16, x6 +// subs x16, x16, x6 +// eor x5, x5, x6 +// sbcs x5, x5, x6 +// eor x20, x20, x6 +// sbcs x20, x20, x6 +// eor x8, x8, x6 +// sbc x8, x8, x6 +// eor x9, x6, x9 +// mul x6, x21, x16 +// mul x19, x10, x5 +// mul x14, x17, x20 +// mul x11, x13, x8 +// umulh x3, x21, x16 +// adds x19, x19, x3 +// umulh x3, x10, x5 +// adcs x14, x14, x3 +// umulh x3, x17, x20 +// adcs x11, x11, x3 +// umulh x3, x13, x8 +// adc x3, x3, xzr +// adds x24, x19, x6 +// adcs x19, x14, x19 +// adcs x14, x11, x14 +// adcs x11, x3, x11 +// adc x3, xzr, x3 +// adds x4, x19, x6 +// adcs x7, x14, x24 +// adcs x19, x11, x19 +// adcs x14, x3, x14 +// adcs x11, xzr, x11 +// adc x3, xzr, x3 +// subs x23, x17, x13 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x8, x20 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x14, x14, x12 +// eor x23, x23, x22 +// adcs x11, x11, x23 +// adc x3, x3, x22 +// subs x23, x21, x10 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x5, x16 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x24, x24, x12 +// eor x23, x23, x22 +// adcs x4, x4, x23 +// adcs x7, x7, x22 +// adcs x19, x19, x22 +// adcs x14, x14, x22 +// adcs x11, x11, x22 +// adc x3, x3, x22 +// subs x23, x10, x13 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x8, x5 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x19, x19, x12 +// eor x23, x23, x22 +// adcs x14, x14, x23 +// adcs x11, x11, x22 +// adc x3, x3, x22 +// subs x23, x21, x17 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x20, x16 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x4, x4, x12 +// eor x23, x23, x22 +// adcs x7, x7, x23 +// adcs x19, x19, x22 +// adcs x14, x14, x22 +// adcs x11, x11, x22 +// adc x3, x3, x22 +// subs x21, x21, x13 +// cneg x21, x21, cc +// csetm x13, cc +// subs x16, x8, x16 +// cneg x16, x16, cc +// mul x8, x21, x16 +// umulh x21, x21, x16 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x16, x8, x13 +// adcs x16, x7, x16 +// eor x21, x21, x13 +// adcs x21, x19, x21 +// adcs x8, x14, x13 +// adcs x19, x11, x13 +// adc x13, x3, x13 +// subs x10, x10, x17 +// cneg x10, x10, cc +// csetm x17, cc +// subs x5, x20, x5 +// cneg x5, x5, cc +// mul x20, x10, x5 +// umulh x10, x10, x5 +// cinv x17, x17, cc +// cmn x17, #0x1 +// eor x5, x20, x17 +// adcs x16, x16, x5 +// eor x10, x10, x17 +// adcs x21, x21, x10 +// adcs x10, x8, x17 +// adcs x5, x19, x17 +// adc x17, x13, x17 +// ldp x13, x20, [sp] // @slothy:reads=stack0 +// ldp x8, x19, [sp, #16] // @slothy:reads=stack16 +// eor x6, x6, x9 +// adds x6, x6, x13 +// eor x14, x24, x9 +// adcs x14, x14, x20 +// eor x11, x4, x9 +// adcs x11, x11, x8 +// eor x16, x16, x9 +// adcs x16, x16, x19 +// eor x21, x21, x9 +// ldp x3, x24, [sp, #32] // @slothy:reads=stack32 +// ldp x4, x7, [sp, #48] // @slothy:reads=stack48 +// ldr x23, [sp, #64] // @slothy:reads=stack64 +// adcs x21, x21, x3 +// eor x10, x10, x9 +// adcs x10, x10, x24 +// eor x5, x5, x9 +// adcs x5, x5, x4 +// eor x17, x17, x9 +// adcs x17, x17, x7 +// adc x22, x23, xzr +// adds x21, x21, x13 +// adcs x10, x10, x20 +// adcs x13, x5, x8 +// adcs x17, x17, x19 +// and x5, x9, #0x1ff +// lsl x20, x6, #9 +// orr x5, x20, x5 +// adcs x5, x3, x5 +// extr x20, x14, x6, #55 +// adcs x20, x24, x20 +// extr x8, x11, x14, #55 +// adcs x8, x4, x8 +// extr x9, x16, x11, #55 +// adcs x9, x7, x9 +// lsr x16, x16, #55 +// adc x16, x16, x23 +// ldr x6, [x2, #64] +// ldp x19, x14, [x1] +// and x11, x19, #0xfffffffffffff +// mul x11, x6, x11 +// ldr x3, [x1, #64] +// ldp x24, x4, [x2] +// and x7, x24, #0xfffffffffffff +// mul x7, x3, x7 +// add x11, x11, x7 +// extr x19, x14, x19, #52 +// and x19, x19, #0xfffffffffffff +// mul x19, x6, x19 +// extr x24, x4, x24, #52 +// and x24, x24, #0xfffffffffffff +// mul x24, x3, x24 +// add x19, x19, x24 +// lsr x24, x11, #52 +// add x19, x19, x24 +// lsl x11, x11, #12 +// extr x11, x19, x11, #12 +// adds x21, x21, x11 +// ldp x11, x24, [x1, #16] +// ldp x7, x23, [x2, #16] +// extr x14, x11, x14, #40 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// extr x4, x7, x4, #40 +// and x4, x4, #0xfffffffffffff +// mul x4, x3, x4 +// add x14, x14, x4 +// lsr x4, x19, #52 +// add x14, x14, x4 +// lsl x19, x19, #12 +// extr x19, x14, x19, #24 +// adcs x10, x10, x19 +// extr x19, x24, x11, #28 +// and x19, x19, #0xfffffffffffff +// mul x19, x6, x19 +// extr x11, x23, x7, #28 +// and x11, x11, #0xfffffffffffff +// mul x11, x3, x11 +// add x19, x19, x11 +// lsr x11, x14, #52 +// add x19, x19, x11 +// lsl x14, x14, #12 +// extr x14, x19, x14, #36 +// adcs x13, x13, x14 +// and x14, x10, x13 +// ldp x11, x4, [x1, #32] +// ldp x7, x12, [x2, #32] +// extr x24, x11, x24, #16 +// and x24, x24, #0xfffffffffffff +// mul x24, x6, x24 +// extr x23, x7, x23, #16 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x24, x24, x23 +// lsl x23, x22, #48 +// add x24, x24, x23 +// lsr x23, x19, #52 +// add x24, x24, x23 +// lsl x19, x19, #12 +// extr x19, x24, x19, #48 +// adcs x17, x17, x19 +// and x19, x14, x17 +// lsr x14, x11, #4 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// lsr x23, x7, #4 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x14, x14, x23 +// lsr x23, x24, #52 +// add x14, x14, x23 +// lsl x24, x24, #12 +// extr x24, x14, x24, #60 +// extr x11, x4, x11, #56 +// and x11, x11, #0xfffffffffffff +// mul x11, x6, x11 +// extr x7, x12, x7, #56 +// and x7, x7, #0xfffffffffffff +// mul x7, x3, x7 +// add x11, x11, x7 +// lsr x14, x14, #52 +// add x14, x11, x14 +// lsl x11, x24, #8 +// extr x11, x14, x11, #8 +// adcs x5, x5, x11 +// and x19, x19, x5 +// ldp x11, x24, [x1, #48] +// ldp x2, x7, [x2, #48] +// extr x4, x11, x4, #44 +// and x4, x4, #0xfffffffffffff +// mul x4, x6, x4 +// extr x23, x2, x12, #44 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x4, x4, x23 +// lsr x23, x14, #52 +// add x4, x4, x23 +// lsl x14, x14, #12 +// extr x14, x4, x14, #20 +// adcs x20, x20, x14 +// and x19, x19, x20 +// extr x14, x24, x11, #32 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// extr x2, x7, x2, #32 +// and x2, x2, #0xfffffffffffff +// mul x2, x3, x2 +// add x2, x14, x2 +// lsr x14, x4, #52 +// add x2, x2, x14 +// lsl x14, x4, #12 +// extr x14, x2, x14, #32 +// adcs x8, x8, x14 +// and x19, x19, x8 +// lsr x14, x24, #20 +// mul x14, x6, x14 +// lsr x11, x7, #20 +// mul x11, x3, x11 +// add x14, x14, x11 +// lsr x11, x2, #52 +// add x14, x14, x11 +// lsl x2, x2, #12 +// extr x2, x14, x2, #44 +// adcs x9, x9, x2 +// and x2, x19, x9 +// mul x6, x6, x3 +// lsr x19, x14, #44 +// add x6, x6, x19 +// adc x16, x16, x6 +// lsr x6, x16, #9 +// orr x16, x16, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x21, x6 +// adcs xzr, x2, xzr +// adcs xzr, x16, xzr +// adcs x21, x21, x6 +// adcs x10, x10, xzr +// adcs x13, x13, xzr +// adcs x17, x17, xzr +// adcs x5, x5, xzr +// adcs x20, x20, xzr +// adcs x8, x8, xzr +// adcs x9, x9, xzr +// adc x16, x16, xzr +// and x2, x21, #0x1ff +// extr x21, x10, x21, #9 +// extr x10, x13, x10, #9 +// stp x21, x10, [x0] // @slothy:writes=buffer0 +// extr x21, x17, x13, #9 +// extr x10, x5, x17, #9 +// stp x21, x10, [x0, #16] // @slothy:writes=buffer16 +// extr x21, x20, x5, #9 +// extr x10, x8, x20, #9 +// stp x21, x10, [x0, #32] // @slothy:writes=buffer32 +// extr x21, x9, x8, #9 +// extr x10, x16, x9, #9 +// stp x21, x10, [x0, #48] // @slothy:writes=buffer48 +// str x2, [x0, #64] // @slothy:writes=buffer64 +// add sp, sp, #80 +// ldp x25, x26, [sp], #16 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_mul_p521_neon): + +// Save registers and make space for the temporary buffer + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, #80 + + ldr q6, [x2] + ldp x10, x17, [x1, #16] + ldr q4, [x1] + ldr q16, [x2, #32] + ldp x5, x20, [x2, #16] + ldr q2, [x1, #32] + movi v31.2D, #0x00000000ffffffff + uzp2 v17.4S, v6.4S, v6.4S + rev64 v7.4S, v6.4S + ldp x15, x21, [x1] + xtn v25.2S, v6.2D + xtn v22.2S, v4.2D + subs x14, x10, x17 + mul v7.4S, v7.4S, v4.4S + csetm x8, cc + rev64 v3.4S, v16.4S + xtn v1.2S, v16.2D + ldp x13, x16, [x2] + mul x26, x10, x5 + uzp2 v16.4S, v16.4S, v16.4S + uaddlp v26.2D, v7.4S + cneg x4, x14, cc + subs x24, x15, x21 + xtn v5.2S, v2.2D + mul v28.4S, v3.4S, v2.4S + shl v26.2D, v26.2D, #32 + mul x22, x17, x20 + umull v20.2D, v22.2S, v25.2S + uzp2 v6.4S, v4.4S, v4.4S + umull v18.2D, v22.2S, v17.2S + uzp2 v4.4S, v2.4S, v2.4S + cneg x14, x24, cc + csetm x7, cc + umulh x11, x17, x20 + usra v18.2D, v20.2D, #32 + uaddlp v7.2D, v28.4S + subs x19, x16, x13 + umlal v26.2D, v22.2S, v25.2S + cneg x19, x19, cc + shl v28.2D, v7.2D, #32 + umull v7.2D, v5.2S, v1.2S + umull v30.2D, v5.2S, v16.2S + cinv x6, x7, cc + mul x25, x14, x19 + umlal v28.2D, v5.2S, v1.2S + umull v21.2D, v6.2S, v17.2S + umulh x14, x14, x19 + usra v30.2D, v7.2D, #32 + subs x9, x20, x5 + and v29.16B, v18.16B, v31.16B + cinv x23, x8, cc + mov x8, v26.d[1] + cneg x12, x9, cc + usra v21.2D, v18.2D, #32 + umlal v29.2D, v6.2S, v25.2S + mul x24, x4, x12 + umull v18.2D, v4.2S, v16.2S + movi v25.2D, #0x00000000ffffffff + eor x9, x14, x6 + and v7.16B, v30.16B, v25.16B + usra v21.2D, v29.2D, #32 + umulh x7, x10, x5 + usra v18.2D, v30.2D, #32 + umlal v7.2D, v4.2S, v1.2S + mov x19, v21.d[0] + umulh x3, x4, x12 + mov x14, v21.d[1] + usra v18.2D, v7.2D, #32 + adds x4, x8, x19 + mov x8, v26.d[0] + adcs x19, x26, x14 + adcs x14, x22, x7 + adc x12, x11, xzr + adds x11, x4, x8 + adcs x26, x19, x4 + adcs x22, x14, x19 + eor x4, x24, x23 + adcs x14, x12, x14 + eor x7, x25, x6 + adc x25, xzr, x12 + eor x19, x3, x23 + adds x3, x26, x8 + adcs x24, x22, x11 + adcs x12, x14, x26 + adcs x22, x25, x22 + adcs x26, xzr, x14 + adc x14, xzr, x25 + cmn x23, #0x1 + adcs x22, x22, x4 + adcs x19, x26, x19 + adc x25, x14, x23 + subs x14, x21, x17 + cneg x23, x14, cc + csetm x26, cc + subs x4, x20, x16 + cneg x14, x4, cc + cinv x4, x26, cc + cmn x6, #0x1 + adcs x11, x11, x7 + mul x7, x23, x14 + adcs x9, x3, x9 + adcs x26, x24, x6 + umulh x3, x23, x14 + adcs x14, x12, x6 + adcs x22, x22, x6 + adcs x12, x19, x6 + extr x24, x11, x8, #55 + adc x6, x25, x6 + subs x19, x15, x17 + csetm x17, cc + cneg x23, x19, cc + subs x19, x20, x13 + lsl x25, x8, #9 + eor x8, x7, x4 + cneg x20, x19, cc + umulh x7, x23, x20 + cinv x19, x17, cc + subs x17, x15, x10 + csetm x15, cc + stp x25, x24, [sp, #32] + cneg x24, x17, cc + mul x20, x23, x20 + subs x25, x5, x13 + cneg x13, x25, cc + cinv x15, x15, cc + mul x25, x24, x13 + subs x21, x21, x10 + csetm x23, cc + cneg x17, x21, cc + subs x21, x5, x16 + umulh x13, x24, x13 + cinv x10, x23, cc + cneg x23, x21, cc + cmn x4, #0x1 + adcs x14, x14, x8 + eor x21, x3, x4 + adcs x21, x22, x21 + eor x5, x20, x19 + adcs x24, x12, x4 + mul x12, x17, x23 + eor x8, x25, x15 + adc x25, x6, x4 + cmn x15, #0x1 + adcs x6, x9, x8 + ldp x20, x8, [x2, #48] + eor x9, x13, x15 + adcs x4, x26, x9 + umulh x26, x17, x23 + ldp x17, x13, [x1, #48] + adcs x9, x14, x15 + adcs x16, x21, x15 + adcs x14, x24, x15 + eor x21, x7, x19 + mul x23, x17, x20 + adc x24, x25, x15 + cmn x19, #0x1 + adcs x7, x4, x5 + adcs x9, x9, x21 + umulh x3, x13, x8 + adcs x16, x16, x19 + adcs x22, x14, x19 + eor x5, x12, x10 + adc x12, x24, x19 + cmn x10, #0x1 + adcs x19, x7, x5 + eor x14, x26, x10 + mov x7, v28.d[1] + adcs x24, x9, x14 + extr x4, x19, x6, #55 + umulh x15, x17, x20 + mov x14, v18.d[1] + lsr x9, x19, #55 + adcs x5, x16, x10 + mov x16, v18.d[0] + adcs x19, x22, x10 + str x9, [sp, #64] + extr x25, x6, x11, #55 + adc x21, x12, x10 + subs x26, x17, x13 + stp x25, x4, [sp, #48] + stp x19, x21, [sp, #16] + csetm x6, cc + cneg x4, x26, cc + mul x19, x13, x8 + subs x11, x8, x20 + stp x24, x5, [sp] + ldp x21, x10, [x1, #32] + cinv x12, x6, cc + cneg x6, x11, cc + mov x9, v28.d[0] + umulh x25, x4, x6 + adds x22, x7, x16 + ldp x16, x5, [x2, #32] + adcs x14, x23, x14 + adcs x11, x19, x15 + adc x24, x3, xzr + adds x3, x22, x9 + adcs x15, x14, x22 + mul x22, x4, x6 + adcs x6, x11, x14 + adcs x4, x24, x11 + eor x14, x25, x12 + adc x26, xzr, x24 + subs x7, x21, x10 + csetm x23, cc + cneg x19, x7, cc + subs x24, x5, x16 + cneg x11, x24, cc + cinv x7, x23, cc + adds x25, x15, x9 + eor x23, x22, x12 + adcs x22, x6, x3 + mul x24, x19, x11 + adcs x15, x4, x15 + adcs x6, x26, x6 + umulh x19, x19, x11 + adcs x11, xzr, x4 + adc x26, xzr, x26 + cmn x12, #0x1 + adcs x4, x6, x23 + eor x6, x24, x7 + adcs x14, x11, x14 + adc x26, x26, x12 + subs x11, x10, x13 + cneg x12, x11, cc + csetm x11, cc + eor x19, x19, x7 + subs x24, x8, x5 + cinv x11, x11, cc + cneg x24, x24, cc + cmn x7, #0x1 + adcs x3, x3, x6 + mul x23, x12, x24 + adcs x25, x25, x19 + adcs x6, x22, x7 + umulh x19, x12, x24 + adcs x22, x15, x7 + adcs x12, x4, x7 + eor x24, x23, x11 + adcs x4, x14, x7 + adc x26, x26, x7 + eor x19, x19, x11 + subs x14, x21, x17 + cneg x7, x14, cc + csetm x14, cc + subs x23, x20, x16 + cinv x14, x14, cc + cneg x23, x23, cc + cmn x11, #0x1 + adcs x22, x22, x24 + mul x24, x7, x23 + adcs x15, x12, x19 + adcs x4, x4, x11 + adc x19, x26, x11 + umulh x26, x7, x23 + subs x7, x21, x13 + eor x11, x24, x14 + cneg x23, x7, cc + csetm x12, cc + subs x7, x8, x16 + cneg x7, x7, cc + cinv x12, x12, cc + cmn x14, #0x1 + eor x26, x26, x14 + adcs x11, x25, x11 + mul x25, x23, x7 + adcs x26, x6, x26 + adcs x6, x22, x14 + adcs x24, x15, x14 + umulh x23, x23, x7 + adcs x4, x4, x14 + adc x22, x19, x14 + eor x14, x25, x12 + eor x7, x23, x12 + cmn x12, #0x1 + adcs x14, x26, x14 + ldp x19, x25, [x2] + ldp x15, x23, [x2, #16] + adcs x26, x6, x7 + adcs x24, x24, x12 + adcs x7, x4, x12 + adc x4, x22, x12 + subs x19, x19, x16 + ldp x16, x22, [x1] + sbcs x6, x25, x5 + ldp x12, x25, [x1, #16] + sbcs x15, x15, x20 + sbcs x8, x23, x8 + csetm x23, cc + subs x21, x21, x16 + eor x16, x19, x23 + sbcs x19, x10, x22 + eor x22, x6, x23 + eor x8, x8, x23 + sbcs x6, x17, x12 + sbcs x13, x13, x25 + csetm x12, cc + subs x10, x10, x17 + cneg x17, x10, cc + csetm x25, cc + subs x5, x20, x5 + eor x10, x19, x12 + cneg x19, x5, cc + eor x20, x15, x23 + eor x21, x21, x12 + cinv x15, x25, cc + mul x25, x17, x19 + subs x16, x16, x23 + sbcs x5, x22, x23 + eor x6, x6, x12 + sbcs x20, x20, x23 + eor x22, x13, x12 + sbc x8, x8, x23 + subs x21, x21, x12 + umulh x19, x17, x19 + sbcs x10, x10, x12 + sbcs x17, x6, x12 + eor x6, x19, x15 + eor x19, x25, x15 + umulh x25, x17, x20 + sbc x13, x22, x12 + cmn x15, #0x1 + adcs x22, x14, x19 + adcs x19, x26, x6 + ldp x6, x26, [sp] + adcs x14, x24, x15 + umulh x24, x21, x16 + adcs x7, x7, x15 + adc x15, x4, x15 + adds x4, x9, x6 + eor x9, x23, x12 + adcs x12, x3, x26 + stp x4, x12, [sp] + ldp x4, x26, [sp, #16] + umulh x12, x10, x5 + ldp x6, x23, [sp, #32] + adcs x3, x11, x4 + mul x4, x13, x8 + adcs x26, x22, x26 + ldp x22, x11, [sp, #48] + adcs x6, x19, x6 + stp x3, x26, [sp, #16] + mul x26, x10, x5 + adcs x14, x14, x23 + stp x6, x14, [sp, #32] + ldr x6, [sp, #64] + adcs x22, x7, x22 + adcs x14, x15, x11 + mul x11, x17, x20 + adc x19, x6, xzr + stp x22, x14, [sp, #48] + adds x14, x26, x24 + str x19, [sp, #64] + umulh x19, x13, x8 + adcs x7, x11, x12 + adcs x22, x4, x25 + mul x6, x21, x16 + adc x19, x19, xzr + subs x11, x17, x13 + cneg x12, x11, cc + csetm x11, cc + subs x24, x8, x20 + cinv x11, x11, cc + cneg x24, x24, cc + adds x4, x14, x6 + adcs x14, x7, x14 + mul x3, x12, x24 + adcs x7, x22, x7 + adcs x22, x19, x22 + umulh x12, x12, x24 + adc x24, xzr, x19 + adds x19, x14, x6 + eor x3, x3, x11 + adcs x26, x7, x4 + adcs x14, x22, x14 + adcs x25, x24, x7 + adcs x23, xzr, x22 + eor x7, x12, x11 + adc x12, xzr, x24 + subs x22, x21, x10 + cneg x24, x22, cc + csetm x22, cc + subs x15, x5, x16 + cinv x22, x22, cc + cneg x15, x15, cc + cmn x11, #0x1 + adcs x3, x25, x3 + mul x25, x24, x15 + adcs x23, x23, x7 + adc x11, x12, x11 + subs x7, x10, x13 + umulh x15, x24, x15 + cneg x12, x7, cc + csetm x7, cc + eor x24, x25, x22 + eor x25, x15, x22 + cmn x22, #0x1 + adcs x24, x4, x24 + adcs x19, x19, x25 + adcs x15, x26, x22 + adcs x4, x14, x22 + adcs x26, x3, x22 + adcs x25, x23, x22 + adc x23, x11, x22 + subs x14, x21, x17 + cneg x3, x14, cc + csetm x11, cc + subs x14, x8, x5 + cneg x14, x14, cc + cinv x7, x7, cc + subs x13, x21, x13 + cneg x21, x13, cc + csetm x13, cc + mul x22, x12, x14 + subs x8, x8, x16 + cinv x13, x13, cc + umulh x14, x12, x14 + cneg x12, x8, cc + subs x8, x20, x16 + cneg x8, x8, cc + cinv x16, x11, cc + eor x22, x22, x7 + cmn x7, #0x1 + eor x14, x14, x7 + adcs x4, x4, x22 + mul x11, x3, x8 + adcs x22, x26, x14 + adcs x14, x25, x7 + eor x25, x24, x9 + adc x26, x23, x7 + umulh x7, x3, x8 + subs x17, x10, x17 + cneg x24, x17, cc + eor x3, x11, x16 + csetm x11, cc + subs x20, x20, x5 + cneg x5, x20, cc + cinv x11, x11, cc + cmn x16, #0x1 + mul x17, x21, x12 + eor x8, x7, x16 + adcs x10, x19, x3 + and x19, x9, #0x1ff + adcs x20, x15, x8 + umulh x15, x21, x12 + eor x12, x10, x9 + eor x8, x6, x9 + adcs x6, x4, x16 + adcs x4, x22, x16 + adcs x21, x14, x16 + adc x7, x26, x16 + mul x10, x24, x5 + cmn x13, #0x1 + ldp x3, x14, [x1] + eor x17, x17, x13 + umulh x5, x24, x5 + adcs x20, x20, x17 + eor x17, x15, x13 + adcs x16, x6, x17 + eor x22, x10, x11 + adcs x23, x4, x13 + extr x10, x14, x3, #52 + and x26, x3, #0xfffffffffffff + adcs x24, x21, x13 + and x15, x10, #0xfffffffffffff + adc x6, x7, x13 + cmn x11, #0x1 + adcs x17, x20, x22 + eor x4, x5, x11 + ldp x21, x10, [sp] + adcs x7, x16, x4 + eor x16, x17, x9 + eor x13, x7, x9 + ldp x3, x17, [sp, #16] + adcs x7, x23, x11 + eor x23, x7, x9 + ldp x5, x22, [sp, #32] + adcs x7, x24, x11 + adc x24, x6, x11 + ldr x6, [x2, #64] + adds x20, x8, x21 + lsl x11, x20, #9 + eor x4, x7, x9 + orr x7, x11, x19 + eor x8, x24, x9 + adcs x11, x25, x10 + mul x26, x6, x26 + ldp x19, x24, [sp, #48] + adcs x12, x12, x3 + adcs x16, x16, x17 + adcs x9, x13, x5 + ldr x25, [sp, #64] + extr x20, x11, x20, #55 + adcs x13, x23, x22 + adcs x4, x4, x19 + extr x23, x12, x11, #55 + adcs x8, x8, x24 + adc x11, x25, xzr + adds x21, x9, x21 + extr x9, x16, x12, #55 + lsr x12, x16, #55 + adcs x10, x13, x10 + mul x15, x6, x15 + adcs x13, x4, x3 + ldp x16, x4, [x2] + ldr x3, [x1, #64] + adcs x17, x8, x17 + adcs x5, x5, x7 + adcs x20, x22, x20 + adcs x8, x19, x23 + and x22, x16, #0xfffffffffffff + ldp x19, x7, [x1, #16] + adcs x9, x24, x9 + extr x24, x4, x16, #52 + adc x16, x12, x25 + mul x22, x3, x22 + and x25, x24, #0xfffffffffffff + extr x14, x19, x14, #40 + and x12, x14, #0xfffffffffffff + extr x23, x7, x19, #28 + ldp x19, x24, [x2, #16] + mul x14, x3, x25 + and x23, x23, #0xfffffffffffff + add x22, x26, x22 + lsl x11, x11, #48 + lsr x26, x22, #52 + lsl x25, x22, #12 + mul x22, x6, x12 + extr x12, x19, x4, #40 + add x4, x15, x14 + mul x15, x6, x23 + add x4, x4, x26 + extr x23, x24, x19, #28 + ldp x14, x19, [x1, #32] + and x26, x12, #0xfffffffffffff + extr x12, x4, x25, #12 + and x25, x23, #0xfffffffffffff + adds x21, x21, x12 + mul x12, x3, x26 + extr x23, x14, x7, #16 + and x23, x23, #0xfffffffffffff + mul x7, x3, x25 + ldp x25, x26, [x2, #32] + add x12, x22, x12 + extr x22, x19, x14, #56 + mul x23, x6, x23 + lsr x14, x14, #4 + extr x24, x25, x24, #16 + add x7, x15, x7 + and x15, x24, #0xfffffffffffff + and x22, x22, #0xfffffffffffff + lsr x24, x4, #52 + mul x15, x3, x15 + and x14, x14, #0xfffffffffffff + add x12, x12, x24 + lsl x24, x4, #12 + lsr x4, x12, #52 + extr x24, x12, x24, #24 + adcs x10, x10, x24 + lsl x24, x12, #12 + add x12, x7, x4 + mul x22, x6, x22 + add x4, x23, x15 + extr x7, x12, x24, #36 + adcs x13, x13, x7 + lsl x15, x12, #12 + add x7, x4, x11 + lsr x24, x12, #52 + ldp x23, x11, [x2, #48] + add x4, x7, x24 + mul x12, x6, x14 + extr x7, x26, x25, #56 + extr x14, x4, x15, #48 + and x2, x7, #0xfffffffffffff + extr x24, x11, x23, #32 + ldp x15, x7, [x1, #48] + and x1, x24, #0xfffffffffffff + lsr x24, x4, #52 + mul x2, x3, x2 + extr x26, x23, x26, #44 + lsr x23, x25, #4 + and x23, x23, #0xfffffffffffff + and x25, x26, #0xfffffffffffff + extr x26, x7, x15, #32 + extr x19, x15, x19, #44 + mul x23, x3, x23 + and x15, x26, #0xfffffffffffff + lsl x26, x4, #12 + and x4, x19, #0xfffffffffffff + lsr x11, x11, #20 + mul x19, x6, x4 + adcs x17, x17, x14 + add x14, x22, x2 + add x22, x12, x23 + lsr x7, x7, #20 + add x22, x22, x24 + extr x2, x22, x26, #60 + mul x24, x3, x25 + lsr x22, x22, #52 + add x14, x14, x22 + lsl x22, x2, #8 + extr x22, x14, x22, #8 + lsl x2, x14, #12 + mul x1, x3, x1 + adcs x12, x5, x22 + mul x5, x6, x15 + and x26, x10, x13 + and x4, x26, x17 + add x23, x19, x24 + lsr x14, x14, #52 + mul x22, x3, x11 + add x11, x23, x14 + extr x25, x11, x2, #20 + lsl x19, x11, #12 + adcs x25, x20, x25 + and x14, x4, x12 + add x1, x5, x1 + and x14, x14, x25 + mul x15, x6, x7 + add x26, x15, x22 + mul x6, x6, x3 + lsr x22, x11, #52 + add x4, x1, x22 + lsr x1, x4, #52 + extr x3, x4, x19, #32 + lsl x15, x4, #12 + add x7, x26, x1 + adcs x23, x8, x3 + extr x20, x7, x15, #44 + and x3, x14, x23 + lsr x19, x7, #44 + adcs x7, x9, x20 + add x11, x6, x19 + adc x4, x16, x11 + lsr x14, x4, #9 + cmp xzr, xzr + and x15, x3, x7 + orr x3, x4, #0xfffffffffffffe00 + adcs xzr, x21, x14 + adcs xzr, x15, xzr + adcs xzr, x3, xzr + adcs x11, x21, x14 + and x14, x11, #0x1ff + adcs x1, x10, xzr + extr x10, x1, x11, #9 + str x14, [x0, #64] + adcs x14, x13, xzr + extr x11, x14, x1, #9 + adcs x1, x17, xzr + extr x4, x1, x14, #9 + stp x10, x11, [x0] + adcs x11, x12, xzr + extr x14, x11, x1, #9 + adcs x10, x25, xzr + extr x11, x10, x11, #9 + stp x4, x14, [x0, #16] + adcs x14, x23, xzr + extr x10, x14, x10, #9 + adcs x1, x7, xzr + stp x11, x10, [x0, #32] + extr x14, x1, x14, #9 + adc x10, x3, xzr + extr x26, x10, x1, #9 + stp x14, x26, [x0, #48] + +// Restore regs and return + + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/p521/bignum_sqr_p521_neon.S b/arm/p521/bignum_sqr_p521_neon.S new file mode 100644 index 00000000..13cd1c25 --- /dev/null +++ b/arm/p521/bignum_sqr_p521_neon.S @@ -0,0 +1,1121 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced +// Input x[9]; output z[9] +// +// extern void bignum_sqr_p521_neon (uint64_t z[static 9], +// uint64_t x[static 9]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_montsqr_p521_neon is functionally equivalent to bignum_montsqr_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// ldp x20, x19, [x1] +// ldr q23, [x1] +// ldr q1, [x1] +// ldr q16, [x1] +// ldp x14, x12, [x1, #16] +// ldr q28, [x1, #16] +// ldr q31, [x1, #16] +// ldp x9, x2, [x1, #32] +// ldr q29, [x1, #32] +// ldr q4, [x1, #32] +// ldr q5, [x1] +// ldr q2, [x1, #32] +// ldp x6, x13, [x1, #48] +// ldr q24, [x1, #48] +// ldr q27, [x1, #48] +// ldr q0, [x1, #16] +// ldr q30, [x1, #48] +// mul x17, x9, x6 +// mul x10, x2, x13 +// umulh x24, x9, x6 +// subs x4, x9, x2 +// cneg x4, x4, cc +// csetm x16, cc +// subs x3, x13, x6 +// cneg x23, x3, cc +// mul x3, x4, x23 +// umulh x4, x4, x23 +// cinv x22, x16, cc +// eor x23, x3, x22 +// eor x16, x4, x22 +// adds x3, x17, x24 +// adc x24, x24, xzr +// umulh x4, x2, x13 +// adds x3, x3, x10 +// adcs x24, x24, x4 +// adc x4, x4, xzr +// adds x24, x24, x10 +// adc x10, x4, xzr +// cmn x22, #0x1 +// adcs x4, x3, x23 +// adcs x24, x24, x16 +// adc x10, x10, x22 +// adds x8, x17, x17 +// adcs x22, x4, x4 +// adcs x5, x24, x24 +// adcs x11, x10, x10 +// adc x23, xzr, xzr +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v4.4S, v4.4S +// xtn v26.2S, v29.2D +// xtn v22.2S, v4.2D +// rev64 v4.4S, v4.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v29.4S, v29.4S +// mul v4.4S, v4.4S, v29.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x15, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x9, x2 +// mov x10, v18.d[0] +// mov x17, v18.d[1] +// umulh x4, x9, x2 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x7, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x8, x8, x10 +// adcs x22, x22, x17 +// adcs x21, x5, xzr +// adcs x5, x11, xzr +// adc x11, x23, xzr +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v27.4S, v27.4S +// xtn v26.2S, v24.2D +// xtn v22.2S, v27.2D +// rev64 v4.4S, v27.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v24.4S, v24.4S +// mul v4.4S, v4.4S, v24.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x23, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x6, x13 +// mov x10, v18.d[0] +// mov x17, v18.d[1] +// umulh x4, x6, x13 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x23, x23, x21 +// adcs x16, x24, x5 +// adcs x3, x10, x11 +// adc x21, x17, xzr +// ldr x17, [x1, #64] +// add x5, x17, x17 +// mul x11, x17, x17 +// and x17, x20, #0xfffffffffffff +// mul x4, x5, x17 +// extr x17, x19, x20, #52 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #12 +// adds x15, x15, x17 +// extr x17, x14, x19, #40 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl x17, x24, #12 +// extr x17, x4, x17, #24 +// adcs x7, x7, x17 +// extr x17, x12, x14, #28 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #36 +// adcs x8, x8, x17 +// extr x17, x9, x12, #16 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl x17, x24, #12 +// extr x17, x4, x17, #48 +// adcs x22, x22, x17 +// lsr x17, x9, #4 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x4, x24, x17, #60 +// extr x17, x2, x9, #56 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x24, x10, x17 +// lsl x17, x4, #8 +// extr x17, x24, x17, #8 +// adcs x23, x23, x17 +// extr x17, x6, x2, #44 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl x17, x24, #12 +// extr x17, x4, x17, #20 +// adcs x16, x16, x17 +// extr x17, x13, x6, #32 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #32 +// adcs x3, x3, x17 +// lsr x17, x13, #20 +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x10, x10, x17 +// lsl x17, x24, #12 +// extr x17, x10, x17, #44 +// adcs x4, x21, x17 +// lsr x17, x10, #44 +// adc x24, x11, x17 +// extr x10, x7, x15, #9 +// extr x17, x8, x7, #9 +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// extr x10, x22, x8, #9 +// extr x17, x23, x22, #9 +// stp x10, x17, [x0, #16] // @slothy:writes=buffer16 +// extr x10, x16, x23, #9 +// extr x17, x3, x16, #9 +// stp x10, x17, [x0, #32] // @slothy:writes=buffer32 +// extr x10, x4, x3, #9 +// extr x17, x24, x4, #9 +// stp x10, x17, [x0, #48] // @slothy:writes=buffer48 +// and x10, x15, #0x1ff +// lsr x17, x24, #9 +// add x17, x10, x17 +// str x17, [x0, #64] // @slothy:writes=buffer64 +// uzp1 v17.4S, v28.4S, v23.4S +// rev64 v4.4S, v28.4S +// uzp1 v7.4S, v23.4S, v23.4S +// mul v4.4S, v4.4S, v23.4S +// uaddlp v4.2D, v4.4S +// shl v4.2D, v4.2D, #32 +// umlal v4.2D, v7.2S, v17.2S +// mov x8, v4.d[0] +// mov x22, v4.d[1] +// umulh x23, x20, x14 +// subs x17, x20, x19 +// cneg x4, x17, cc +// csetm x24, cc +// subs x17, x12, x14 +// cneg x17, x17, cc +// mul x10, x4, x17 +// umulh x17, x4, x17 +// cinv x16, x24, cc +// eor x3, x10, x16 +// eor x4, x17, x16 +// adds x24, x8, x23 +// adc x10, x23, xzr +// umulh x17, x19, x12 +// adds x24, x24, x22 +// adcs x10, x10, x17 +// adc x17, x17, xzr +// adds x10, x10, x22 +// adc x17, x17, xzr +// cmn x16, #0x1 +// adcs x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, x16 +// adds x15, x8, x8 +// adcs x7, x24, x24 +// adcs x8, x10, x10 +// adcs x22, x17, x17 +// adc x23, xzr, xzr +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v16.4S, v16.4S +// xtn v26.2S, v1.2D +// xtn v22.2S, v16.2D +// rev64 v4.4S, v16.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v1.4S, v1.4S +// mul v4.4S, v4.4S, v1.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x21, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x20, x19 +// mov x10, v18.d[0] +// mov x17, v18.d[1] +// umulh x4, x20, x19 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x5, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x11, x15, x10 +// adcs x15, x7, x17 +// adcs x7, x8, xzr +// adcs x8, x22, xzr +// adc x22, x23, xzr +// xtn v7.2S, v31.2D +// shrn v4.2S, v31.2D, #32 +// umull v4.2D, v7.2S, v4.2S +// shl v4.2D, v4.2D, #33 +// umlal v4.2D, v7.2S, v7.2S +// mov x23, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x14, x12 +// umulh x10, x14, x14 +// umulh x17, x12, x12 +// umulh x4, x14, x12 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x16, x23, x7 +// adcs x3, x24, x8 +// adcs x4, x10, x22 +// adc x24, x17, xzr +// ldp x10, x17, [x0] // @slothy:reads=buffer0 +// adds x10, x10, x21 +// adcs x17, x17, x5 +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// ldp x10, x17, [x0, #16] // @slothy:reads=buffer16 +// adcs x10, x10, x11 +// adcs x17, x17, x15 +// stp x10, x17, [x0, #16] // @slothy:writes=buffer16 +// ldp x10, x17, [x0, #32] // @slothy:reads=buffer32 +// adcs x10, x10, x16 +// adcs x17, x17, x3 +// stp x10, x17, [x0, #32] // @slothy:writes=buffer32 +// ldp x10, x17, [x0, #48] // @slothy:reads=buffer48 +// adcs x10, x10, x4 +// adcs x17, x17, x24 +// stp x10, x17, [x0, #48] // @slothy:writes=buffer48 +// ldr x17, [x0, #64] // @slothy:reads=buffer64 +// adc x17, x17, xzr +// str x17, [x0, #64] // @slothy:writes=buffer64 +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v2.4S, v2.4S +// xtn v26.2S, v5.2D +// xtn v22.2S, v2.2D +// rev64 v4.4S, v2.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v5.4S, v5.4S +// mul v4.4S, v4.4S, v5.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x5, v4.d[0] +// mov x4, v4.d[1] +// movi v25.2D, #0xffffffff +// uzp2 v17.4S, v30.4S, v30.4S +// xtn v19.2S, v0.2D +// xtn v26.2S, v30.2D +// rev64 v4.4S, v30.4S +// umull v7.2D, v19.2S, v26.2S +// umull v22.2D, v19.2S, v17.2S +// uzp2 v21.4S, v0.4S, v0.4S +// mul v4.4S, v4.4S, v0.4S +// usra v22.2D, v7.2D, #32 +// umull v17.2D, v21.2S, v17.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v22.16B, v25.16B +// umlal v7.2D, v21.2S, v26.2S +// shl v4.2D, v4.2D, #32 +// usra v17.2D, v22.2D, #32 +// umlal v4.2D, v19.2S, v26.2S +// usra v17.2D, v7.2D, #32 +// mov x24, v4.d[0] +// mov x10, v4.d[1] +// mov x17, v18.d[0] +// adds x4, x4, x17 +// mov x17, v18.d[1] +// adcs x24, x24, x17 +// mov x17, v17.d[0] +// adcs x10, x10, x17 +// mov x17, v17.d[1] +// adc x17, x17, xzr +// adds x15, x4, x5 +// adcs x4, x24, x4 +// adcs x24, x10, x24 +// adcs x10, x17, x10 +// adc x17, xzr, x17 +// adds x7, x4, x5 +// adcs x8, x24, x15 +// adcs x22, x10, x4 +// adcs x23, x17, x24 +// adcs x16, xzr, x10 +// adc x3, xzr, x17 +// subs x17, x14, x12 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x13, x6 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x23, x23, x17 +// eor x17, x24, x10 +// adcs x16, x16, x17 +// adc x3, x3, x10 +// subs x17, x20, x19 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x2, x9 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x11, x15, x17 +// eor x17, x24, x10 +// adcs x15, x7, x17 +// adcs x7, x8, x10 +// adcs x22, x22, x10 +// adcs x23, x23, x10 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x17, x19, x12 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x13, x2 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x8, x22, x17 +// eor x17, x24, x10 +// adcs x23, x23, x17 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x17, x20, x14 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x6, x9 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x22, x15, x17 +// eor x17, x24, x10 +// adcs x4, x7, x17 +// adcs x24, x8, x10 +// adcs x23, x23, x10 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x12, x20, x12 +// cneg x10, x12, cc +// csetm x17, cc +// subs x12, x13, x9 +// cneg x9, x12, cc +// mul x12, x10, x9 +// umulh x13, x10, x9 +// cinv x9, x17, cc +// cmn x9, #0x1 +// eor x12, x12, x9 +// adcs x4, x4, x12 +// eor x12, x13, x9 +// adcs x24, x24, x12 +// adcs x10, x23, x9 +// adcs x17, x16, x9 +// adc x13, x3, x9 +// subs x19, x19, x14 +// cneg x12, x19, cc +// csetm x9, cc +// subs x6, x6, x2 +// cneg x14, x6, cc +// mul x19, x12, x14 +// umulh x12, x12, x14 +// cinv x14, x9, cc +// cmn x14, #0x1 +// eor x19, x19, x14 +// adcs x23, x4, x19 +// eor x19, x12, x14 +// adcs x16, x24, x19 +// adcs x6, x10, x14 +// adcs x2, x17, x14 +// adc x9, x13, x14 +// ldp x12, x14, [x0] // @slothy:reads=buffer0 +// extr x19, x6, x16, #8 +// adds x10, x19, x12 +// extr x19, x2, x6, #8 +// adcs x17, x19, x14 +// ldp x14, x12, [x0, #16] // @slothy:reads=buffer16 +// extr x19, x9, x2, #8 +// adcs x13, x19, x14 +// and x14, x17, x13 +// lsr x19, x9, #8 +// adcs x6, x19, x12 +// and x9, x14, x6 +// ldp x14, x12, [x0, #32] // @slothy:reads=buffer32 +// lsl x19, x5, #1 +// adcs x2, x19, x14 +// and x14, x9, x2 +// extr x19, x11, x5, #63 +// adcs x3, x19, x12 +// and x9, x14, x3 +// ldp x14, x12, [x0, #48] // @slothy:reads=buffer48 +// extr x19, x22, x11, #63 +// adcs x4, x19, x14 +// and x14, x9, x4 +// extr x19, x23, x22, #63 +// adcs x24, x19, x12 +// and x12, x14, x24 +// ldr x14, [x0, #64] // @slothy:reads=buffer64 +// extr x19, x16, x23, #63 +// and x19, x19, #0x1ff +// adc x19, x14, x19 +// lsr x14, x19, #9 +// orr x19, x19, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x10, x14 +// adcs xzr, x12, xzr +// adcs xzr, x19, xzr +// adcs x10, x10, x14 +// adcs x17, x17, xzr +// adcs x13, x13, xzr +// adcs x6, x6, xzr +// adcs x2, x2, xzr +// adcs x9, x3, xzr +// adcs x12, x4, xzr +// adcs x14, x24, xzr +// adc x19, x19, xzr +// and x19, x19, #0x1ff +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// stp x13, x6, [x0, #16] // @slothy:writes=buffer16 +// stp x2, x9, [x0, #32] // @slothy:writes=buffer32 +// stp x12, x14, [x0, #48] // @slothy:writes=buffer48 +// str x19, [x0, #64] // @slothy:writes=buffer64 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_sqr_p521_neon): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + + ldr q23, [x1, #32] + ldp x9, x2, [x1, #32] + ldr q16, [x1, #32] + ldr q20, [x1, #48] + ldp x6, x13, [x1, #48] + rev64 v2.4S, v23.4S + mul x14, x9, x2 + ldr q31, [x1, #48] + subs x22, x9, x2 + uzp2 v26.4S, v23.4S, v23.4S + mul v30.4S, v2.4S, v16.4S + xtn v0.2S, v20.2D + csetm x12, cc + xtn v21.2S, v16.2D + xtn v23.2S, v23.2D + umulh x10, x9, x6 + rev64 v27.4S, v31.4S + umull v2.2D, v21.2S, v26.2S + cneg x23, x22, cc + uaddlp v25.2D, v30.4S + umull v18.2D, v21.2S, v23.2S + mul x22, x9, x6 + mul v6.4S, v27.4S, v20.4S + uzp2 v17.4S, v20.4S, v20.4S + shl v20.2D, v25.2D, #32 + uzp2 v27.4S, v31.4S, v31.4S + mul x16, x2, x13 + umlal v20.2D, v21.2S, v23.2S + usra v2.2D, v18.2D, #32 + adds x8, x22, x10 + umull v25.2D, v17.2S, v27.2S + xtn v31.2S, v31.2D + movi v1.2D, #0xffffffff + adc x3, x10, xzr + umulh x21, x2, x13 + uzp2 v21.4S, v16.4S, v16.4S + umull v18.2D, v0.2S, v27.2S + subs x19, x13, x6 + and v7.16B, v2.16B, v1.16B + umull v27.2D, v0.2S, v31.2S + cneg x20, x19, cc + movi v30.2D, #0xffffffff + umull v16.2D, v21.2S, v26.2S + umlal v7.2D, v21.2S, v23.2S + mul x19, x23, x20 + cinv x7, x12, cc + uaddlp v6.2D, v6.4S + eor x12, x19, x7 + adds x11, x8, x16 + umulh x10, x23, x20 + ldr q1, [x1] + usra v16.2D, v2.2D, #32 + adcs x19, x3, x21 + shl v2.2D, v6.2D, #32 + adc x20, x21, xzr + adds x17, x19, x16 + usra v18.2D, v27.2D, #32 + adc x19, x20, xzr + cmn x7, #0x1 + umlal v2.2D, v0.2S, v31.2S + umulh x16, x9, x2 + adcs x8, x11, x12 + usra v16.2D, v7.2D, #32 + ldr x12, [x1, #64] + eor x20, x10, x7 + umulh x10, x6, x13 + mov x23, v2.d[0] + mov x3, v2.d[1] + adcs x21, x17, x20 + usra v25.2D, v18.2D, #32 + and v23.16B, v18.16B, v30.16B + adc x7, x19, x7 + adds x22, x22, x22 + ldr q7, [x1, #16] + adcs x17, x8, x8 + umlal v23.2D, v17.2S, v31.2S + mov x19, v16.d[0] + mul x11, x12, x12 + ldr q4, [x1] + usra v25.2D, v23.2D, #32 + add x5, x12, x12 + adcs x15, x21, x21 + ldr q28, [x1] + mov x12, v20.d[1] + adcs x24, x7, x7 + mov x21, v16.d[1] + adc x4, xzr, xzr + adds x19, x19, x14 + ldr q18, [x1, #16] + xtn v26.2S, v1.2D + adcs x8, x12, x16 + adc x21, x21, xzr + adds x7, x19, x14 + xtn v23.2S, v7.2D + rev64 v21.4S, v28.4S + adcs x12, x8, x16 + ldp x20, x19, [x1] + mov x16, v25.d[1] + xtn v22.2S, v28.2D + adc x14, x21, xzr + adds x8, x22, x12 + uzp2 v24.4S, v28.4S, v28.4S + rev64 v28.4S, v18.4S + mul x12, x6, x13 + mul v16.4S, v21.4S, v1.4S + shrn v31.2S, v7.2D, #32 + adcs x22, x17, x14 + mov x14, v25.d[0] + and x21, x20, #0xfffffffffffff + umull v17.2D, v26.2S, v24.2S + ldr q2, [x1, #32] + adcs x17, x15, xzr + ldr q30, [x1, #48] + umull v7.2D, v26.2S, v22.2S + adcs x15, x24, xzr + ldr q0, [x1, #16] + movi v6.2D, #0xffffffff + adc x4, x4, xzr + adds x14, x14, x12 + uzp1 v27.4S, v18.4S, v4.4S + uzp2 v19.4S, v1.4S, v1.4S + adcs x24, x3, x10 + mul x3, x5, x21 + umull v29.2D, v23.2S, v31.2S + ldr q5, [x1] + adc x21, x16, xzr + adds x16, x14, x12 + extr x12, x19, x20, #52 + umull v18.2D, v19.2S, v24.2S + adcs x24, x24, x10 + and x10, x12, #0xfffffffffffff + ldp x14, x12, [x1, #16] + usra v17.2D, v7.2D, #32 + adc x21, x21, xzr + adds x23, x23, x17 + mul x17, x5, x10 + shl v21.2D, v29.2D, #33 + lsl x10, x3, #12 + lsr x1, x3, #52 + rev64 v29.4S, v2.4S + uaddlp v25.2D, v16.4S + add x17, x17, x1 + adcs x16, x16, x15 + extr x3, x14, x19, #40 + mov x15, v20.d[0] + extr x10, x17, x10, #12 + and x3, x3, #0xfffffffffffff + shl v3.2D, v25.2D, #32 + and v6.16B, v17.16B, v6.16B + mul x1, x5, x3 + usra v18.2D, v17.2D, #32 + adcs x3, x24, x4 + extr x4, x12, x14, #28 + umlal v6.2D, v19.2S, v22.2S + xtn v20.2S, v2.2D + umlal v3.2D, v26.2S, v22.2S + movi v26.2D, #0xffffffff + lsr x24, x17, #52 + and x4, x4, #0xfffffffffffff + uzp2 v19.4S, v2.4S, v2.4S + add x1, x1, x24 + mul x24, x5, x4 + lsl x4, x17, #12 + xtn v24.2S, v5.2D + extr x17, x1, x4, #24 + adc x21, x21, xzr + umlal v21.2D, v23.2S, v23.2S + adds x4, x15, x10 + lsl x10, x1, #12 + adcs x15, x7, x17 + mul v23.4S, v28.4S, v4.4S + and x7, x4, #0x1ff + lsr x17, x1, #52 + umulh x1, x19, x12 + uzp2 v17.4S, v5.4S, v5.4S + extr x4, x15, x4, #9 + add x24, x24, x17 + mul v29.4S, v29.4S, v5.4S + extr x17, x24, x10, #36 + extr x10, x9, x12, #16 + uzp1 v28.4S, v4.4S, v4.4S + adcs x17, x8, x17 + and x8, x10, #0xfffffffffffff + umull v16.2D, v24.2S, v20.2S + extr x10, x17, x15, #9 + mul x15, x5, x8 + stp x4, x10, [x0] + lsl x4, x24, #12 + lsr x8, x9, #4 + uaddlp v4.2D, v23.4S + and x8, x8, #0xfffffffffffff + umull v23.2D, v24.2S, v19.2S + mul x8, x5, x8 + extr x10, x2, x9, #56 + lsr x24, x24, #52 + and x10, x10, #0xfffffffffffff + add x15, x15, x24 + extr x4, x15, x4, #48 + mul x24, x5, x10 + lsr x10, x15, #52 + usra v23.2D, v16.2D, #32 + add x10, x8, x10 + shl v4.2D, v4.2D, #32 + adcs x22, x22, x4 + extr x4, x6, x2, #44 + lsl x15, x15, #12 + lsr x8, x10, #52 + extr x15, x10, x15, #60 + and x10, x4, #0xfffffffffffff + umlal v4.2D, v28.2S, v27.2S + add x8, x24, x8 + extr x4, x13, x6, #32 + mul x24, x5, x10 + uzp2 v16.4S, v30.4S, v30.4S + lsl x10, x15, #8 + rev64 v28.4S, v30.4S + and x15, x4, #0xfffffffffffff + extr x4, x8, x10, #8 + mul x10, x5, x15 + lsl x15, x8, #12 + adcs x23, x23, x4 + lsr x4, x8, #52 + lsr x8, x13, #20 + add x4, x24, x4 + mul x8, x5, x8 + lsr x24, x4, #52 + extr x15, x4, x15, #20 + lsl x4, x4, #12 + add x10, x10, x24 + adcs x15, x16, x15 + extr x4, x10, x4, #32 + umulh x5, x20, x14 + adcs x3, x3, x4 + usra v18.2D, v6.2D, #32 + lsl x16, x10, #12 + extr x24, x15, x23, #9 + lsr x10, x10, #52 + uzp2 v27.4S, v0.4S, v0.4S + add x8, x8, x10 + extr x10, x3, x15, #9 + extr x4, x22, x17, #9 + and v25.16B, v23.16B, v26.16B + lsr x17, x8, #44 + extr x15, x8, x16, #44 + extr x16, x23, x22, #9 + xtn v7.2S, v30.2D + mov x8, v4.d[0] + stp x24, x10, [x0, #32] + uaddlp v30.2D, v29.4S + stp x4, x16, [x0, #16] + umulh x24, x20, x19 + adcs x15, x21, x15 + adc x16, x11, x17 + subs x11, x20, x19 + xtn v5.2S, v0.2D + csetm x17, cc + extr x3, x15, x3, #9 + mov x22, v4.d[1] + cneg x21, x11, cc + subs x10, x12, x14 + mul v31.4S, v28.4S, v0.4S + cneg x10, x10, cc + cinv x11, x17, cc + shl v4.2D, v30.2D, #32 + umull v28.2D, v5.2S, v16.2S + extr x23, x16, x15, #9 + adds x4, x8, x5 + mul x17, x21, x10 + umull v22.2D, v5.2S, v7.2S + adc x15, x5, xzr + adds x4, x4, x22 + uaddlp v2.2D, v31.4S + lsr x5, x16, #9 + adcs x16, x15, x1 + mov x15, v18.d[0] + adc x1, x1, xzr + umulh x10, x21, x10 + adds x22, x16, x22 + umlal v4.2D, v24.2S, v20.2S + umull v30.2D, v27.2S, v16.2S + stp x3, x23, [x0, #48] + add x3, x7, x5 + adc x16, x1, xzr + usra v28.2D, v22.2D, #32 + mul x23, x20, x19 + eor x1, x17, x11 + cmn x11, #0x1 + mov x17, v18.d[1] + umull v18.2D, v17.2S, v19.2S + adcs x7, x4, x1 + eor x1, x10, x11 + umlal v25.2D, v17.2S, v20.2S + movi v16.2D, #0xffffffff + adcs x22, x22, x1 + usra v18.2D, v23.2D, #32 + umulh x4, x14, x14 + adc x1, x16, x11 + adds x10, x8, x8 + shl v23.2D, v2.2D, #32 + str x3, [x0, #64] + adcs x5, x7, x7 + and v16.16B, v28.16B, v16.16B + usra v30.2D, v28.2D, #32 + adcs x7, x22, x22 + mov x21, v3.d[1] + adcs x11, x1, x1 + umlal v16.2D, v27.2S, v7.2S + adc x22, xzr, xzr + adds x16, x15, x23 + mul x8, x14, x12 + umlal v23.2D, v5.2S, v7.2S + usra v18.2D, v25.2D, #32 + umulh x15, x14, x12 + adcs x21, x21, x24 + usra v30.2D, v16.2D, #32 + adc x1, x17, xzr + adds x3, x16, x23 + adcs x21, x21, x24 + adc x1, x1, xzr + adds x24, x10, x21 + umulh x21, x12, x12 + adcs x16, x5, x1 + adcs x10, x7, xzr + mov x17, v21.d[1] + adcs x23, x11, xzr + adc x5, x22, xzr + adds x1, x4, x8 + adcs x22, x17, x15 + ldp x17, x4, [x0] + mov x11, v21.d[0] + adc x21, x21, xzr + adds x1, x1, x8 + adcs x15, x22, x15 + adc x8, x21, xzr + adds x22, x11, x10 + mov x21, v3.d[0] + adcs x11, x1, x23 + ldp x1, x10, [x0, #16] + adcs x15, x15, x5 + adc x7, x8, xzr + adds x8, x17, x21 + mov x23, v4.d[1] + ldp x5, x21, [x0, #32] + adcs x17, x4, x3 + ldr x4, [x0, #64] + mov x3, v18.d[0] + adcs x24, x1, x24 + stp x8, x17, [x0] + adcs x17, x10, x16 + ldp x1, x16, [x0, #48] + adcs x5, x5, x22 + adcs x8, x21, x11 + stp x5, x8, [x0, #32] + adcs x1, x1, x15 + mov x15, v23.d[1] + adcs x21, x16, x7 + stp x1, x21, [x0, #48] + adc x10, x4, xzr + subs x7, x14, x12 + mov x16, v18.d[1] + cneg x5, x7, cc + csetm x4, cc + subs x11, x13, x6 + mov x8, v23.d[0] + cneg x7, x11, cc + cinv x21, x4, cc + mov x11, v30.d[0] + adds x4, x23, x3 + mul x22, x5, x7 + mov x23, v30.d[1] + adcs x8, x8, x16 + adcs x16, x15, x11 + adc x11, x23, xzr + umulh x3, x5, x7 + stp x24, x17, [x0, #16] + mov x5, v4.d[0] + subs x15, x20, x19 + cneg x7, x15, cc + str x10, [x0, #64] + csetm x1, cc + subs x24, x2, x9 + cneg x17, x24, cc + cinv x15, x1, cc + adds x23, x4, x5 + umulh x1, x7, x17 + adcs x24, x8, x4 + adcs x10, x16, x8 + eor x8, x22, x21 + adcs x16, x11, x16 + mul x22, x7, x17 + eor x17, x1, x15 + adc x1, xzr, x11 + adds x11, x24, x5 + eor x7, x3, x21 + adcs x3, x10, x23 + adcs x24, x16, x24 + adcs x4, x1, x10 + eor x10, x22, x15 + adcs x16, xzr, x16 + adc x1, xzr, x1 + cmn x21, #0x1 + adcs x8, x4, x8 + adcs x22, x16, x7 + adc x7, x1, x21 + subs x21, x19, x12 + csetm x4, cc + cneg x1, x21, cc + subs x21, x13, x2 + cinv x16, x4, cc + cneg x4, x21, cc + cmn x15, #0x1 + adcs x21, x23, x10 + mul x23, x1, x4 + adcs x11, x11, x17 + adcs x3, x3, x15 + umulh x1, x1, x4 + adcs x24, x24, x15 + adcs x8, x8, x15 + adcs x22, x22, x15 + eor x17, x23, x16 + adc x15, x7, x15 + subs x7, x20, x14 + cneg x7, x7, cc + csetm x4, cc + subs x10, x20, x12 + cneg x23, x10, cc + csetm x10, cc + subs x12, x6, x9 + cinv x20, x4, cc + cneg x12, x12, cc + cmn x16, #0x1 + eor x1, x1, x16 + adcs x17, x24, x17 + mul x4, x7, x12 + adcs x8, x8, x1 + umulh x1, x7, x12 + adcs x24, x22, x16 + adc x7, x15, x16 + subs x12, x13, x9 + cneg x12, x12, cc + cinv x13, x10, cc + subs x19, x19, x14 + mul x9, x23, x12 + cneg x19, x19, cc + csetm x10, cc + eor x16, x1, x20 + subs x22, x6, x2 + umulh x12, x23, x12 + eor x1, x4, x20 + cinv x4, x10, cc + cneg x22, x22, cc + cmn x20, #0x1 + adcs x15, x11, x1 + eor x6, x12, x13 + adcs x10, x3, x16 + adcs x17, x17, x20 + eor x23, x9, x13 + adcs x2, x8, x20 + mul x11, x19, x22 + adcs x24, x24, x20 + adc x7, x7, x20 + cmn x13, #0x1 + adcs x3, x10, x23 + umulh x22, x19, x22 + adcs x17, x17, x6 + eor x12, x22, x4 + extr x22, x15, x21, #63 + adcs x8, x2, x13 + extr x21, x21, x5, #63 + ldp x16, x23, [x0] + adcs x20, x24, x13 + eor x1, x11, x4 + adc x6, x7, x13 + cmn x4, #0x1 + ldp x2, x7, [x0, #16] + adcs x1, x3, x1 + extr x19, x1, x15, #63 + adcs x14, x17, x12 + extr x1, x14, x1, #63 + lsl x17, x5, #1 + adcs x8, x8, x4 + extr x12, x8, x14, #8 + ldp x15, x11, [x0, #32] + adcs x9, x20, x4 + adc x3, x6, x4 + adds x16, x12, x16 + extr x6, x9, x8, #8 + ldp x14, x12, [x0, #48] + extr x8, x3, x9, #8 + adcs x20, x6, x23 + ldr x24, [x0, #64] + lsr x6, x3, #8 + adcs x8, x8, x2 + and x2, x1, #0x1ff + and x1, x20, x8 + adcs x4, x6, x7 + adcs x3, x17, x15 + and x1, x1, x4 + adcs x9, x21, x11 + and x1, x1, x3 + adcs x6, x22, x14 + and x1, x1, x9 + and x21, x1, x6 + adcs x14, x19, x12 + adc x1, x24, x2 + cmp xzr, xzr + orr x12, x1, #0xfffffffffffffe00 + lsr x1, x1, #9 + adcs xzr, x16, x1 + and x21, x21, x14 + adcs xzr, x21, xzr + adcs xzr, x12, xzr + adcs x21, x16, x1 + adcs x1, x20, xzr + adcs x19, x8, xzr + stp x21, x1, [x0] + adcs x1, x4, xzr + adcs x21, x3, xzr + stp x19, x1, [x0, #16] + adcs x1, x9, xzr + stp x21, x1, [x0, #32] + adcs x21, x6, xzr + adcs x1, x14, xzr + stp x21, x1, [x0, #48] + adc x1, x12, xzr + and x1, x1, #0x1ff + str x1, [x0, #64] + +// Restore regs and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/proofs/bignum_mul_p521.ml b/arm/proofs/bignum_mul_p521.ml index 04fc37ff..8c9ccf60 100644 --- a/arm/proofs/bignum_mul_p521.ml +++ b/arm/proofs/bignum_mul_p521.ml @@ -654,6 +654,15 @@ let bignum_mul_p521_mc = let BIGNUM_MUL_P521_EXEC = ARM_MK_EXEC_RULE bignum_mul_p521_mc;; +(* bignum_mul_p521_mc without callee-save register spills + ret. *) +let bignum_mul_p521_core_mc_def, + bignum_mul_p521_core_mc, + BIGNUM_MUL_P521_CORE_EXEC = + mk_sublist_of_mc "bignum_mul_p521_core_mc" + bignum_mul_p521_mc + (`20`,`LENGTH bignum_mul_p521_mc - 44`) + (fst BIGNUM_MUL_P521_EXEC);; + (* ------------------------------------------------------------------------- *) (* Proof. *) (* ------------------------------------------------------------------------- *) @@ -753,20 +762,20 @@ let ADK_48_TAC = DECARRY_RULE o CONJUNCTS) THEN DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC;; -let BIGNUM_MUL_P521_CORRECT = prove +let BIGNUM_MUL_P521_CORE_CORRECT = prove (`!z x y a b pc stackpointer. aligned 16 stackpointer /\ ALL (nonoverlapping (stackpointer,80)) - [(word pc,0x9ec); (z,8 * 9); (x,8 * 9); (y,8 * 9)] /\ - nonoverlapping (z,8 * 9) (word pc,0x9ec) + [(word pc,LENGTH bignum_mul_p521_core_mc); (z,8 * 9); (x,8 * 9); (y,8 * 9)] /\ + nonoverlapping (z,8 * 9) (word pc,LENGTH bignum_mul_p521_core_mc) ==> ensures arm - (\s. aligned_bytes_loaded s (word pc) bignum_mul_p521_mc /\ - read PC s = word(pc + 0x14) /\ + (\s. aligned_bytes_loaded s (word pc) bignum_mul_p521_core_mc /\ + read PC s = word(pc) /\ read SP s = stackpointer /\ C_ARGUMENTS [z; x; y] s /\ bignum_from_memory (x,9) s = a /\ bignum_from_memory (y,9) s = b) - (\s. read PC s = word (pc + 0x9d4) /\ + (\s. read PC s = word (pc + LENGTH bignum_mul_p521_core_mc) /\ (a < p_521 /\ b < p_521 ==> bignum_from_memory (z,9) s = (a * b) MOD p_521)) (MAYCHANGE [PC; X3; X4; X5; X6; X7; X8; X9; @@ -778,14 +787,15 @@ let BIGNUM_MUL_P521_CORRECT = prove MAP_EVERY X_GEN_TAC [`z:int64`; `x:int64`; `y:int64`; `a:num`; `b:num`; `pc:num`; `stackpointer:int64`] THEN - REWRITE_TAC[ALL; C_ARGUMENTS; SOME_FLAGS; NONOVERLAPPING_CLAUSES] THEN + REWRITE_TAC[ALL; C_ARGUMENTS; SOME_FLAGS; NONOVERLAPPING_CLAUSES; + fst BIGNUM_MUL_P521_CORE_EXEC] THEN DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN (*** Globalize the a < p_521 /\ b < p_521 assumption for simplicity ***) ASM_CASES_TAC `a < p_521 /\ b < p_521` THENL [ASM_REWRITE_TAC[] THEN FIRST_X_ASSUM(CONJUNCTS_THEN ASSUME_TAC); - ARM_SIM_TAC BIGNUM_MUL_P521_EXEC (1--624)] THEN + ARM_SIM_TAC BIGNUM_MUL_P521_CORE_EXEC (1--624)] THEN (*** Digitize, deduce the bound on the top words ***) @@ -803,7 +813,7 @@ let BIGNUM_MUL_P521_CORRECT = prove (*** 4x4 multiplication of the low portions and its rebasing ***) - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC [5; 6; 7; 8; 10; 12; 14; 16; 17; 18; 19; 20; 21; 22; 23; 24; 25; 26; 27; 33; 38; 40; 41; 47; 52; 54; 55; 56; 57; 58; 59; 65; 70; 72; 73; 74; 80; 85; 87; 88; 89; 90; 91; 97; 102; 104; 105; 106; @@ -865,7 +875,7 @@ let BIGNUM_MUL_P521_CORRECT = prove (*** 4x4 multiplication of the high portions ***) - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC [138; 139; 140; 141; 143; 145; 147; 149; 150; 151; 152; 153; 154; 155; 156; 157; 158; 159; 160; 166; 171; 173; 174; 180; 185; 187; 188; 189; 190; 191; 192; 198; 203; 205; 206; 207; @@ -885,7 +895,7 @@ let BIGNUM_MUL_P521_CORRECT = prove (*** Addition combining high and low parts into hl ***) - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC [258; 259; 262; 263; 266; 267; 270; 271; 274] (257--275) THEN ABBREV_TAC @@ -930,7 +940,7 @@ let BIGNUM_MUL_P521_CORRECT = prove (*** The sign-magnitude difference computation ***) - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC [277; 278; 280; 281; 284; 285; 287; 288; 291; 293; 295; 297; 299; 301; 303; 305] (276--306) THEN @@ -996,7 +1006,7 @@ let BIGNUM_MUL_P521_CORRECT = prove (*** One more 4x4 multiplication of the cross-terms ***) - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC [307; 308; 309; 310; 312; 314; 316; 318; 319; 320; 321; 322; 323; 324; 325; 326; 327; 328; 329; 335; 340; 342; 343; 349; 354; 356; 357; 358; 359; 360; 361; 367; 372; 374; 375; 376; @@ -1020,7 +1030,7 @@ let BIGNUM_MUL_P521_CORRECT = prove EXPAND_TAC "hl" THEN CONV_TAC(LAND_CONV(ONCE_DEPTH_CONV BIGNUM_OF_WORDLIST_DIV_CONV)) THEN DISCH_TAC THEN - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC [429; 431; 433; 435; 440; 442; 444; 446] (426--447) THEN ABBREV_TAC @@ -1096,7 +1106,7 @@ let BIGNUM_MUL_P521_CORRECT = prove DISCH_THEN(ASSUME_TAC o MATCH_MP (INTEGER_RULE `(x:int == a + y) (mod n) /\ (y' == y) (mod n) ==> (x == a + y') (mod n)`))] THEN - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC [448; 449; 450; 451; 455; 457; 459; 461] (448--463) THEN ABBREV_TAC @@ -1174,7 +1184,7 @@ let BIGNUM_MUL_P521_CORRECT = prove (*** The intricate augmentation of the product with top words ***) - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC [484; 498; 510; 527; 551; 566; 579; 590; 595] (464--595) THEN SUBGOAL_THEN @@ -1466,7 +1476,7 @@ let BIGNUM_MUL_P521_CORRECT = prove STRIP_ASSUME_TAC THENL [REWRITE_TAC[MOD_LT_EQ] THEN UNDISCH_TAC `n < 2 EXP 576` THEN ARITH_TAC; ALL_TAC] THEN - ARM_STEPS_TAC BIGNUM_MUL_P521_EXEC (596--598) THEN + ARM_STEPS_TAC BIGNUM_MUL_P521_CORE_EXEC (596--598) THEN RULE_ASSUM_TAC(REWRITE_RULE[GSYM WORD_AND_ASSOC; DIMINDEX_64; NUM_REDUCE_CONV `9 MOD 64`]) THEN MAP_EVERY ABBREV_TAC @@ -1476,7 +1486,7 @@ let BIGNUM_MUL_P521_CORRECT = prove word_and sum_s498 (word_and sum_s510 (word_and sum_s527 (word_and sum_s551 (word_and sum_s566 (word_and sum_s579 sum_s590)))))`] THEN - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC (599--601) (599--601) THEN + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC (599--601) (599--601) THEN SUBGOAL_THEN `carry_s601 <=> 2 EXP 192 <= @@ -1488,7 +1498,7 @@ let BIGNUM_MUL_P521_CORRECT = prove ACCUMULATOR_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN - ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_EXEC (602--610) (602--610) THEN + ARM_ACCSTEPS_TAC BIGNUM_MUL_P521_CORE_EXEC (602--610) (602--610) THEN SUBGOAL_THEN `val(d:int64) = 2 EXP 9 * (2 EXP 55 - 1) + val(sum_s595:int64) MOD 2 EXP 9` SUBST_ALL_TAC THENL @@ -1600,7 +1610,7 @@ let BIGNUM_MUL_P521_CORRECT = prove (*** The rotation to shift from the 512 position ***) - ARM_STEPS_TAC BIGNUM_MUL_P521_EXEC (611--624) THEN + ARM_STEPS_TAC BIGNUM_MUL_P521_CORE_EXEC (611--624) THEN ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN CONV_TAC MOD_DOWN_CONV THEN CONV_TAC SYM_CONV THEN REWRITE_TAC[MOD_UNIQUE] THEN @@ -1628,6 +1638,40 @@ let BIGNUM_MUL_P521_CORRECT = prove CONV_TAC NUM_REDUCE_CONV THEN REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN REAL_INTEGER_TAC);; +let BIGNUM_MUL_P521_CORRECT = time prove + (`!z x y a b pc stackpointer. + aligned 16 stackpointer /\ + ALL (nonoverlapping (stackpointer,80)) + [(word pc,LENGTH bignum_mul_p521_mc); (z,8 * 9); + (x,8 * 9); (y,8 * 9)] /\ + nonoverlapping (z,8 * 9) (word pc,LENGTH bignum_mul_p521_mc) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_mul_p521_mc /\ + read PC s = word(pc + 20) /\ + read SP s = stackpointer /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,9) s = a /\ + bignum_from_memory (y,9) s = b) + (\s. read PC s = word (pc + 20 + LENGTH bignum_mul_p521_core_mc) /\ + (a < p_521 /\ b < p_521 + ==> bignum_from_memory (z,9) s = (a * b) MOD p_521)) + (MAYCHANGE [PC; X3; X4; X5; X6; X7; X8; X9; + X10; X11; X12; X13; X14; X15; X16; X17; X19; + X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)])`, + + ARM_SUB_LIST_OF_MC_TAC BIGNUM_MUL_P521_CORE_CORRECT + bignum_mul_p521_core_mc_def + [fst BIGNUM_MUL_P521_CORE_EXEC;fst BIGNUM_MUL_P521_EXEC] THEN + + REPEAT (POP_ASSUM MP_TAC) THEN + REWRITE_TAC([fst BIGNUM_MUL_P521_CORE_EXEC;fst BIGNUM_MUL_P521_EXEC;ALL; + NONOVERLAPPING_CLAUSES]) THEN + REPEAT STRIP_TAC THEN + ASM_REWRITE_TAC[] THEN NONOVERLAPPING_TAC);; + let BIGNUM_MUL_P521_SUBROUTINE_CORRECT = prove (`!z x y a b pc stackpointer returnaddress. aligned 16 stackpointer /\ @@ -1648,6 +1692,8 @@ let BIGNUM_MUL_P521_SUBROUTINE_CORRECT = prove (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, MAYCHANGE [memory :> bytes(z,8 * 9); memory :> bytes(word_sub stackpointer (word 144),144)])`, - ARM_ADD_RETURN_STACK_TAC - BIGNUM_MUL_P521_EXEC BIGNUM_MUL_P521_CORRECT + let th = CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV) + (REWRITE_RULE [fst BIGNUM_MUL_P521_CORE_EXEC;fst BIGNUM_MUL_P521_EXEC] + BIGNUM_MUL_P521_CORRECT) in + ARM_ADD_RETURN_STACK_TAC BIGNUM_MUL_P521_EXEC th `[X19;X20;X21;X22;X23;X24;X25;X26]` 144);; diff --git a/arm/proofs/bignum_mul_p521_neon.ml b/arm/proofs/bignum_mul_p521_neon.ml new file mode 100644 index 00000000..4a1d688c --- /dev/null +++ b/arm/proofs/bignum_mul_p521_neon.ml @@ -0,0 +1,1206 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Multiplication modulo p_521. *) +(* ========================================================================= *) + +needs "arm/proofs/bignum_mul_p521.ml";; +needs "arm/proofs/equiv.ml";; +needs "arm/proofs/neon_helper.ml";; + +(* This is the intermediate program that is equivalent to the core parts of + bignum_mul_p521 and bignum_mul_p521_neon. This is a vectorized + version of core of bignum_mul_p521 but instructions are unscheduled. *) + +let bignum_mul_p521_interm1_ops:int list = [ + 0xa940542f; (* ldp x15, x21, [x1] *) + 0xa941442a; (* ldp x10, x17, [x1, #16] *) + 0xa940404d; (* ldp x13, x16, [x2] *) + 0x3dc00032; (* ldr q18, [x1] *) + 0x3dc0005c; (* ldr q28, [x2] *) + 0xa9415045; (* ldp x5, x20, [x2, #16] *) + 0x6f00e5f0; (* movi v16.2d, #0xffffffff *) + 0x4e9c5b87; (* uzp2 v7.4s, v28.4s, v28.4s *) + 0x0ea12a44; (* xtn v4.2s, v18.2d *) + 0x0ea12b81; (* xtn v1.2s, v28.2d *) + 0x4ea00b9b; (* rev64 v27.4s, v28.4s *) + 0x2ea1c095; (* umull v21.2d, v4.2s, v1.2s *) + 0x2ea7c09c; (* umull v28.2d, v4.2s, v7.2s *) + 0x4e925a45; (* uzp2 v5.4s, v18.4s, v18.4s *) + 0x4eb29f72; (* mul v18.4s, v27.4s, v18.4s *) + 0x6f6016bc; (* usra v28.2d, v21.2d, #32 *) + 0x2ea7c0bd; (* umull v29.2d, v5.2s, v7.2s *) + 0x6ea02a52; (* uaddlp v18.2d, v18.4s *) + 0x4e301f90; (* and v16.16b, v28.16b, v16.16b *) + 0x2ea180b0; (* umlal v16.2d, v5.2s, v1.2s *) + 0x4f605652; (* shl v18.2d, v18.2d, #32 *) + 0x6f60179d; (* usra v29.2d, v28.2d, #32 *) + 0x2ea18092; (* umlal v18.2d, v4.2s, v1.2s *) + 0x6f60161d; (* usra v29.2d, v16.2d, #32 *) + 0x4e083e48; (* mov x8, v18.d[0] *) + 0x4e183e49; (* mov x9, v18.d[1] *) + 0x9b057d46; (* mul x6, x10, x5 *) + 0x9b147e33; (* mul x19, x17, x20 *) + 0x4e083fae; (* mov x14, v29.d[0] *) + 0xab0e0129; (* adds x9, x9, x14 *) + 0x4e183fae; (* mov x14, v29.d[1] *) + 0xba0e00c6; (* adcs x6, x6, x14 *) + 0x9bc57d4e; (* umulh x14, x10, x5 *) + 0xba0e0273; (* adcs x19, x19, x14 *) + 0x9bd47e2e; (* umulh x14, x17, x20 *) + 0x9a1f01ce; (* adc x14, x14, xzr *) + 0xab08012b; (* adds x11, x9, x8 *) + 0xba0900c9; (* adcs x9, x6, x9 *) + 0xba060266; (* adcs x6, x19, x6 *) + 0xba1301d3; (* adcs x19, x14, x19 *) + 0x9a0e03ee; (* adc x14, xzr, x14 *) + 0xab080123; (* adds x3, x9, x8 *) + 0xba0b00d8; (* adcs x24, x6, x11 *) + 0xba090269; (* adcs x9, x19, x9 *) + 0xba0601c6; (* adcs x6, x14, x6 *) + 0xba1303f3; (* adcs x19, xzr, x19 *) + 0x9a0e03ee; (* adc x14, xzr, x14 *) + 0xeb110144; (* subs x4, x10, x17 *) + 0xda842484; (* cneg x4, x4, cc // cc = lo, ul, last *) + 0xda9f23e7; (* csetm x7, cc // cc = lo, ul, last *) + 0xeb050297; (* subs x23, x20, x5 *) + 0xda9726f7; (* cneg x23, x23, cc // cc = lo, ul, last *) + 0x9b177c96; (* mul x22, x4, x23 *) + 0x9bd77c84; (* umulh x4, x4, x23 *) + 0xda8720e7; (* cinv x7, x7, cc // cc = lo, ul, last *) + 0xb10004ff; (* cmn x7, #0x1 *) + 0xca0702d7; (* eor x23, x22, x7 *) + 0xba1700c6; (* adcs x6, x6, x23 *) + 0xca070084; (* eor x4, x4, x7 *) + 0xba040273; (* adcs x19, x19, x4 *) + 0x9a0701ce; (* adc x14, x14, x7 *) + 0xeb1501e4; (* subs x4, x15, x21 *) + 0xda842484; (* cneg x4, x4, cc // cc = lo, ul, last *) + 0xda9f23e7; (* csetm x7, cc // cc = lo, ul, last *) + 0xeb0d0217; (* subs x23, x16, x13 *) + 0xda9726f7; (* cneg x23, x23, cc // cc = lo, ul, last *) + 0x9b177c96; (* mul x22, x4, x23 *) + 0x9bd77c84; (* umulh x4, x4, x23 *) + 0xda8720e7; (* cinv x7, x7, cc // cc = lo, ul, last *) + 0xb10004ff; (* cmn x7, #0x1 *) + 0xca0702d7; (* eor x23, x22, x7 *) + 0xba17016b; (* adcs x11, x11, x23 *) + 0xca070084; (* eor x4, x4, x7 *) + 0xba040063; (* adcs x3, x3, x4 *) + 0xba070318; (* adcs x24, x24, x7 *) + 0xba070129; (* adcs x9, x9, x7 *) + 0xba0700c6; (* adcs x6, x6, x7 *) + 0xba070273; (* adcs x19, x19, x7 *) + 0x9a0701ce; (* adc x14, x14, x7 *) + 0xeb1102a4; (* subs x4, x21, x17 *) + 0xda842484; (* cneg x4, x4, cc // cc = lo, ul, last *) + 0xda9f23e7; (* csetm x7, cc // cc = lo, ul, last *) + 0xeb100297; (* subs x23, x20, x16 *) + 0xda9726f7; (* cneg x23, x23, cc // cc = lo, ul, last *) + 0x9b177c96; (* mul x22, x4, x23 *) + 0x9bd77c84; (* umulh x4, x4, x23 *) + 0xda8720e7; (* cinv x7, x7, cc // cc = lo, ul, last *) + 0xb10004ff; (* cmn x7, #0x1 *) + 0xca0702d7; (* eor x23, x22, x7 *) + 0xba170129; (* adcs x9, x9, x23 *) + 0xca070084; (* eor x4, x4, x7 *) + 0xba0400c6; (* adcs x6, x6, x4 *) + 0xba070273; (* adcs x19, x19, x7 *) + 0x9a0701ce; (* adc x14, x14, x7 *) + 0xeb0a01e4; (* subs x4, x15, x10 *) + 0xda842484; (* cneg x4, x4, cc // cc = lo, ul, last *) + 0xda9f23e7; (* csetm x7, cc // cc = lo, ul, last *) + 0xeb0d00b7; (* subs x23, x5, x13 *) + 0xda9726f7; (* cneg x23, x23, cc // cc = lo, ul, last *) + 0x9b177c96; (* mul x22, x4, x23 *) + 0x9bd77c84; (* umulh x4, x4, x23 *) + 0xda8720e7; (* cinv x7, x7, cc // cc = lo, ul, last *) + 0xb10004ff; (* cmn x7, #0x1 *) + 0xca0702d7; (* eor x23, x22, x7 *) + 0xba170063; (* adcs x3, x3, x23 *) + 0xca070084; (* eor x4, x4, x7 *) + 0xba040318; (* adcs x24, x24, x4 *) + 0xba070129; (* adcs x9, x9, x7 *) + 0xba0700c6; (* adcs x6, x6, x7 *) + 0xba070273; (* adcs x19, x19, x7 *) + 0x9a0701ce; (* adc x14, x14, x7 *) + 0xeb1101f1; (* subs x17, x15, x17 *) + 0xda912631; (* cneg x17, x17, cc // cc = lo, ul, last *) + 0xda9f23e4; (* csetm x4, cc // cc = lo, ul, last *) + 0xeb0d028d; (* subs x13, x20, x13 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0x9b0d7e34; (* mul x20, x17, x13 *) + 0x9bcd7e31; (* umulh x17, x17, x13 *) + 0xda84208d; (* cinv x13, x4, cc // cc = lo, ul, last *) + 0xb10005bf; (* cmn x13, #0x1 *) + 0xca0d0294; (* eor x20, x20, x13 *) + 0xba140314; (* adcs x20, x24, x20 *) + 0xca0d0231; (* eor x17, x17, x13 *) + 0xba110131; (* adcs x17, x9, x17 *) + 0xba0d00c9; (* adcs x9, x6, x13 *) + 0xba0d0266; (* adcs x6, x19, x13 *) + 0x9a0d01cd; (* adc x13, x14, x13 *) + 0xeb0a02b5; (* subs x21, x21, x10 *) + 0xda9526b5; (* cneg x21, x21, cc // cc = lo, ul, last *) + 0xda9f23ea; (* csetm x10, cc // cc = lo, ul, last *) + 0xeb1000b0; (* subs x16, x5, x16 *) + 0xda902610; (* cneg x16, x16, cc // cc = lo, ul, last *) + 0x9b107ea5; (* mul x5, x21, x16 *) + 0x9bd07eb5; (* umulh x21, x21, x16 *) + 0xda8a214a; (* cinv x10, x10, cc // cc = lo, ul, last *) + 0xb100055f; (* cmn x10, #0x1 *) + 0xca0a00b0; (* eor x16, x5, x10 *) + 0xba100290; (* adcs x16, x20, x16 *) + 0xca0a02b5; (* eor x21, x21, x10 *) + 0xba150235; (* adcs x21, x17, x21 *) + 0xba0a0131; (* adcs x17, x9, x10 *) + 0xba0a00c5; (* adcs x5, x6, x10 *) + 0x9a0a01aa; (* adc x10, x13, x10 *) + 0xd377d90d; (* lsl x13, x8, #9 *) + 0x93c8dd74; (* extr x20, x11, x8, #55 *) + 0x93cbdc68; (* extr x8, x3, x11, #55 *) + 0x93c3de09; (* extr x9, x16, x3, #55 *) + 0xd377fe10; (* lsr x16, x16, #55 *) + 0xa90047f5; (* stp x21, x17, [sp] *) + 0xa9012be5; (* stp x5, x10, [sp, #16] *) + 0xa90253ed; (* stp x13, x20, [sp, #32] *) + 0xa90327e8; (* stp x8, x9, [sp, #48] *) + 0xf90023f0; (* str x16, [sp, #64] *) + 0xa9422835; (* ldp x21, x10, [x1, #32] *) + 0xa9433431; (* ldp x17, x13, [x1, #48] *) + 0xa9421450; (* ldp x16, x5, [x2, #32] *) + 0x3dc00832; (* ldr q18, [x1, #32] *) + 0x3dc0085c; (* ldr q28, [x2, #32] *) + 0xa9432054; (* ldp x20, x8, [x2, #48] *) + 0x6f00e5f0; (* movi v16.2d, #0xffffffff *) + 0x4e9c5b87; (* uzp2 v7.4s, v28.4s, v28.4s *) + 0x0ea12a44; (* xtn v4.2s, v18.2d *) + 0x0ea12b81; (* xtn v1.2s, v28.2d *) + 0x4ea00b9c; (* rev64 v28.4s, v28.4s *) + 0x2ea1c09b; (* umull v27.2d, v4.2s, v1.2s *) + 0x2ea7c09d; (* umull v29.2d, v4.2s, v7.2s *) + 0x4e925a55; (* uzp2 v21.4s, v18.4s, v18.4s *) + 0x4eb29f9c; (* mul v28.4s, v28.4s, v18.4s *) + 0x6f60177d; (* usra v29.2d, v27.2d, #32 *) + 0x2ea7c2b2; (* umull v18.2d, v21.2s, v7.2s *) + 0x6ea02b9c; (* uaddlp v28.2d, v28.4s *) + 0x4e301fb0; (* and v16.16b, v29.16b, v16.16b *) + 0x2ea182b0; (* umlal v16.2d, v21.2s, v1.2s *) + 0x4f60579c; (* shl v28.2d, v28.2d, #32 *) + 0x6f6017b2; (* usra v18.2d, v29.2d, #32 *) + 0x2ea1809c; (* umlal v28.2d, v4.2s, v1.2s *) + 0x6f601612; (* usra v18.2d, v16.2d, #32 *) + 0x4e083f89; (* mov x9, v28.d[0] *) + 0x4e183f86; (* mov x6, v28.d[1] *) + 0x9b147e33; (* mul x19, x17, x20 *) + 0x9b087dae; (* mul x14, x13, x8 *) + 0x4e083e4b; (* mov x11, v18.d[0] *) + 0xab0b00c6; (* adds x6, x6, x11 *) + 0x4e183e4b; (* mov x11, v18.d[1] *) + 0xba0b0273; (* adcs x19, x19, x11 *) + 0x9bd47e2b; (* umulh x11, x17, x20 *) + 0xba0b01ce; (* adcs x14, x14, x11 *) + 0x9bc87dab; (* umulh x11, x13, x8 *) + 0x9a1f016b; (* adc x11, x11, xzr *) + 0xab0900c3; (* adds x3, x6, x9 *) + 0xba060266; (* adcs x6, x19, x6 *) + 0xba1301d3; (* adcs x19, x14, x19 *) + 0xba0e016e; (* adcs x14, x11, x14 *) + 0x9a0b03eb; (* adc x11, xzr, x11 *) + 0xab0900d8; (* adds x24, x6, x9 *) + 0xba030264; (* adcs x4, x19, x3 *) + 0xba0601c6; (* adcs x6, x14, x6 *) + 0xba130173; (* adcs x19, x11, x19 *) + 0xba0e03ee; (* adcs x14, xzr, x14 *) + 0x9a0b03eb; (* adc x11, xzr, x11 *) + 0xeb0d0227; (* subs x7, x17, x13 *) + 0xda8724e7; (* cneg x7, x7, cc // cc = lo, ul, last *) + 0xda9f23f7; (* csetm x23, cc // cc = lo, ul, last *) + 0xeb140116; (* subs x22, x8, x20 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167cec; (* mul x12, x7, x22 *) + 0x9bd67ce7; (* umulh x7, x7, x22 *) + 0xda9722f7; (* cinv x23, x23, cc // cc = lo, ul, last *) + 0xb10006ff; (* cmn x23, #0x1 *) + 0xca170196; (* eor x22, x12, x23 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0xca1700e7; (* eor x7, x7, x23 *) + 0xba0701ce; (* adcs x14, x14, x7 *) + 0x9a17016b; (* adc x11, x11, x23 *) + 0xeb0a02a7; (* subs x7, x21, x10 *) + 0xda8724e7; (* cneg x7, x7, cc // cc = lo, ul, last *) + 0xda9f23f7; (* csetm x23, cc // cc = lo, ul, last *) + 0xeb1000b6; (* subs x22, x5, x16 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167cec; (* mul x12, x7, x22 *) + 0x9bd67ce7; (* umulh x7, x7, x22 *) + 0xda9722f7; (* cinv x23, x23, cc // cc = lo, ul, last *) + 0xb10006ff; (* cmn x23, #0x1 *) + 0xca170196; (* eor x22, x12, x23 *) + 0xba160063; (* adcs x3, x3, x22 *) + 0xca1700e7; (* eor x7, x7, x23 *) + 0xba070318; (* adcs x24, x24, x7 *) + 0xba170084; (* adcs x4, x4, x23 *) + 0xba1700c6; (* adcs x6, x6, x23 *) + 0xba170273; (* adcs x19, x19, x23 *) + 0xba1701ce; (* adcs x14, x14, x23 *) + 0x9a17016b; (* adc x11, x11, x23 *) + 0xeb0d0147; (* subs x7, x10, x13 *) + 0xda8724e7; (* cneg x7, x7, cc // cc = lo, ul, last *) + 0xda9f23f7; (* csetm x23, cc // cc = lo, ul, last *) + 0xeb050116; (* subs x22, x8, x5 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167cec; (* mul x12, x7, x22 *) + 0x9bd67ce7; (* umulh x7, x7, x22 *) + 0xda9722f7; (* cinv x23, x23, cc // cc = lo, ul, last *) + 0xb10006ff; (* cmn x23, #0x1 *) + 0xca170196; (* eor x22, x12, x23 *) + 0xba1600c6; (* adcs x6, x6, x22 *) + 0xca1700e7; (* eor x7, x7, x23 *) + 0xba070273; (* adcs x19, x19, x7 *) + 0xba1701ce; (* adcs x14, x14, x23 *) + 0x9a17016b; (* adc x11, x11, x23 *) + 0xeb1102a7; (* subs x7, x21, x17 *) + 0xda8724e7; (* cneg x7, x7, cc // cc = lo, ul, last *) + 0xda9f23f7; (* csetm x23, cc // cc = lo, ul, last *) + 0xeb100296; (* subs x22, x20, x16 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167cec; (* mul x12, x7, x22 *) + 0x9bd67ce7; (* umulh x7, x7, x22 *) + 0xda9722f7; (* cinv x23, x23, cc // cc = lo, ul, last *) + 0xb10006ff; (* cmn x23, #0x1 *) + 0xca170196; (* eor x22, x12, x23 *) + 0xba160318; (* adcs x24, x24, x22 *) + 0xca1700e7; (* eor x7, x7, x23 *) + 0xba070084; (* adcs x4, x4, x7 *) + 0xba1700c6; (* adcs x6, x6, x23 *) + 0xba170273; (* adcs x19, x19, x23 *) + 0xba1701ce; (* adcs x14, x14, x23 *) + 0x9a17016b; (* adc x11, x11, x23 *) + 0xeb0d02a7; (* subs x7, x21, x13 *) + 0xda8724e7; (* cneg x7, x7, cc // cc = lo, ul, last *) + 0xda9f23f7; (* csetm x23, cc // cc = lo, ul, last *) + 0xeb100116; (* subs x22, x8, x16 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167cec; (* mul x12, x7, x22 *) + 0x9bd67ce7; (* umulh x7, x7, x22 *) + 0xda9722f7; (* cinv x23, x23, cc // cc = lo, ul, last *) + 0xb10006ff; (* cmn x23, #0x1 *) + 0xca170196; (* eor x22, x12, x23 *) + 0xba160084; (* adcs x4, x4, x22 *) + 0xca1700e7; (* eor x7, x7, x23 *) + 0xba0700c6; (* adcs x6, x6, x7 *) + 0xba170273; (* adcs x19, x19, x23 *) + 0xba1701ce; (* adcs x14, x14, x23 *) + 0x9a17016b; (* adc x11, x11, x23 *) + 0xeb110147; (* subs x7, x10, x17 *) + 0xda8724e7; (* cneg x7, x7, cc // cc = lo, ul, last *) + 0xda9f23f7; (* csetm x23, cc // cc = lo, ul, last *) + 0xeb050296; (* subs x22, x20, x5 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167cec; (* mul x12, x7, x22 *) + 0x9bd67ce7; (* umulh x7, x7, x22 *) + 0xda9722f7; (* cinv x23, x23, cc // cc = lo, ul, last *) + 0xb10006ff; (* cmn x23, #0x1 *) + 0xca170196; (* eor x22, x12, x23 *) + 0xba160084; (* adcs x4, x4, x22 *) + 0xca1700e7; (* eor x7, x7, x23 *) + 0xba0700c6; (* adcs x6, x6, x7 *) + 0xba170273; (* adcs x19, x19, x23 *) + 0xba1701ce; (* adcs x14, x14, x23 *) + 0x9a17016b; (* adc x11, x11, x23 *) + 0xa9405fe7; (* ldp x7, x23, [sp] *) + 0xab070129; (* adds x9, x9, x7 *) + 0xba170063; (* adcs x3, x3, x23 *) + 0xa9000fe9; (* stp x9, x3, [sp] *) + 0xa9410fe9; (* ldp x9, x3, [sp, #16] *) + 0xba090309; (* adcs x9, x24, x9 *) + 0xba030083; (* adcs x3, x4, x3 *) + 0xa9010fe9; (* stp x9, x3, [sp, #16] *) + 0xa9420fe9; (* ldp x9, x3, [sp, #32] *) + 0xba0900c9; (* adcs x9, x6, x9 *) + 0xba030266; (* adcs x6, x19, x3 *) + 0xa9021be9; (* stp x9, x6, [sp, #32] *) + 0xa9431be9; (* ldp x9, x6, [sp, #48] *) + 0xba0901c9; (* adcs x9, x14, x9 *) + 0xba060166; (* adcs x6, x11, x6 *) + 0xa9031be9; (* stp x9, x6, [sp, #48] *) + 0xf94023e9; (* ldr x9, [sp, #64] *) + 0x9a1f0129; (* adc x9, x9, xzr *) + 0xf90023e9; (* str x9, [sp, #64] *) + 0xa9401829; (* ldp x9, x6, [x1] *) + 0xeb0902b5; (* subs x21, x21, x9 *) + 0xfa06014a; (* sbcs x10, x10, x6 *) + 0xa9411829; (* ldp x9, x6, [x1, #16] *) + 0xfa090231; (* sbcs x17, x17, x9 *) + 0xfa0601ad; (* sbcs x13, x13, x6 *) + 0xda9f23e9; (* csetm x9, cc // cc = lo, ul, last *) + 0xa9404c46; (* ldp x6, x19, [x2] *) + 0xeb1000d0; (* subs x16, x6, x16 *) + 0xfa050265; (* sbcs x5, x19, x5 *) + 0xa9414c46; (* ldp x6, x19, [x2, #16] *) + 0xfa1400d4; (* sbcs x20, x6, x20 *) + 0xfa080268; (* sbcs x8, x19, x8 *) + 0xda9f23e6; (* csetm x6, cc // cc = lo, ul, last *) + 0xca0902b5; (* eor x21, x21, x9 *) + 0xeb0902b5; (* subs x21, x21, x9 *) + 0xca09014a; (* eor x10, x10, x9 *) + 0xfa09014a; (* sbcs x10, x10, x9 *) + 0xca090231; (* eor x17, x17, x9 *) + 0xfa090231; (* sbcs x17, x17, x9 *) + 0xca0901ad; (* eor x13, x13, x9 *) + 0xda0901ad; (* sbc x13, x13, x9 *) + 0xca060210; (* eor x16, x16, x6 *) + 0xeb060210; (* subs x16, x16, x6 *) + 0xca0600a5; (* eor x5, x5, x6 *) + 0xfa0600a5; (* sbcs x5, x5, x6 *) + 0xca060294; (* eor x20, x20, x6 *) + 0xfa060294; (* sbcs x20, x20, x6 *) + 0xca060108; (* eor x8, x8, x6 *) + 0xda060108; (* sbc x8, x8, x6 *) + 0xca0900c9; (* eor x9, x6, x9 *) + 0x9b107ea6; (* mul x6, x21, x16 *) + 0x9b057d53; (* mul x19, x10, x5 *) + 0x9b147e2e; (* mul x14, x17, x20 *) + 0x9b087dab; (* mul x11, x13, x8 *) + 0x9bd07ea3; (* umulh x3, x21, x16 *) + 0xab030273; (* adds x19, x19, x3 *) + 0x9bc57d43; (* umulh x3, x10, x5 *) + 0xba0301ce; (* adcs x14, x14, x3 *) + 0x9bd47e23; (* umulh x3, x17, x20 *) + 0xba03016b; (* adcs x11, x11, x3 *) + 0x9bc87da3; (* umulh x3, x13, x8 *) + 0x9a1f0063; (* adc x3, x3, xzr *) + 0xab060278; (* adds x24, x19, x6 *) + 0xba1301d3; (* adcs x19, x14, x19 *) + 0xba0e016e; (* adcs x14, x11, x14 *) + 0xba0b006b; (* adcs x11, x3, x11 *) + 0x9a0303e3; (* adc x3, xzr, x3 *) + 0xab060264; (* adds x4, x19, x6 *) + 0xba1801c7; (* adcs x7, x14, x24 *) + 0xba130173; (* adcs x19, x11, x19 *) + 0xba0e006e; (* adcs x14, x3, x14 *) + 0xba0b03eb; (* adcs x11, xzr, x11 *) + 0x9a0303e3; (* adc x3, xzr, x3 *) + 0xeb0d0237; (* subs x23, x17, x13 *) + 0xda9726f7; (* cneg x23, x23, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb14010c; (* subs x12, x8, x20 *) + 0xda8c258c; (* cneg x12, x12, cc // cc = lo, ul, last *) + 0x9b0c7eef; (* mul x15, x23, x12 *) + 0x9bcc7ef7; (* umulh x23, x23, x12 *) + 0xda9622d6; (* cinv x22, x22, cc // cc = lo, ul, last *) + 0xb10006df; (* cmn x22, #0x1 *) + 0xca1601ec; (* eor x12, x15, x22 *) + 0xba0c01ce; (* adcs x14, x14, x12 *) + 0xca1602f7; (* eor x23, x23, x22 *) + 0xba17016b; (* adcs x11, x11, x23 *) + 0x9a160063; (* adc x3, x3, x22 *) + 0xeb0a02b7; (* subs x23, x21, x10 *) + 0xda9726f7; (* cneg x23, x23, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb1000ac; (* subs x12, x5, x16 *) + 0xda8c258c; (* cneg x12, x12, cc // cc = lo, ul, last *) + 0x9b0c7eef; (* mul x15, x23, x12 *) + 0x9bcc7ef7; (* umulh x23, x23, x12 *) + 0xda9622d6; (* cinv x22, x22, cc // cc = lo, ul, last *) + 0xb10006df; (* cmn x22, #0x1 *) + 0xca1601ec; (* eor x12, x15, x22 *) + 0xba0c0318; (* adcs x24, x24, x12 *) + 0xca1602f7; (* eor x23, x23, x22 *) + 0xba170084; (* adcs x4, x4, x23 *) + 0xba1600e7; (* adcs x7, x7, x22 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0xba1601ce; (* adcs x14, x14, x22 *) + 0xba16016b; (* adcs x11, x11, x22 *) + 0x9a160063; (* adc x3, x3, x22 *) + 0xeb0d0157; (* subs x23, x10, x13 *) + 0xda9726f7; (* cneg x23, x23, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb05010c; (* subs x12, x8, x5 *) + 0xda8c258c; (* cneg x12, x12, cc // cc = lo, ul, last *) + 0x9b0c7eef; (* mul x15, x23, x12 *) + 0x9bcc7ef7; (* umulh x23, x23, x12 *) + 0xda9622d6; (* cinv x22, x22, cc // cc = lo, ul, last *) + 0xb10006df; (* cmn x22, #0x1 *) + 0xca1601ec; (* eor x12, x15, x22 *) + 0xba0c0273; (* adcs x19, x19, x12 *) + 0xca1602f7; (* eor x23, x23, x22 *) + 0xba1701ce; (* adcs x14, x14, x23 *) + 0xba16016b; (* adcs x11, x11, x22 *) + 0x9a160063; (* adc x3, x3, x22 *) + 0xeb1102b7; (* subs x23, x21, x17 *) + 0xda9726f7; (* cneg x23, x23, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb10028c; (* subs x12, x20, x16 *) + 0xda8c258c; (* cneg x12, x12, cc // cc = lo, ul, last *) + 0x9b0c7eef; (* mul x15, x23, x12 *) + 0x9bcc7ef7; (* umulh x23, x23, x12 *) + 0xda9622d6; (* cinv x22, x22, cc // cc = lo, ul, last *) + 0xb10006df; (* cmn x22, #0x1 *) + 0xca1601ec; (* eor x12, x15, x22 *) + 0xba0c0084; (* adcs x4, x4, x12 *) + 0xca1602f7; (* eor x23, x23, x22 *) + 0xba1700e7; (* adcs x7, x7, x23 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0xba1601ce; (* adcs x14, x14, x22 *) + 0xba16016b; (* adcs x11, x11, x22 *) + 0x9a160063; (* adc x3, x3, x22 *) + 0xeb0d02b5; (* subs x21, x21, x13 *) + 0xda9526b5; (* cneg x21, x21, cc // cc = lo, ul, last *) + 0xda9f23ed; (* csetm x13, cc // cc = lo, ul, last *) + 0xeb100110; (* subs x16, x8, x16 *) + 0xda902610; (* cneg x16, x16, cc // cc = lo, ul, last *) + 0x9b107ea8; (* mul x8, x21, x16 *) + 0x9bd07eb5; (* umulh x21, x21, x16 *) + 0xda8d21ad; (* cinv x13, x13, cc // cc = lo, ul, last *) + 0xb10005bf; (* cmn x13, #0x1 *) + 0xca0d0110; (* eor x16, x8, x13 *) + 0xba1000f0; (* adcs x16, x7, x16 *) + 0xca0d02b5; (* eor x21, x21, x13 *) + 0xba150275; (* adcs x21, x19, x21 *) + 0xba0d01c8; (* adcs x8, x14, x13 *) + 0xba0d0173; (* adcs x19, x11, x13 *) + 0x9a0d006d; (* adc x13, x3, x13 *) + 0xeb11014a; (* subs x10, x10, x17 *) + 0xda8a254a; (* cneg x10, x10, cc // cc = lo, ul, last *) + 0xda9f23f1; (* csetm x17, cc // cc = lo, ul, last *) + 0xeb050285; (* subs x5, x20, x5 *) + 0xda8524a5; (* cneg x5, x5, cc // cc = lo, ul, last *) + 0x9b057d54; (* mul x20, x10, x5 *) + 0x9bc57d4a; (* umulh x10, x10, x5 *) + 0xda912231; (* cinv x17, x17, cc // cc = lo, ul, last *) + 0xb100063f; (* cmn x17, #0x1 *) + 0xca110285; (* eor x5, x20, x17 *) + 0xba050210; (* adcs x16, x16, x5 *) + 0xca11014a; (* eor x10, x10, x17 *) + 0xba0a02b5; (* adcs x21, x21, x10 *) + 0xba11010a; (* adcs x10, x8, x17 *) + 0xba110265; (* adcs x5, x19, x17 *) + 0x9a1101b1; (* adc x17, x13, x17 *) + 0xa94053ed; (* ldp x13, x20, [sp] *) + 0xa9414fe8; (* ldp x8, x19, [sp, #16] *) + 0xca0900c6; (* eor x6, x6, x9 *) + 0xab0d00c6; (* adds x6, x6, x13 *) + 0xca09030e; (* eor x14, x24, x9 *) + 0xba1401ce; (* adcs x14, x14, x20 *) + 0xca09008b; (* eor x11, x4, x9 *) + 0xba08016b; (* adcs x11, x11, x8 *) + 0xca090210; (* eor x16, x16, x9 *) + 0xba130210; (* adcs x16, x16, x19 *) + 0xca0902b5; (* eor x21, x21, x9 *) + 0xa94263e3; (* ldp x3, x24, [sp, #32] *) + 0xa9431fe4; (* ldp x4, x7, [sp, #48] *) + 0xf94023f7; (* ldr x23, [sp, #64] *) + 0xba0302b5; (* adcs x21, x21, x3 *) + 0xca09014a; (* eor x10, x10, x9 *) + 0xba18014a; (* adcs x10, x10, x24 *) + 0xca0900a5; (* eor x5, x5, x9 *) + 0xba0400a5; (* adcs x5, x5, x4 *) + 0xca090231; (* eor x17, x17, x9 *) + 0xba070231; (* adcs x17, x17, x7 *) + 0x9a1f02f6; (* adc x22, x23, xzr *) + 0xab0d02b5; (* adds x21, x21, x13 *) + 0xba14014a; (* adcs x10, x10, x20 *) + 0xba0800ad; (* adcs x13, x5, x8 *) + 0xba130231; (* adcs x17, x17, x19 *) + 0x92402125; (* and x5, x9, #0x1ff *) + 0xd377d8d4; (* lsl x20, x6, #9 *) + 0xaa050285; (* orr x5, x20, x5 *) + 0xba050065; (* adcs x5, x3, x5 *) + 0x93c6ddd4; (* extr x20, x14, x6, #55 *) + 0xba140314; (* adcs x20, x24, x20 *) + 0x93cedd68; (* extr x8, x11, x14, #55 *) + 0xba080088; (* adcs x8, x4, x8 *) + 0x93cbde09; (* extr x9, x16, x11, #55 *) + 0xba0900e9; (* adcs x9, x7, x9 *) + 0xd377fe10; (* lsr x16, x16, #55 *) + 0x9a170210; (* adc x16, x16, x23 *) + 0xf9402046; (* ldr x6, [x2, #64] *) + 0xa9403833; (* ldp x19, x14, [x1] *) + 0x9240ce6b; (* and x11, x19, #0xfffffffffffff *) + 0x9b0b7ccb; (* mul x11, x6, x11 *) + 0xf9402023; (* ldr x3, [x1, #64] *) + 0xa9401058; (* ldp x24, x4, [x2] *) + 0x9240cf07; (* and x7, x24, #0xfffffffffffff *) + 0x9b077c67; (* mul x7, x3, x7 *) + 0x8b07016b; (* add x11, x11, x7 *) + 0x93d3d1d3; (* extr x19, x14, x19, #52 *) + 0x9240ce73; (* and x19, x19, #0xfffffffffffff *) + 0x9b137cd3; (* mul x19, x6, x19 *) + 0x93d8d098; (* extr x24, x4, x24, #52 *) + 0x9240cf18; (* and x24, x24, #0xfffffffffffff *) + 0x9b187c78; (* mul x24, x3, x24 *) + 0x8b180273; (* add x19, x19, x24 *) + 0xd374fd78; (* lsr x24, x11, #52 *) + 0x8b180273; (* add x19, x19, x24 *) + 0xd374cd6b; (* lsl x11, x11, #12 *) + 0x93cb326b; (* extr x11, x19, x11, #12 *) + 0xab0b02b5; (* adds x21, x21, x11 *) + 0xa941602b; (* ldp x11, x24, [x1, #16] *) + 0xa9415c47; (* ldp x7, x23, [x2, #16] *) + 0x93cea16e; (* extr x14, x11, x14, #40 *) + 0x9240cdce; (* and x14, x14, #0xfffffffffffff *) + 0x9b0e7cce; (* mul x14, x6, x14 *) + 0x93c4a0e4; (* extr x4, x7, x4, #40 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047c64; (* mul x4, x3, x4 *) + 0x8b0401ce; (* add x14, x14, x4 *) + 0xd374fe64; (* lsr x4, x19, #52 *) + 0x8b0401ce; (* add x14, x14, x4 *) + 0xd374ce73; (* lsl x19, x19, #12 *) + 0x93d361d3; (* extr x19, x14, x19, #24 *) + 0xba13014a; (* adcs x10, x10, x19 *) + 0x93cb7313; (* extr x19, x24, x11, #28 *) + 0x9240ce73; (* and x19, x19, #0xfffffffffffff *) + 0x9b137cd3; (* mul x19, x6, x19 *) + 0x93c772eb; (* extr x11, x23, x7, #28 *) + 0x9240cd6b; (* and x11, x11, #0xfffffffffffff *) + 0x9b0b7c6b; (* mul x11, x3, x11 *) + 0x8b0b0273; (* add x19, x19, x11 *) + 0xd374fdcb; (* lsr x11, x14, #52 *) + 0x8b0b0273; (* add x19, x19, x11 *) + 0xd374cdce; (* lsl x14, x14, #12 *) + 0x93ce926e; (* extr x14, x19, x14, #36 *) + 0xba0e01ad; (* adcs x13, x13, x14 *) + 0x8a0d014e; (* and x14, x10, x13 *) + 0xa942102b; (* ldp x11, x4, [x1, #32] *) + 0xa9423047; (* ldp x7, x12, [x2, #32] *) + 0x93d84178; (* extr x24, x11, x24, #16 *) + 0x9240cf18; (* and x24, x24, #0xfffffffffffff *) + 0x9b187cd8; (* mul x24, x6, x24 *) + 0x93d740f7; (* extr x23, x7, x23, #16 *) + 0x9240cef7; (* and x23, x23, #0xfffffffffffff *) + 0x9b177c77; (* mul x23, x3, x23 *) + 0x8b170318; (* add x24, x24, x23 *) + 0xd3503ed7; (* lsl x23, x22, #48 *) + 0x8b170318; (* add x24, x24, x23 *) + 0xd374fe77; (* lsr x23, x19, #52 *) + 0x8b170318; (* add x24, x24, x23 *) + 0xd374ce73; (* lsl x19, x19, #12 *) + 0x93d3c313; (* extr x19, x24, x19, #48 *) + 0xba130231; (* adcs x17, x17, x19 *) + 0x8a1101d3; (* and x19, x14, x17 *) + 0xd344fd6e; (* lsr x14, x11, #4 *) + 0x9240cdce; (* and x14, x14, #0xfffffffffffff *) + 0x9b0e7cce; (* mul x14, x6, x14 *) + 0xd344fcf7; (* lsr x23, x7, #4 *) + 0x9240cef7; (* and x23, x23, #0xfffffffffffff *) + 0x9b177c77; (* mul x23, x3, x23 *) + 0x8b1701ce; (* add x14, x14, x23 *) + 0xd374ff17; (* lsr x23, x24, #52 *) + 0x8b1701ce; (* add x14, x14, x23 *) + 0xd374cf18; (* lsl x24, x24, #12 *) + 0x93d8f1d8; (* extr x24, x14, x24, #60 *) + 0x93cbe08b; (* extr x11, x4, x11, #56 *) + 0x9240cd6b; (* and x11, x11, #0xfffffffffffff *) + 0x9b0b7ccb; (* mul x11, x6, x11 *) + 0x93c7e187; (* extr x7, x12, x7, #56 *) + 0x9240cce7; (* and x7, x7, #0xfffffffffffff *) + 0x9b077c67; (* mul x7, x3, x7 *) + 0x8b07016b; (* add x11, x11, x7 *) + 0xd374fdce; (* lsr x14, x14, #52 *) + 0x8b0e016e; (* add x14, x11, x14 *) + 0xd378df0b; (* lsl x11, x24, #8 *) + 0x93cb21cb; (* extr x11, x14, x11, #8 *) + 0xba0b00a5; (* adcs x5, x5, x11 *) + 0x8a050273; (* and x19, x19, x5 *) + 0xa943602b; (* ldp x11, x24, [x1, #48] *) + 0xa9431c42; (* ldp x2, x7, [x2, #48] *) + 0x93c4b164; (* extr x4, x11, x4, #44 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047cc4; (* mul x4, x6, x4 *) + 0x93ccb057; (* extr x23, x2, x12, #44 *) + 0x9240cef7; (* and x23, x23, #0xfffffffffffff *) + 0x9b177c77; (* mul x23, x3, x23 *) + 0x8b170084; (* add x4, x4, x23 *) + 0xd374fdd7; (* lsr x23, x14, #52 *) + 0x8b170084; (* add x4, x4, x23 *) + 0xd374cdce; (* lsl x14, x14, #12 *) + 0x93ce508e; (* extr x14, x4, x14, #20 *) + 0xba0e0294; (* adcs x20, x20, x14 *) + 0x8a140273; (* and x19, x19, x20 *) + 0x93cb830e; (* extr x14, x24, x11, #32 *) + 0x9240cdce; (* and x14, x14, #0xfffffffffffff *) + 0x9b0e7cce; (* mul x14, x6, x14 *) + 0x93c280e2; (* extr x2, x7, x2, #32 *) + 0x9240cc42; (* and x2, x2, #0xfffffffffffff *) + 0x9b027c62; (* mul x2, x3, x2 *) + 0x8b0201c2; (* add x2, x14, x2 *) + 0xd374fc8e; (* lsr x14, x4, #52 *) + 0x8b0e0042; (* add x2, x2, x14 *) + 0xd374cc8e; (* lsl x14, x4, #12 *) + 0x93ce804e; (* extr x14, x2, x14, #32 *) + 0xba0e0108; (* adcs x8, x8, x14 *) + 0x8a080273; (* and x19, x19, x8 *) + 0xd354ff0e; (* lsr x14, x24, #20 *) + 0x9b0e7cce; (* mul x14, x6, x14 *) + 0xd354fceb; (* lsr x11, x7, #20 *) + 0x9b0b7c6b; (* mul x11, x3, x11 *) + 0x8b0b01ce; (* add x14, x14, x11 *) + 0xd374fc4b; (* lsr x11, x2, #52 *) + 0x8b0b01ce; (* add x14, x14, x11 *) + 0xd374cc42; (* lsl x2, x2, #12 *) + 0x93c2b1c2; (* extr x2, x14, x2, #44 *) + 0xba020129; (* adcs x9, x9, x2 *) + 0x8a090262; (* and x2, x19, x9 *) + 0x9b037cc6; (* mul x6, x6, x3 *) + 0xd36cfdd3; (* lsr x19, x14, #44 *) + 0x8b1300c6; (* add x6, x6, x19 *) + 0x9a060210; (* adc x16, x16, x6 *) + 0xd349fe06; (* lsr x6, x16, #9 *) + 0xb277da10; (* orr x16, x16, #0xfffffffffffffe00 *) + 0xeb1f03ff; (* cmp xzr, xzr *) + 0xba0602bf; (* adcs xzr, x21, x6 *) + 0xba1f005f; (* adcs xzr, x2, xzr *) + 0xba1f021f; (* adcs xzr, x16, xzr *) + 0xba0602b5; (* adcs x21, x21, x6 *) + 0xba1f014a; (* adcs x10, x10, xzr *) + 0xba1f01ad; (* adcs x13, x13, xzr *) + 0xba1f0231; (* adcs x17, x17, xzr *) + 0xba1f00a5; (* adcs x5, x5, xzr *) + 0xba1f0294; (* adcs x20, x20, xzr *) + 0xba1f0108; (* adcs x8, x8, xzr *) + 0xba1f0129; (* adcs x9, x9, xzr *) + 0x9a1f0210; (* adc x16, x16, xzr *) + 0x924022a2; (* and x2, x21, #0x1ff *) + 0x93d52555; (* extr x21, x10, x21, #9 *) + 0x93ca25aa; (* extr x10, x13, x10, #9 *) + 0xa9002815; (* stp x21, x10, [x0] *) + 0x93cd2635; (* extr x21, x17, x13, #9 *) + 0x93d124aa; (* extr x10, x5, x17, #9 *) + 0xa9012815; (* stp x21, x10, [x0, #16] *) + 0x93c52695; (* extr x21, x20, x5, #9 *) + 0x93d4250a; (* extr x10, x8, x20, #9 *) + 0xa9022815; (* stp x21, x10, [x0, #32] *) + 0x93c82535; (* extr x21, x9, x8, #9 *) + 0x93c9260a; (* extr x10, x16, x9, #9 *) + 0xa9032815; (* stp x21, x10, [x0, #48] *) + 0xf9002002; (* str x2, [x0, #64] *) +];; + +let bignum_mul_p521_interm1_core_mc = + let charlist = List.concat_map + (fun op32 -> + [Char.chr (Int.logand op32 255); + Char.chr (Int.logand (Int.shift_right op32 8) 255); + Char.chr (Int.logand (Int.shift_right op32 16) 255); + Char.chr (Int.logand (Int.shift_right op32 24) 255)]) + bignum_mul_p521_interm1_ops in + let byte_list = Bytes.init (List.length charlist) (fun i -> List.nth charlist i) in + define_word_list "bignum_mul_p521_interm1_core_mc" (term_of_bytes byte_list);; + +let BIGNUM_MUL_P521_INTERM1_CORE_EXEC = + ARM_MK_EXEC_RULE bignum_mul_p521_interm1_core_mc;; + +let equiv_input_states = new_definition + `!s1 s1' x y z stackpointer. + (equiv_input_states:(armstate#armstate)->int64->int64->int64->int64->bool) + (s1,s1') x y z stackpointer <=> + (C_ARGUMENTS [z; x; y] s1 /\ + C_ARGUMENTS [z; x; y] s1' /\ + read SP s1 = stackpointer /\ + read SP s1' = stackpointer /\ + ?a. bignum_from_memory (x,9) s1 = a /\ + bignum_from_memory (x,9) s1' = a /\ + (?b. bignum_from_memory (y,9) s1 = b /\ + bignum_from_memory (y,9) s1' = b))`;; + +let equiv_output_states = new_definition + `!s1 s1' z stackpointer. + (equiv_output_states:(armstate#armstate)->int64->int64->bool) + (s1,s1') z stackpointer <=> + (?a. + read SP s1 = stackpointer /\ + read SP s1' = stackpointer /\ + bignum_from_memory (z,9) s1 = a /\ + bignum_from_memory (z,9) s1' = a)`;; + +let actions = [ + ("equal", 0, 3, 0, 3); + ("insert", 3, 3, 3, 5); + ("equal", 3, 4, 5, 6); + ("replace", 4, 6, 6, 26); + ("equal", 6, 8, 26, 28); + ("replace", 8, 9, 28, 29); + ("equal", 9, 10, 29, 30); + ("replace", 10, 11, 30, 31); + ("equal", 11, 136, 31, 156); + ("insert", 136, 136, 156, 158); + ("equal", 136, 137, 158, 159); + ("replace", 137, 139, 159, 179); + ("equal", 139, 141, 179, 181); + ("replace", 141, 142, 181, 182); + ("equal", 142, 143, 182, 183); + ("replace", 143, 144, 183, 184); + ("equal", 144, 624, 184, 664); +];; + +let equiv_goal1 = mk_equiv_statement + `aligned 16 stackpointer /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_mul_p521_core_mc); + (word pc2,LENGTH bignum_mul_p521_interm1_core_mc)] /\ + ALL (nonoverlapping (stackpointer, 80)) + [(word pc,LENGTH bignum_mul_p521_core_mc); + (word pc2,LENGTH bignum_mul_p521_interm1_core_mc); + (z,8 * 9); (x:int64,8 * 9); (y:int64,8 * 9)]` + equiv_input_states + equiv_output_states + bignum_mul_p521_core_mc 0 + `MAYCHANGE [PC; X3; X4; X5; X6; X7; X8; X9; + X10; X11; X12; X13; X14; X15; X16; X17; X19; + X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]` + bignum_mul_p521_interm1_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]`;; + +let _org_extra_word_CONV = !extra_word_CONV;; +extra_word_CONV := + [GEN_REWRITE_CONV I [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_LO; WORD_MUL64_HI; + WORD_SQR64_HI; WORD_SQR128_DIGIT0]] + @ (!extra_word_CONV);; + +let BIGNUM_MUL_P521_CORE_EQUIV1 = time prove(equiv_goal1, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; + ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MUL_P521_CORE_EXEC; + fst BIGNUM_MUL_P521_INTERM1_CORE_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + REPEAT (FIRST_X_ASSUM BIGNUM_EXPAND_AND_DIGITIZE_TAC) THEN + ASM_PROPAGATE_DIGIT_EQS_FROM_EXPANDED_BIGNUM_TAC THEN + (* necessary to run ldr qs *) + COMBINE_READ_BYTES64_PAIRS_TAC THEN + + (* Start *) + EQUIV_STEPS_TAC actions + BIGNUM_MUL_P521_CORE_EXEC + BIGNUM_MUL_P521_INTERM1_CORE_EXEC THEN + + REPEAT_N 2 ENSURES_FINAL_STATE'_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; + BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,9) state`; + C_ARGUMENTS] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange left **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC; + + (** SUBGOAL 3. Maychange right **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0:armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC + ]);; + +extra_word_CONV := _org_extra_word_CONV;; + + + +(****************************************************************************** + The second program equivalence between the core part of intermediate + program and fully optimized program +******************************************************************************) + +let bignum_mul_p521_neon_mc = + define_from_elf "bignum_mul_p521_neon_mc" + "arm/p521/bignum_mul_p521_neon.o";; + +let BIGNUM_MUL_P521_NEON_EXEC = + ARM_MK_EXEC_RULE bignum_mul_p521_neon_mc;; + +let bignum_mul_p521_neon_core_mc_def, + bignum_mul_p521_neon_core_mc, + BIGNUM_MUL_P521_NEON_CORE_EXEC = + mk_sublist_of_mc "bignum_mul_p521_neon_core_mc" + bignum_mul_p521_neon_mc + (`20`,`LENGTH bignum_mul_p521_neon_mc - 44`) + (fst BIGNUM_MUL_P521_NEON_EXEC);; + + +let equiv_goal2 = mk_equiv_statement + `aligned 16 stackpointer /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_mul_p521_interm1_core_mc); + (word pc2,LENGTH bignum_mul_p521_neon_core_mc)] /\ + ALL (nonoverlapping (stackpointer, 80)) + [(word pc,LENGTH bignum_mul_p521_interm1_core_mc); + (word pc2,LENGTH bignum_mul_p521_neon_core_mc); + (z,8 * 9); (x:int64,8 * 9); (y:int64,8 * 9)]` + equiv_input_states + equiv_output_states + bignum_mul_p521_interm1_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]` + bignum_mul_p521_neon_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]`;; + + +(* Line numbers from the fully optimized prog. to the intermediate prog. + The script that prints this map is being privately maintained by aqjune-aws. *) + +let inst_map = [5;2;4;158;6;157;7;8;11;1;10;9;48;15;50;164;163;3;27;161;18;49;62;162;168;21;28;12;14;13;167;63;64;35;16;171;65;23;66;174;165;166;69;67;176;17;68;169;51;19;55;26;52;22;20;53;170;160;73;172;24;33;175;173;29;54;31;177;30;25;32;34;36;37;38;39;57;40;71;41;59;42;43;44;45;46;47;56;58;60;61;80;81;82;83;84;87;70;72;85;74;75;86;76;77;78;145;79;112;114;113;115;144;89;116;118;119;95;97;151;96;117;98;99;102;100;128;130;129;131;101;135;132;88;90;91;92;121;93;133;104;94;103;105;159;106;107;134;155;108;109;110;123;180;111;120;122;124;188;125;126;137;127;136;138;139;179;140;147;186;184;148;141;182;142;153;146;143;201;152;150;203;202;181;204;149;154;208;205;178;207;183;156;185;187;189;190;191;206;192;193;212;194;215;217;216;218;219;222;195;210;196;220;197;198;221;199;200;209;211;224;213;214;233;234;235;226;236;240;237;223;225;238;227;228;239;229;230;242;231;232;244;248;249;250;251;255;252;241;243;253;245;246;247;254;265;257;266;267;268;269;272;256;259;258;270;260;261;262;271;263;264;274;276;273;275;323;326;277;278;279;280;324;316;325;319;327;328;329;317;338;318;340;344;320;321;322;281;282;283;284;332;285;342;330;288;286;339;341;334;343;336;345;331;287;333;335;292;290;355;337;289;291;293;297;294;351;295;296;298;346;299;300;301;353;305;302;350;303;309;306;304;348;307;308;313;310;311;349;314;312;352;315;357;354;356;347;358;370;371;372;373;377;374;359;360;375;361;362;376;363;364;379;365;366;367;368;381;369;384;385;386;387;391;388;378;380;389;382;383;402;390;403;404;393;395;392;394;396;397;398;399;400;401;417;418;419;405;406;409;434;435;436;407;437;441;408;438;420;421;424;411;410;413;412;422;414;415;470;416;423;450;451;426;452;453;454;457;425;439;428;427;492;429;440;472;468;430;431;432;433;455;442;505;443;456;444;445;446;459;447;513;506;448;514;449;458;460;461;466;462;474;476;467;463;481;477;464;465;504;469;493;483;494;485;471;507;478;473;475;480;479;496;482;484;498;486;487;488;500;502;489;515;490;509;508;491;495;497;499;510;525;501;516;503;511;517;527;528;539;526;518;540;512;561;520;522;529;530;519;541;521;542;552;531;523;543;524;532;554;555;544;553;533;580;556;569;557;545;558;581;534;559;570;535;536;546;537;538;548;547;582;560;549;550;565;562;563;594;564;571;583;566;584;611;593;612;576;585;598;572;573;599;608;595;574;609;578;596;623;597;567;586;575;621;577;579;600;587;588;589;590;604;613;591;610;551;568;601;602;624;603;605;617;606;592;614;607;622;625;632;615;616;626;618;628;627;619;629;620;633;630;634;635;636;638;631;637;639;640;641;642;651;643;652;664;644;653;645;655;654;646;656;647;658;657;648;659;649;660;661;650;662;663];; + +(* (state number, (equation, fresh var)) *) +let state_to_abbrevs: (int * thm) list ref = ref [];; + +let BIGNUM_MUL_P521_CORE_EQUIV2 = time prove( + equiv_goal2, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; + ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MUL_P521_INTERM1_CORE_EXEC; + fst BIGNUM_MUL_P521_NEON_CORE_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + REPEAT (FIRST_X_ASSUM BIGNUM_EXPAND_AND_DIGITIZE_TAC) THEN + ASM_PROPAGATE_DIGIT_EQS_FROM_EXPANDED_BIGNUM_TAC THEN + (* necessary to run ldr qs *) + COMBINE_READ_BYTES64_PAIRS_TAC THEN + + (* Left *) + ARM_STEPS'_AND_ABBREV_TAC BIGNUM_MUL_P521_INTERM1_CORE_EXEC + (1--(List.length inst_map)) state_to_abbrevs THEN + + (* Right *) + ARM_STEPS'_AND_REWRITE_TAC BIGNUM_MUL_P521_NEON_CORE_EXEC + (1--(List.length inst_map)) inst_map state_to_abbrevs THEN + + REPEAT_N 2 ENSURES_FINAL_STATE'_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; + BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,9) state`; + C_ARGUMENTS] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange left **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC; + + (** SUBGOAL 3. Maychange right **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0:armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC + ]);; + + +(****************************************************************************** + Use transitivity of two program equivalences to prove end-to-end + correctness +******************************************************************************) + +let equiv_goal = mk_equiv_statement + `aligned 16 stackpointer /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_mul_p521_core_mc); + (word pc2,LENGTH bignum_mul_p521_neon_core_mc)] /\ + ALL (nonoverlapping (stackpointer, 80)) + [(word pc,LENGTH bignum_mul_p521_core_mc); + (word pc2,LENGTH bignum_mul_p521_neon_core_mc); + (z,8 * 9); (x:int64,8 * 9); (y:int64,8 * 9)]` + equiv_input_states + equiv_output_states + bignum_mul_p521_core_mc 0 + `MAYCHANGE [PC; X3; X4; X5; X6; X7; X8; X9; + X10; X11; X12; X13; X14; X15; X16; X17; X19; + X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]` + bignum_mul_p521_neon_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]`;; + +let equiv_output_states_TRANS = prove( + `!s s2 s' z stackpointer. + equiv_output_states (s,s') z stackpointer/\ + equiv_output_states (s',s2) z stackpointer + ==> equiv_output_states (s,s2) z stackpointer`, + MESON_TAC[equiv_output_states]);; + +let BIGNUM_MUL_P521_CORE_EQUIV = time prove(equiv_goal, + REPEAT STRIP_TAC THEN + (* To prove the goal, show that there exists an empty slot in the memory + which can locate bignum_mul_p521_interm1_core_mc. *) + SUBGOAL_THEN + `?pc3. + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc:int64,LENGTH bignum_mul_p521_core_mc); + (word pc3:int64,LENGTH bignum_mul_p521_interm1_core_mc)] /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc3:int64,LENGTH bignum_mul_p521_interm1_core_mc); + (word pc2:int64,LENGTH bignum_mul_p521_neon_core_mc)] /\ + // Input buffers and the intermediate program don't alias + ALL (nonoverlapping + (word pc3:int64, LENGTH bignum_mul_p521_interm1_core_mc)) + [x,8 * 9; y,8 * 9; stackpointer,80] /\ + 4 divides val (word pc3:int64)` + MP_TAC THENL [ + REPEAT (FIRST_X_ASSUM MP_TAC) THEN + ASM_REWRITE_TAC + [ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MUL_P521_INTERM1_CORE_EXEC; + fst BIGNUM_MUL_P521_NEON_CORE_EXEC; + fst BIGNUM_MUL_P521_CORE_EXEC;GSYM CONJ_ASSOC] THEN + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[] THEN POP_ASSUM_LIST (K ALL_TAC) THEN + FIND_HOLE_TAC; + + ALL_TAC + ] THEN + STRIP_TAC THEN + + (* instantiate first equiv *) + ENSURES2_TRANS_TAC BIGNUM_MUL_P521_CORE_EQUIV1 BIGNUM_MUL_P521_CORE_EQUIV2 THEN + + (* break 'ALL nonoverlapping' in assumptions *) + RULE_ASSUM_TAC (REWRITE_RULE[ + ALLPAIRS;ALL; + fst BIGNUM_MUL_P521_CORE_EXEC; + fst BIGNUM_MUL_P521_NEON_CORE_EXEC; + fst BIGNUM_MUL_P521_INTERM1_CORE_EXEC; + NONOVERLAPPING_CLAUSES]) THEN + REPEAT SPLIT_FIRST_CONJ_ASSUM_TAC THEN + + MATCH_MP_TAC ENSURES2_WEAKEN THEN + REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[TAUT `(p /\ q /\ r) /\ p /\ q /\ r' <=> p /\ q /\ r /\ r'`] THEN + EXISTS_TAC + `write (memory :> bytelist + (word pc3,LENGTH bignum_mul_p521_interm1_core_mc)) + bignum_mul_p521_interm1_core_mc + (write PC (word pc3) s')` THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MUL_P521_INTERM1_CORE_EXEC THENL [ + UNDISCH_TAC `equiv_input_states (s,s') x y z stackpointer` THEN + REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MUL_P521_INTERM1_CORE_EXEC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MUL_P521_INTERM1_CORE_EXEC); + + UNDISCH_TAC `equiv_input_states (s,s') x y z stackpointer` THEN + REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MUL_P521_INTERM1_CORE_EXEC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MUL_P521_INTERM1_CORE_EXEC); + ]; + + REPEAT GEN_TAC THEN STRIP_TAC THEN + ASM_REWRITE_TAC[] THEN ASM_MESON_TAC[equiv_output_states_TRANS]; + + SUBSUMED_MAYCHANGE_TAC + ]);; + + + +(****************************************************************************** + Inducing BIGNUM_MUL_P521_NEON_SUBROUTINE_CORRECT + from BIGNUM_MUL_P521_CORE_CORRECT +******************************************************************************) + +(* Prove BIGNUM_MUL_P521_CORE_CORRECT_N first *) + +let event_n_at_pc_goal = mk_eventually_n_at_pc_statement + `ALL (nonoverlapping + (word pc:int64, LENGTH + (APPEND bignum_mul_p521_core_mc barrier_inst_bytes))) + [(z:int64,8 * 9); (stackpointer:int64,80)] /\ + aligned 16 stackpointer` + [`z:int64`;`x:int64`;`y:int64`] (*pc_mc_ofs*)0 + bignum_mul_p521_core_mc (*pc_ofs*)0 + `\s0. C_ARGUMENTS [z;x;y] s0 /\ read SP s0 = stackpointer`;; + +let BIGNUM_MUL_P521_EVENTUALLY_N_AT_PC = time prove(event_n_at_pc_goal, + + REWRITE_TAC[LENGTH_APPEND;fst BIGNUM_MUL_P521_CORE_EXEC; + BARRIER_INST_BYTES_LENGTH] THEN + REWRITE_TAC[eventually_n_at_pc;ALL;NONOVERLAPPING_CLAUSES;C_ARGUMENTS] THEN + SUBGOAL_THEN `4 divides (LENGTH bignum_mul_p521_core_mc)` + (fun th -> REWRITE_TAC[MATCH_MP aligned_bytes_loaded_append th; + fst BIGNUM_MUL_P521_CORE_EXEC]) THENL [ + REWRITE_TAC[fst BIGNUM_MUL_P521_CORE_EXEC] + THEN CONV_TAC NUM_DIVIDES_CONV + THEN NO_TAC; + ALL_TAC] THEN + REPEAT GEN_TAC THEN STRIP_TAC THEN + (* now start..! *) + X_GEN_TAC `s0:armstate` THEN GEN_TAC THEN STRIP_TAC THEN + (* eventually ==> eventually_n *) + PROVE_EVENTUALLY_IMPLIES_EVENTUALLY_N_TAC BIGNUM_MUL_P521_CORE_EXEC);; + + +let BIGNUM_MUL_P521_CORE_CORRECT_N = + prove_correct_n + BIGNUM_MUL_P521_EXEC + BIGNUM_MUL_P521_CORE_EXEC + BIGNUM_MUL_P521_CORE_CORRECT + BIGNUM_MUL_P521_EVENTUALLY_N_AT_PC;; + + +(* This theorem is a copy of BIGNUM_MUL_P521_CORE_CORRECT with + - 'pc' replaced with 'pc2' + - LENGTH of bignum_mul_p521_core_mc replaced with + bignum_mul_p521_neon_core_m + - The MAYCHANGE set replaced with the Neon version's one *) +let BIGNUM_MUL_P521_NEON_CORE_CORRECT = prove + (`!z x y a b pc2 stackpointer. + aligned 16 stackpointer /\ + ALL (nonoverlapping (stackpointer,80)) + [(word pc2,LENGTH bignum_mul_p521_neon_core_mc); (z,8 * 9); + (x,8 * 9); (y,8 * 9)] /\ + nonoverlapping (z,8 * 9) (word pc2,LENGTH bignum_mul_p521_neon_core_mc) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc2) bignum_mul_p521_neon_core_mc /\ + read PC s = word(pc2) /\ + read SP s = stackpointer /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,9) s = a /\ + bignum_from_memory (y,9) s = b) + (\s. read PC s = word (pc2 + LENGTH bignum_mul_p521_neon_core_mc) /\ + (a < p_521 /\ b < p_521 + ==> bignum_from_memory (z,9) s = (a * b) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)])`, + REPEAT GEN_TAC THEN + (* Prepare pc for the original program. *) + SUBGOAL_THEN + `?pc. + ALL (nonoverlapping (word pc, + LENGTH (APPEND bignum_mul_p521_core_mc barrier_inst_bytes))) + [(stackpointer:int64,80);(z:int64,8*9);(x:int64,8 * 9);(y:int64,8 * 9)] /\ + 4 divides val (word pc:int64)` MP_TAC THENL [ + REWRITE_TAC[fst BIGNUM_MUL_P521_CORE_EXEC; + NONOVERLAPPING_CLAUSES;ALL; + LENGTH_APPEND;BARRIER_INST_BYTES_LENGTH] THEN + time FIND_HOLE_TAC; + + (** SUBGOAL 2 **) + ALL_TAC + ] THEN + + REPEAT_N 2 STRIP_TAC THEN + + VCGEN_EQUIV_TAC BIGNUM_MUL_P521_CORE_EQUIV BIGNUM_MUL_P521_CORE_CORRECT_N + [fst BIGNUM_MUL_P521_CORE_EXEC;fst BIGNUM_MUL_P521_NEON_CORE_EXEC] THEN + + (* unfold definitions that may block tactics *) + RULE_ASSUM_TAC (REWRITE_RULE[ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MUL_P521_EXEC; + fst BIGNUM_MUL_P521_NEON_EXEC]) THEN + REPEAT SPLIT_FIRST_CONJ_ASSUM_TAC THEN + REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Precond **) + X_GEN_TAC `s2:armstate` THEN REPEAT STRIP_TAC THEN + SUBGOAL_THEN `4 divides val (word pc2:int64)` ASSUME_TAC THENL + [ FIRST_ASSUM (fun th -> + MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] + THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN + EXISTS_TAC + `write (memory :> bytelist + (word pc,LENGTH (APPEND bignum_mul_p521_core_mc barrier_inst_bytes))) + (APPEND bignum_mul_p521_core_mc barrier_inst_bytes) + (write PC (word pc) s2)` THEN + (* Expand variables appearing in the equiv relation *) + REPEAT CONJ_TAC THEN + TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MUL_P521_CORE_EXEC) THEN + (* Now has only one subgoal: the '?a. ...' part of input equivalence! *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MUL_P521_CORE_EXEC); + + (** SUBGOAL 2. Postcond **) + MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MUL_P521_NEON_CORE_EXEC]; + + (** SUBGOAL 3. Frame **) + MESON_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] + ]);; + + +let BIGNUM_MUL_P521_NEON_CORRECT = prove + (`!z x y a b pc stackpointer. + aligned 16 stackpointer /\ + ALL (nonoverlapping (stackpointer,80)) + [(word pc,LENGTH bignum_mul_p521_neon_mc); (z,8 * 9); + (x,8 * 9); (y,8 * 9)] /\ + nonoverlapping (z,8 * 9) (word pc,LENGTH bignum_mul_p521_neon_mc) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_mul_p521_neon_mc /\ + read PC s = word(pc+20) /\ + read SP s = stackpointer /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,9) s = a /\ + bignum_from_memory (y,9) s = b) + (\s. read PC s = word (pc + (20 + LENGTH bignum_mul_p521_neon_core_mc)) /\ + (a < p_521 /\ b < p_521 + ==> bignum_from_memory (z,9) s = (a * b) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)])`, + + ARM_SUB_LIST_OF_MC_TAC BIGNUM_MUL_P521_NEON_CORE_CORRECT + bignum_mul_p521_neon_core_mc_def + [fst BIGNUM_MUL_P521_NEON_EXEC; + fst BIGNUM_MUL_P521_NEON_CORE_EXEC]);; + + +let BIGNUM_MUL_P521_NEON_SUBROUTINE_CORRECT = prove + (`!z x y a b pc stackpointer returnaddress. + aligned 16 stackpointer /\ + nonoverlapping (z,8 * 9) (word pc,LENGTH bignum_mul_p521_neon_mc) /\ + ALL (nonoverlapping (word_sub stackpointer (word 144),144)) + [(word pc,LENGTH bignum_mul_p521_neon_mc); (x,8 * 9); (y,8 * 9); + (z,8 * 9)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_mul_p521_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,9) s = a /\ + bignum_from_memory (y,9) s = b) + (\s. read PC s = returnaddress /\ + (a < p_521 /\ b < p_521 + ==> bignum_from_memory (z,9) s = (a * b) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(z,8 * 9); + memory :> bytes(word_sub stackpointer (word 144),144)])`, + let th = CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV) + (REWRITE_RULE [fst BIGNUM_MUL_P521_NEON_CORE_EXEC; + fst BIGNUM_MUL_P521_NEON_EXEC] + BIGNUM_MUL_P521_NEON_CORRECT) in + REWRITE_TAC[fst BIGNUM_MUL_P521_NEON_EXEC] THEN + ARM_ADD_RETURN_STACK_TAC + BIGNUM_MUL_P521_NEON_EXEC th + `[X19;X20;X21;X22;X23;X24;X25;X26]` 144);; diff --git a/arm/proofs/bignum_sqr_p521.ml b/arm/proofs/bignum_sqr_p521.ml index 5f873fdd..740228df 100644 --- a/arm/proofs/bignum_sqr_p521.ml +++ b/arm/proofs/bignum_sqr_p521.ml @@ -437,6 +437,15 @@ let bignum_sqr_p521_mc = define_assert_from_elf "bignum_sqr_p521_mc" "arm/p521/b let BIGNUM_SQR_P521_EXEC = ARM_MK_EXEC_RULE bignum_sqr_p521_mc;; +(* bignum_sqr_p521_mc without callee-save register spills + ret. *) +let bignum_sqr_p521_core_mc_def, + bignum_sqr_p521_core_mc, + BIGNUM_SQR_P521_CORE_EXEC = + mk_sublist_of_mc "bignum_sqr_p521_core_mc" + bignum_sqr_p521_mc + (`12`,`LENGTH bignum_sqr_p521_mc - 28`) + (fst BIGNUM_SQR_P521_EXEC);; + (* ------------------------------------------------------------------------- *) (* Proof. *) (* ------------------------------------------------------------------------- *) @@ -469,15 +478,15 @@ let lemma2 = prove ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN REAL_ARITH_TAC);; -let BIGNUM_SQR_P521_CORRECT = time prove +let BIGNUM_SQR_P521_CORE_CORRECT = time prove (`!z x n pc. - nonoverlapping (word pc,0x68c) (z,8 * 9) + nonoverlapping (word pc,LENGTH bignum_sqr_p521_core_mc) (z,8 * 9) ==> ensures arm - (\s. aligned_bytes_loaded s (word pc) bignum_sqr_p521_mc /\ - read PC s = word(pc + 0xc) /\ + (\s. aligned_bytes_loaded s (word pc) bignum_sqr_p521_core_mc /\ + read PC s = word(pc) /\ C_ARGUMENTS [z; x] s /\ bignum_from_memory (x,9) s = n) - (\s. read PC s = word (pc + 0x67c) /\ + (\s. read PC s = word (pc + LENGTH bignum_sqr_p521_core_mc) /\ (n < p_521 ==> bignum_from_memory (z,9) s = (n EXP 2) MOD p_521)) (MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; X13; @@ -486,13 +495,14 @@ let BIGNUM_SQR_P521_CORRECT = time prove MAYCHANGE [memory :> bignum(z,9)])`, MAP_EVERY X_GEN_TAC [`z:int64`; `x:int64`; `n:num`; `pc:num`] THEN - REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; NONOVERLAPPING_CLAUSES] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; NONOVERLAPPING_CLAUSES; + fst BIGNUM_SQR_P521_CORE_EXEC] THEN DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN (*** Globalize the n < p_521 assumption for simplicity's sake ***) ASM_CASES_TAC `n < p_521` THENL - [ASM_REWRITE_TAC[]; ARM_SIM_TAC BIGNUM_SQR_P521_EXEC (1--412)] THEN + [ASM_REWRITE_TAC[]; ARM_SIM_TAC BIGNUM_SQR_P521_CORE_EXEC (1--412)] THEN (*** Digitize, deduce the bound on the top word specifically ***) @@ -506,7 +516,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** The 4x4 squaring of the top "half" ***) - ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_CORE_EXEC [5; 6; 13; 18; 19; 21; 22; 23; 24; 25; 27; 28; 29; 30; 31; 32; 33; 34; 35; 36; 37; 41; 42; 43; 44; 45; 46; 47; 48; 49; 50; 51; 52; 53; 54; 58; 59; 60; 61; 62; 63; 64; 65; 66; 67] @@ -539,7 +549,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** The complicated augmentation with the little word contribution ***) - ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_CORE_EXEC [70; 80; 88; 96; 104; 119; 127; 135; 142; 144] (68--144) THEN SUBGOAL_THEN @@ -708,7 +718,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** Rotation of the high portion ***) - ARM_STEPS_TAC BIGNUM_SQR_P521_EXEC (145--160) THEN + ARM_STEPS_TAC BIGNUM_SQR_P521_CORE_EXEC (145--160) THEN ABBREV_TAC `htop:int64 = word_add (word_and sum_s80 (word 511)) (word_ushr sum_s144 9)` THEN @@ -777,7 +787,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** Squaring of the lower "half" ***) - ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_CORE_EXEC [161; 162; 169; 174; 175; 177; 178; 179; 180; 181; 183; 184; 185; 186; 187; 188; 189; 190; 191; 192; 193; 197; 198; 199; 200; 201; 202; 203; 204; 205; 206; 207; 208; 209; 210; 214; 215; 216; 217; 218; 219; 220; 221; 222; 223] @@ -831,7 +841,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove REWRITE_TAC[LENGTH] THEN ARITH_TAC; ALL_TAC] THEN - ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_CORE_EXEC [225; 226; 229; 230; 233; 234; 237; 238; 241] (224--242) THEN SUBGOAL_THEN @@ -857,7 +867,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** The cross-multiplication ***) - ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_CORE_EXEC [243; 244; 245; 246; 248; 250; 252; 254; 255; 256; 257; 258; 259; 260; 261; 262; 263; 264; 265; 271; 276; 278; 279; 285; 290; 292; 293; 294; 295; 296; 297; 303; 308; 310; 311; 312; @@ -895,7 +905,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** Addition of the rotated cross-product to the running total ***) - ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_CORE_EXEC [364; 366; 369; 372; 376; 379; 383; 386; 391] (362--391) THEN MAP_EVERY ABBREV_TAC [`m0:int64 = word_subword @@ -993,7 +1003,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** Splitting up and stuffing 1 bits into the low part ***) - ARM_STEPS_TAC BIGNUM_SQR_P521_EXEC (392--394) THEN + ARM_STEPS_TAC BIGNUM_SQR_P521_CORE_EXEC (392--394) THEN RULE_ASSUM_TAC(REWRITE_RULE[GSYM WORD_AND_ASSOC; DIMINDEX_64; NUM_REDUCE_CONV `9 MOD 64`]) THEN REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (vfree_in `h:num` o concl))) THEN @@ -1006,7 +1016,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** The comparison in its direct condensed form ***) - ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_EXEC (395--397) (395--397) THEN + ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_CORE_EXEC (395--397) (395--397) THEN SUBGOAL_THEN `carry_s397 <=> 2 EXP 192 <= @@ -1021,7 +1031,7 @@ let BIGNUM_SQR_P521_CORRECT = time prove (*** Finish the simulation before completing the mathematics ***) - ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_EXEC (398--406) (398--412) THEN + ARM_ACCSTEPS_TAC BIGNUM_SQR_P521_CORE_EXEC (398--406) (398--412) THEN ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN CONV_TAC(LAND_CONV BIGNUM_LEXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN DISCARD_STATE_TAC "s412" THEN @@ -1132,6 +1142,26 @@ let BIGNUM_SQR_P521_CORRECT = time prove COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; +let BIGNUM_SQR_P521_CORRECT = time prove + (`!z x n pc. + nonoverlapping (word pc,LENGTH bignum_sqr_p521_mc) (z,8 * 9) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_sqr_p521_mc /\ + read PC s = word(pc+12) /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,9) s = n) + (\s. read PC s = word (pc + 12 + LENGTH bignum_sqr_p521_core_mc) /\ + (n < p_521 + ==> bignum_from_memory (z,9) s = (n EXP 2) MOD p_521)) + (MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; X13; + X14; X15; X16; X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9)])`, + + ARM_SUB_LIST_OF_MC_TAC BIGNUM_SQR_P521_CORE_CORRECT + bignum_sqr_p521_core_mc_def + [fst BIGNUM_SQR_P521_CORE_EXEC;fst BIGNUM_SQR_P521_EXEC]);; + let BIGNUM_SQR_P521_SUBROUTINE_CORRECT = time prove (`!z x n pc stackpointer returnaddress. aligned 16 stackpointer /\ @@ -1152,6 +1182,9 @@ let BIGNUM_SQR_P521_SUBROUTINE_CORRECT = time prove (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, MAYCHANGE [memory :> bytes(z,8 * 9); memory :> bytes(word_sub stackpointer (word 48),48)])`, + let th = CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV) + (REWRITE_RULE [fst BIGNUM_SQR_P521_CORE_EXEC;fst BIGNUM_SQR_P521_EXEC] + BIGNUM_SQR_P521_CORRECT) in ARM_ADD_RETURN_STACK_TAC - BIGNUM_SQR_P521_EXEC BIGNUM_SQR_P521_CORRECT + BIGNUM_SQR_P521_EXEC th `[X19;X20;X21;X22;X23;X24]` 48);; diff --git a/arm/proofs/bignum_sqr_p521_neon.ml b/arm/proofs/bignum_sqr_p521_neon.ml new file mode 100644 index 00000000..a537603c --- /dev/null +++ b/arm/proofs/bignum_sqr_p521_neon.ml @@ -0,0 +1,1021 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Squaring modulo p_521, the field characteristic for the NIST P-521 curve. *) +(* ========================================================================= *) + +needs "arm/proofs/bignum_sqr_p521.ml";; +needs "arm/proofs/equiv.ml";; +needs "arm/proofs/neon_helper.ml";; + +(* This is the intermediate program that is equivalent to the core parts of + bignum_sqr_p521 and bignum_sqr_p521_neon. This is a vectorized + version of core of bignum_sqr_p521 but instructions are unscheduled. *) + +let bignum_sqr_p521_interm1_ops:int list = [ + 0xa9404c34; (* ldp x20, x19, [x1] *) + 0x3dc00037; (* ldr q23, [x1] *) + 0x3dc00021; (* ldr q1, [x1] *) + 0x3dc00030; (* ldr q16, [x1] *) + 0xa941302e; (* ldp x14, x12, [x1, #16] *) + 0x3dc0043c; (* ldr q28, [x1, #16] *) + 0x3dc0043f; (* ldr q31, [x1, #16] *) + 0xa9420829; (* ldp x9, x2, [x1, #32] *) + 0x3dc0083d; (* ldr q29, [x1, #32] *) + 0x3dc00824; (* ldr q4, [x1, #32] *) + 0x3dc00025; (* ldr q5, [x1] *) + 0x3dc00822; (* ldr q2, [x1, #32] *) + 0xa9433426; (* ldp x6, x13, [x1, #48] *) + 0x3dc00c38; (* ldr q24, [x1, #48] *) + 0x3dc00c3b; (* ldr q27, [x1, #48] *) + 0x3dc00420; (* ldr q0, [x1, #16] *) + 0x3dc00c3e; (* ldr q30, [x1, #48] *) + 0x9b067d31; (* mul x17, x9, x6 *) + 0x9b0d7c4a; (* mul x10, x2, x13 *) + 0x9bc67d38; (* umulh x24, x9, x6 *) + 0xeb020124; (* subs x4, x9, x2 *) + 0xda842484; (* cneg x4, x4, cc // cc = lo, ul, last *) + 0xda9f23f0; (* csetm x16, cc // cc = lo, ul, last *) + 0xeb0601a3; (* subs x3, x13, x6 *) + 0xda832477; (* cneg x23, x3, cc // cc = lo, ul, last *) + 0x9b177c83; (* mul x3, x4, x23 *) + 0x9bd77c84; (* umulh x4, x4, x23 *) + 0xda902216; (* cinv x22, x16, cc // cc = lo, ul, last *) + 0xca160077; (* eor x23, x3, x22 *) + 0xca160090; (* eor x16, x4, x22 *) + 0xab180223; (* adds x3, x17, x24 *) + 0x9a1f0318; (* adc x24, x24, xzr *) + 0x9bcd7c44; (* umulh x4, x2, x13 *) + 0xab0a0063; (* adds x3, x3, x10 *) + 0xba040318; (* adcs x24, x24, x4 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab0a0318; (* adds x24, x24, x10 *) + 0x9a1f008a; (* adc x10, x4, xzr *) + 0xb10006df; (* cmn x22, #0x1 *) + 0xba170064; (* adcs x4, x3, x23 *) + 0xba100318; (* adcs x24, x24, x16 *) + 0x9a16014a; (* adc x10, x10, x22 *) + 0xab110228; (* adds x8, x17, x17 *) + 0xba040096; (* adcs x22, x4, x4 *) + 0xba180305; (* adcs x5, x24, x24 *) + 0xba0a014b; (* adcs x11, x10, x10 *) + 0x9a1f03f7; (* adc x23, xzr, xzr *) + 0x6f00e5f9; (* movi v25.2d, #0xffffffff *) + 0x4e845893; (* uzp2 v19.4s, v4.4s, v4.4s *) + 0x0ea12bba; (* xtn v26.2s, v29.2d *) + 0x0ea12896; (* xtn v22.2s, v4.2d *) + 0x4ea00884; (* rev64 v4.4s, v4.4s *) + 0x2eb6c347; (* umull v7.2d, v26.2s, v22.2s *) + 0x2eb3c355; (* umull v21.2d, v26.2s, v19.2s *) + 0x4e9d5bb1; (* uzp2 v17.4s, v29.4s, v29.4s *) + 0x4ebd9c84; (* mul v4.4s, v4.4s, v29.4s *) + 0x6f6014f5; (* usra v21.2d, v7.2d, #32 *) + 0x2eb3c232; (* umull v18.2d, v17.2s, v19.2s *) + 0x6ea02884; (* uaddlp v4.2d, v4.4s *) + 0x4e391ea7; (* and v7.16b, v21.16b, v25.16b *) + 0x2eb68227; (* umlal v7.2d, v17.2s, v22.2s *) + 0x4f605484; (* shl v4.2d, v4.2d, #32 *) + 0x6f6016b2; (* usra v18.2d, v21.2d, #32 *) + 0x2eb68344; (* umlal v4.2d, v26.2s, v22.2s *) + 0x6f6014f2; (* usra v18.2d, v7.2d, #32 *) + 0x4e083c8f; (* mov x15, v4.d[0] *) + 0x4e183c90; (* mov x16, v4.d[1] *) + 0x9b027d23; (* mul x3, x9, x2 *) + 0x4e083e4a; (* mov x10, v18.d[0] *) + 0x4e183e51; (* mov x17, v18.d[1] *) + 0x9bc27d24; (* umulh x4, x9, x2 *) + 0xab030158; (* adds x24, x10, x3 *) + 0xba04020a; (* adcs x10, x16, x4 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab030307; (* adds x7, x24, x3 *) + 0xba04014a; (* adcs x10, x10, x4 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab0a0108; (* adds x8, x8, x10 *) + 0xba1102d6; (* adcs x22, x22, x17 *) + 0xba1f00b5; (* adcs x21, x5, xzr *) + 0xba1f0165; (* adcs x5, x11, xzr *) + 0x9a1f02eb; (* adc x11, x23, xzr *) + 0x6f00e5f9; (* movi v25.2d, #0xffffffff *) + 0x4e9b5b73; (* uzp2 v19.4s, v27.4s, v27.4s *) + 0x0ea12b1a; (* xtn v26.2s, v24.2d *) + 0x0ea12b76; (* xtn v22.2s, v27.2d *) + 0x4ea00b64; (* rev64 v4.4s, v27.4s *) + 0x2eb6c347; (* umull v7.2d, v26.2s, v22.2s *) + 0x2eb3c355; (* umull v21.2d, v26.2s, v19.2s *) + 0x4e985b11; (* uzp2 v17.4s, v24.4s, v24.4s *) + 0x4eb89c84; (* mul v4.4s, v4.4s, v24.4s *) + 0x6f6014f5; (* usra v21.2d, v7.2d, #32 *) + 0x2eb3c232; (* umull v18.2d, v17.2s, v19.2s *) + 0x6ea02884; (* uaddlp v4.2d, v4.4s *) + 0x4e391ea7; (* and v7.16b, v21.16b, v25.16b *) + 0x2eb68227; (* umlal v7.2d, v17.2s, v22.2s *) + 0x4f605484; (* shl v4.2d, v4.2d, #32 *) + 0x6f6016b2; (* usra v18.2d, v21.2d, #32 *) + 0x2eb68344; (* umlal v4.2d, v26.2s, v22.2s *) + 0x6f6014f2; (* usra v18.2d, v7.2d, #32 *) + 0x4e083c97; (* mov x23, v4.d[0] *) + 0x4e183c90; (* mov x16, v4.d[1] *) + 0x9b0d7cc3; (* mul x3, x6, x13 *) + 0x4e083e4a; (* mov x10, v18.d[0] *) + 0x4e183e51; (* mov x17, v18.d[1] *) + 0x9bcd7cc4; (* umulh x4, x6, x13 *) + 0xab030158; (* adds x24, x10, x3 *) + 0xba04020a; (* adcs x10, x16, x4 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab030318; (* adds x24, x24, x3 *) + 0xba04014a; (* adcs x10, x10, x4 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab1502f7; (* adds x23, x23, x21 *) + 0xba050310; (* adcs x16, x24, x5 *) + 0xba0b0143; (* adcs x3, x10, x11 *) + 0x9a1f0235; (* adc x21, x17, xzr *) + 0xf9402031; (* ldr x17, [x1, #64] *) + 0x8b110225; (* add x5, x17, x17 *) + 0x9b117e2b; (* mul x11, x17, x17 *) + 0x9240ce91; (* and x17, x20, #0xfffffffffffff *) + 0x9b117ca4; (* mul x4, x5, x17 *) + 0x93d4d271; (* extr x17, x19, x20, #52 *) + 0x9240ce31; (* and x17, x17, #0xfffffffffffff *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374fc91; (* lsr x17, x4, #52 *) + 0x8b110158; (* add x24, x10, x17 *) + 0xd374cc91; (* lsl x17, x4, #12 *) + 0x93d13311; (* extr x17, x24, x17, #12 *) + 0xab1101ef; (* adds x15, x15, x17 *) + 0x93d3a1d1; (* extr x17, x14, x19, #40 *) + 0x9240ce31; (* and x17, x17, #0xfffffffffffff *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374ff11; (* lsr x17, x24, #52 *) + 0x8b110144; (* add x4, x10, x17 *) + 0xd374cf11; (* lsl x17, x24, #12 *) + 0x93d16091; (* extr x17, x4, x17, #24 *) + 0xba1100e7; (* adcs x7, x7, x17 *) + 0x93ce7191; (* extr x17, x12, x14, #28 *) + 0x9240ce31; (* and x17, x17, #0xfffffffffffff *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374fc91; (* lsr x17, x4, #52 *) + 0x8b110158; (* add x24, x10, x17 *) + 0xd374cc91; (* lsl x17, x4, #12 *) + 0x93d19311; (* extr x17, x24, x17, #36 *) + 0xba110108; (* adcs x8, x8, x17 *) + 0x93cc4131; (* extr x17, x9, x12, #16 *) + 0x9240ce31; (* and x17, x17, #0xfffffffffffff *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374ff11; (* lsr x17, x24, #52 *) + 0x8b110144; (* add x4, x10, x17 *) + 0xd374cf11; (* lsl x17, x24, #12 *) + 0x93d1c091; (* extr x17, x4, x17, #48 *) + 0xba1102d6; (* adcs x22, x22, x17 *) + 0xd344fd31; (* lsr x17, x9, #4 *) + 0x9240ce31; (* and x17, x17, #0xfffffffffffff *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374fc91; (* lsr x17, x4, #52 *) + 0x8b110158; (* add x24, x10, x17 *) + 0xd374cc91; (* lsl x17, x4, #12 *) + 0x93d1f304; (* extr x4, x24, x17, #60 *) + 0x93c9e051; (* extr x17, x2, x9, #56 *) + 0x9240ce31; (* and x17, x17, #0xfffffffffffff *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374ff11; (* lsr x17, x24, #52 *) + 0x8b110158; (* add x24, x10, x17 *) + 0xd378dc91; (* lsl x17, x4, #8 *) + 0x93d12311; (* extr x17, x24, x17, #8 *) + 0xba1102f7; (* adcs x23, x23, x17 *) + 0x93c2b0d1; (* extr x17, x6, x2, #44 *) + 0x9240ce31; (* and x17, x17, #0xfffffffffffff *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374ff11; (* lsr x17, x24, #52 *) + 0x8b110144; (* add x4, x10, x17 *) + 0xd374cf11; (* lsl x17, x24, #12 *) + 0x93d15091; (* extr x17, x4, x17, #20 *) + 0xba110210; (* adcs x16, x16, x17 *) + 0x93c681b1; (* extr x17, x13, x6, #32 *) + 0x9240ce31; (* and x17, x17, #0xfffffffffffff *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374fc91; (* lsr x17, x4, #52 *) + 0x8b110158; (* add x24, x10, x17 *) + 0xd374cc91; (* lsl x17, x4, #12 *) + 0x93d18311; (* extr x17, x24, x17, #32 *) + 0xba110063; (* adcs x3, x3, x17 *) + 0xd354fdb1; (* lsr x17, x13, #20 *) + 0x9b117caa; (* mul x10, x5, x17 *) + 0xd374ff11; (* lsr x17, x24, #52 *) + 0x8b11014a; (* add x10, x10, x17 *) + 0xd374cf11; (* lsl x17, x24, #12 *) + 0x93d1b151; (* extr x17, x10, x17, #44 *) + 0xba1102a4; (* adcs x4, x21, x17 *) + 0xd36cfd51; (* lsr x17, x10, #44 *) + 0x9a110178; (* adc x24, x11, x17 *) + 0x93cf24ea; (* extr x10, x7, x15, #9 *) + 0x93c72511; (* extr x17, x8, x7, #9 *) + 0xa900440a; (* stp x10, x17, [x0] *) + 0x93c826ca; (* extr x10, x22, x8, #9 *) + 0x93d626f1; (* extr x17, x23, x22, #9 *) + 0xa901440a; (* stp x10, x17, [x0, #16] *) + 0x93d7260a; (* extr x10, x16, x23, #9 *) + 0x93d02471; (* extr x17, x3, x16, #9 *) + 0xa902440a; (* stp x10, x17, [x0, #32] *) + 0x93c3248a; (* extr x10, x4, x3, #9 *) + 0x93c42711; (* extr x17, x24, x4, #9 *) + 0xa903440a; (* stp x10, x17, [x0, #48] *) + 0x924021ea; (* and x10, x15, #0x1ff *) + 0xd349ff11; (* lsr x17, x24, #9 *) + 0x8b110151; (* add x17, x10, x17 *) + 0xf9002011; (* str x17, [x0, #64] *) + 0x4e971b91; (* uzp1 v17.4s, v28.4s, v23.4s *) + 0x4ea00b84; (* rev64 v4.4s, v28.4s *) + 0x4e971ae7; (* uzp1 v7.4s, v23.4s, v23.4s *) + 0x4eb79c84; (* mul v4.4s, v4.4s, v23.4s *) + 0x6ea02884; (* uaddlp v4.2d, v4.4s *) + 0x4f605484; (* shl v4.2d, v4.2d, #32 *) + 0x2eb180e4; (* umlal v4.2d, v7.2s, v17.2s *) + 0x4e083c88; (* mov x8, v4.d[0] *) + 0x4e183c96; (* mov x22, v4.d[1] *) + 0x9bce7e97; (* umulh x23, x20, x14 *) + 0xeb130291; (* subs x17, x20, x19 *) + 0xda912624; (* cneg x4, x17, cc // cc = lo, ul, last *) + 0xda9f23f8; (* csetm x24, cc // cc = lo, ul, last *) + 0xeb0e0191; (* subs x17, x12, x14 *) + 0xda912631; (* cneg x17, x17, cc // cc = lo, ul, last *) + 0x9b117c8a; (* mul x10, x4, x17 *) + 0x9bd17c91; (* umulh x17, x4, x17 *) + 0xda982310; (* cinv x16, x24, cc // cc = lo, ul, last *) + 0xca100143; (* eor x3, x10, x16 *) + 0xca100224; (* eor x4, x17, x16 *) + 0xab170118; (* adds x24, x8, x23 *) + 0x9a1f02ea; (* adc x10, x23, xzr *) + 0x9bcc7e71; (* umulh x17, x19, x12 *) + 0xab160318; (* adds x24, x24, x22 *) + 0xba11014a; (* adcs x10, x10, x17 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab16014a; (* adds x10, x10, x22 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xb100061f; (* cmn x16, #0x1 *) + 0xba030318; (* adcs x24, x24, x3 *) + 0xba04014a; (* adcs x10, x10, x4 *) + 0x9a100231; (* adc x17, x17, x16 *) + 0xab08010f; (* adds x15, x8, x8 *) + 0xba180307; (* adcs x7, x24, x24 *) + 0xba0a0148; (* adcs x8, x10, x10 *) + 0xba110236; (* adcs x22, x17, x17 *) + 0x9a1f03f7; (* adc x23, xzr, xzr *) + 0x6f00e5f9; (* movi v25.2d, #0xffffffff *) + 0x4e905a13; (* uzp2 v19.4s, v16.4s, v16.4s *) + 0x0ea1283a; (* xtn v26.2s, v1.2d *) + 0x0ea12a16; (* xtn v22.2s, v16.2d *) + 0x4ea00a04; (* rev64 v4.4s, v16.4s *) + 0x2eb6c347; (* umull v7.2d, v26.2s, v22.2s *) + 0x2eb3c355; (* umull v21.2d, v26.2s, v19.2s *) + 0x4e815831; (* uzp2 v17.4s, v1.4s, v1.4s *) + 0x4ea19c84; (* mul v4.4s, v4.4s, v1.4s *) + 0x6f6014f5; (* usra v21.2d, v7.2d, #32 *) + 0x2eb3c232; (* umull v18.2d, v17.2s, v19.2s *) + 0x6ea02884; (* uaddlp v4.2d, v4.4s *) + 0x4e391ea7; (* and v7.16b, v21.16b, v25.16b *) + 0x2eb68227; (* umlal v7.2d, v17.2s, v22.2s *) + 0x4f605484; (* shl v4.2d, v4.2d, #32 *) + 0x6f6016b2; (* usra v18.2d, v21.2d, #32 *) + 0x2eb68344; (* umlal v4.2d, v26.2s, v22.2s *) + 0x6f6014f2; (* usra v18.2d, v7.2d, #32 *) + 0x4e083c95; (* mov x21, v4.d[0] *) + 0x4e183c90; (* mov x16, v4.d[1] *) + 0x9b137e83; (* mul x3, x20, x19 *) + 0x4e083e4a; (* mov x10, v18.d[0] *) + 0x4e183e51; (* mov x17, v18.d[1] *) + 0x9bd37e84; (* umulh x4, x20, x19 *) + 0xab030158; (* adds x24, x10, x3 *) + 0xba04020a; (* adcs x10, x16, x4 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab030305; (* adds x5, x24, x3 *) + 0xba04014a; (* adcs x10, x10, x4 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab0a01eb; (* adds x11, x15, x10 *) + 0xba1100ef; (* adcs x15, x7, x17 *) + 0xba1f0107; (* adcs x7, x8, xzr *) + 0xba1f02c8; (* adcs x8, x22, xzr *) + 0x9a1f02f6; (* adc x22, x23, xzr *) + 0x0ea12be7; (* xtn v7.2s, v31.2d *) + 0x0f2087e4; (* shrn v4.2s, v31.2d, #32 *) + 0x2ea4c0e4; (* umull v4.2d, v7.2s, v4.2s *) + 0x4f615484; (* shl v4.2d, v4.2d, #33 *) + 0x2ea780e4; (* umlal v4.2d, v7.2s, v7.2s *) + 0x4e083c97; (* mov x23, v4.d[0] *) + 0x4e183c90; (* mov x16, v4.d[1] *) + 0x9b0c7dc3; (* mul x3, x14, x12 *) + 0x9bce7dca; (* umulh x10, x14, x14 *) + 0x9bcc7d91; (* umulh x17, x12, x12 *) + 0x9bcc7dc4; (* umulh x4, x14, x12 *) + 0xab030158; (* adds x24, x10, x3 *) + 0xba04020a; (* adcs x10, x16, x4 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab030318; (* adds x24, x24, x3 *) + 0xba04014a; (* adcs x10, x10, x4 *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab0702f0; (* adds x16, x23, x7 *) + 0xba080303; (* adcs x3, x24, x8 *) + 0xba160144; (* adcs x4, x10, x22 *) + 0x9a1f0238; (* adc x24, x17, xzr *) + 0xa940440a; (* ldp x10, x17, [x0] *) + 0xab15014a; (* adds x10, x10, x21 *) + 0xba050231; (* adcs x17, x17, x5 *) + 0xa900440a; (* stp x10, x17, [x0] *) + 0xa941440a; (* ldp x10, x17, [x0, #16] *) + 0xba0b014a; (* adcs x10, x10, x11 *) + 0xba0f0231; (* adcs x17, x17, x15 *) + 0xa901440a; (* stp x10, x17, [x0, #16] *) + 0xa942440a; (* ldp x10, x17, [x0, #32] *) + 0xba10014a; (* adcs x10, x10, x16 *) + 0xba030231; (* adcs x17, x17, x3 *) + 0xa902440a; (* stp x10, x17, [x0, #32] *) + 0xa943440a; (* ldp x10, x17, [x0, #48] *) + 0xba04014a; (* adcs x10, x10, x4 *) + 0xba180231; (* adcs x17, x17, x24 *) + 0xa903440a; (* stp x10, x17, [x0, #48] *) + 0xf9402011; (* ldr x17, [x0, #64] *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xf9002011; (* str x17, [x0, #64] *) + 0x6f00e5f9; (* movi v25.2d, #0xffffffff *) + 0x4e825853; (* uzp2 v19.4s, v2.4s, v2.4s *) + 0x0ea128ba; (* xtn v26.2s, v5.2d *) + 0x0ea12856; (* xtn v22.2s, v2.2d *) + 0x4ea00844; (* rev64 v4.4s, v2.4s *) + 0x2eb6c347; (* umull v7.2d, v26.2s, v22.2s *) + 0x2eb3c355; (* umull v21.2d, v26.2s, v19.2s *) + 0x4e8558b1; (* uzp2 v17.4s, v5.4s, v5.4s *) + 0x4ea59c84; (* mul v4.4s, v4.4s, v5.4s *) + 0x6f6014f5; (* usra v21.2d, v7.2d, #32 *) + 0x2eb3c232; (* umull v18.2d, v17.2s, v19.2s *) + 0x6ea02884; (* uaddlp v4.2d, v4.4s *) + 0x4e391ea7; (* and v7.16b, v21.16b, v25.16b *) + 0x2eb68227; (* umlal v7.2d, v17.2s, v22.2s *) + 0x4f605484; (* shl v4.2d, v4.2d, #32 *) + 0x6f6016b2; (* usra v18.2d, v21.2d, #32 *) + 0x2eb68344; (* umlal v4.2d, v26.2s, v22.2s *) + 0x6f6014f2; (* usra v18.2d, v7.2d, #32 *) + 0x4e083c85; (* mov x5, v4.d[0] *) + 0x4e183c84; (* mov x4, v4.d[1] *) + 0x6f00e5f9; (* movi v25.2d, #0xffffffff *) + 0x4e9e5bd1; (* uzp2 v17.4s, v30.4s, v30.4s *) + 0x0ea12813; (* xtn v19.2s, v0.2d *) + 0x0ea12bda; (* xtn v26.2s, v30.2d *) + 0x4ea00bc4; (* rev64 v4.4s, v30.4s *) + 0x2ebac267; (* umull v7.2d, v19.2s, v26.2s *) + 0x2eb1c276; (* umull v22.2d, v19.2s, v17.2s *) + 0x4e805815; (* uzp2 v21.4s, v0.4s, v0.4s *) + 0x4ea09c84; (* mul v4.4s, v4.4s, v0.4s *) + 0x6f6014f6; (* usra v22.2d, v7.2d, #32 *) + 0x2eb1c2b1; (* umull v17.2d, v21.2s, v17.2s *) + 0x6ea02884; (* uaddlp v4.2d, v4.4s *) + 0x4e391ec7; (* and v7.16b, v22.16b, v25.16b *) + 0x2eba82a7; (* umlal v7.2d, v21.2s, v26.2s *) + 0x4f605484; (* shl v4.2d, v4.2d, #32 *) + 0x6f6016d1; (* usra v17.2d, v22.2d, #32 *) + 0x2eba8264; (* umlal v4.2d, v19.2s, v26.2s *) + 0x6f6014f1; (* usra v17.2d, v7.2d, #32 *) + 0x4e083c98; (* mov x24, v4.d[0] *) + 0x4e183c8a; (* mov x10, v4.d[1] *) + 0x4e083e51; (* mov x17, v18.d[0] *) + 0xab110084; (* adds x4, x4, x17 *) + 0x4e183e51; (* mov x17, v18.d[1] *) + 0xba110318; (* adcs x24, x24, x17 *) + 0x4e083e31; (* mov x17, v17.d[0] *) + 0xba11014a; (* adcs x10, x10, x17 *) + 0x4e183e31; (* mov x17, v17.d[1] *) + 0x9a1f0231; (* adc x17, x17, xzr *) + 0xab05008f; (* adds x15, x4, x5 *) + 0xba040304; (* adcs x4, x24, x4 *) + 0xba180158; (* adcs x24, x10, x24 *) + 0xba0a022a; (* adcs x10, x17, x10 *) + 0x9a1103f1; (* adc x17, xzr, x17 *) + 0xab050087; (* adds x7, x4, x5 *) + 0xba0f0308; (* adcs x8, x24, x15 *) + 0xba040156; (* adcs x22, x10, x4 *) + 0xba180237; (* adcs x23, x17, x24 *) + 0xba0a03f0; (* adcs x16, xzr, x10 *) + 0x9a1103e3; (* adc x3, xzr, x17 *) + 0xeb0c01d1; (* subs x17, x14, x12 *) + 0xda912638; (* cneg x24, x17, cc // cc = lo, ul, last *) + 0xda9f23e4; (* csetm x4, cc // cc = lo, ul, last *) + 0xeb0601b1; (* subs x17, x13, x6 *) + 0xda91262a; (* cneg x10, x17, cc // cc = lo, ul, last *) + 0x9b0a7f11; (* mul x17, x24, x10 *) + 0x9bca7f18; (* umulh x24, x24, x10 *) + 0xda84208a; (* cinv x10, x4, cc // cc = lo, ul, last *) + 0xb100055f; (* cmn x10, #0x1 *) + 0xca0a0231; (* eor x17, x17, x10 *) + 0xba1102f7; (* adcs x23, x23, x17 *) + 0xca0a0311; (* eor x17, x24, x10 *) + 0xba110210; (* adcs x16, x16, x17 *) + 0x9a0a0063; (* adc x3, x3, x10 *) + 0xeb130291; (* subs x17, x20, x19 *) + 0xda912638; (* cneg x24, x17, cc // cc = lo, ul, last *) + 0xda9f23e4; (* csetm x4, cc // cc = lo, ul, last *) + 0xeb090051; (* subs x17, x2, x9 *) + 0xda91262a; (* cneg x10, x17, cc // cc = lo, ul, last *) + 0x9b0a7f11; (* mul x17, x24, x10 *) + 0x9bca7f18; (* umulh x24, x24, x10 *) + 0xda84208a; (* cinv x10, x4, cc // cc = lo, ul, last *) + 0xb100055f; (* cmn x10, #0x1 *) + 0xca0a0231; (* eor x17, x17, x10 *) + 0xba1101eb; (* adcs x11, x15, x17 *) + 0xca0a0311; (* eor x17, x24, x10 *) + 0xba1100ef; (* adcs x15, x7, x17 *) + 0xba0a0107; (* adcs x7, x8, x10 *) + 0xba0a02d6; (* adcs x22, x22, x10 *) + 0xba0a02f7; (* adcs x23, x23, x10 *) + 0xba0a0210; (* adcs x16, x16, x10 *) + 0x9a0a0063; (* adc x3, x3, x10 *) + 0xeb0c0271; (* subs x17, x19, x12 *) + 0xda912638; (* cneg x24, x17, cc // cc = lo, ul, last *) + 0xda9f23e4; (* csetm x4, cc // cc = lo, ul, last *) + 0xeb0201b1; (* subs x17, x13, x2 *) + 0xda91262a; (* cneg x10, x17, cc // cc = lo, ul, last *) + 0x9b0a7f11; (* mul x17, x24, x10 *) + 0x9bca7f18; (* umulh x24, x24, x10 *) + 0xda84208a; (* cinv x10, x4, cc // cc = lo, ul, last *) + 0xb100055f; (* cmn x10, #0x1 *) + 0xca0a0231; (* eor x17, x17, x10 *) + 0xba1102c8; (* adcs x8, x22, x17 *) + 0xca0a0311; (* eor x17, x24, x10 *) + 0xba1102f7; (* adcs x23, x23, x17 *) + 0xba0a0210; (* adcs x16, x16, x10 *) + 0x9a0a0063; (* adc x3, x3, x10 *) + 0xeb0e0291; (* subs x17, x20, x14 *) + 0xda912638; (* cneg x24, x17, cc // cc = lo, ul, last *) + 0xda9f23e4; (* csetm x4, cc // cc = lo, ul, last *) + 0xeb0900d1; (* subs x17, x6, x9 *) + 0xda91262a; (* cneg x10, x17, cc // cc = lo, ul, last *) + 0x9b0a7f11; (* mul x17, x24, x10 *) + 0x9bca7f18; (* umulh x24, x24, x10 *) + 0xda84208a; (* cinv x10, x4, cc // cc = lo, ul, last *) + 0xb100055f; (* cmn x10, #0x1 *) + 0xca0a0231; (* eor x17, x17, x10 *) + 0xba1101f6; (* adcs x22, x15, x17 *) + 0xca0a0311; (* eor x17, x24, x10 *) + 0xba1100e4; (* adcs x4, x7, x17 *) + 0xba0a0118; (* adcs x24, x8, x10 *) + 0xba0a02f7; (* adcs x23, x23, x10 *) + 0xba0a0210; (* adcs x16, x16, x10 *) + 0x9a0a0063; (* adc x3, x3, x10 *) + 0xeb0c028c; (* subs x12, x20, x12 *) + 0xda8c258a; (* cneg x10, x12, cc // cc = lo, ul, last *) + 0xda9f23f1; (* csetm x17, cc // cc = lo, ul, last *) + 0xeb0901ac; (* subs x12, x13, x9 *) + 0xda8c2589; (* cneg x9, x12, cc // cc = lo, ul, last *) + 0x9b097d4c; (* mul x12, x10, x9 *) + 0x9bc97d4d; (* umulh x13, x10, x9 *) + 0xda912229; (* cinv x9, x17, cc // cc = lo, ul, last *) + 0xb100053f; (* cmn x9, #0x1 *) + 0xca09018c; (* eor x12, x12, x9 *) + 0xba0c0084; (* adcs x4, x4, x12 *) + 0xca0901ac; (* eor x12, x13, x9 *) + 0xba0c0318; (* adcs x24, x24, x12 *) + 0xba0902ea; (* adcs x10, x23, x9 *) + 0xba090211; (* adcs x17, x16, x9 *) + 0x9a09006d; (* adc x13, x3, x9 *) + 0xeb0e0273; (* subs x19, x19, x14 *) + 0xda93266c; (* cneg x12, x19, cc // cc = lo, ul, last *) + 0xda9f23e9; (* csetm x9, cc // cc = lo, ul, last *) + 0xeb0200c6; (* subs x6, x6, x2 *) + 0xda8624ce; (* cneg x14, x6, cc // cc = lo, ul, last *) + 0x9b0e7d93; (* mul x19, x12, x14 *) + 0x9bce7d8c; (* umulh x12, x12, x14 *) + 0xda89212e; (* cinv x14, x9, cc // cc = lo, ul, last *) + 0xb10005df; (* cmn x14, #0x1 *) + 0xca0e0273; (* eor x19, x19, x14 *) + 0xba130097; (* adcs x23, x4, x19 *) + 0xca0e0193; (* eor x19, x12, x14 *) + 0xba130310; (* adcs x16, x24, x19 *) + 0xba0e0146; (* adcs x6, x10, x14 *) + 0xba0e0222; (* adcs x2, x17, x14 *) + 0x9a0e01a9; (* adc x9, x13, x14 *) + 0xa940380c; (* ldp x12, x14, [x0] *) + 0x93d020d3; (* extr x19, x6, x16, #8 *) + 0xab0c026a; (* adds x10, x19, x12 *) + 0x93c62053; (* extr x19, x2, x6, #8 *) + 0xba0e0271; (* adcs x17, x19, x14 *) + 0xa941300e; (* ldp x14, x12, [x0, #16] *) + 0x93c22133; (* extr x19, x9, x2, #8 *) + 0xba0e026d; (* adcs x13, x19, x14 *) + 0x8a0d022e; (* and x14, x17, x13 *) + 0xd348fd33; (* lsr x19, x9, #8 *) + 0xba0c0266; (* adcs x6, x19, x12 *) + 0x8a0601c9; (* and x9, x14, x6 *) + 0xa942300e; (* ldp x14, x12, [x0, #32] *) + 0xd37ff8b3; (* lsl x19, x5, #1 *) + 0xba0e0262; (* adcs x2, x19, x14 *) + 0x8a02012e; (* and x14, x9, x2 *) + 0x93c5fd73; (* extr x19, x11, x5, #63 *) + 0xba0c0263; (* adcs x3, x19, x12 *) + 0x8a0301c9; (* and x9, x14, x3 *) + 0xa943300e; (* ldp x14, x12, [x0, #48] *) + 0x93cbfed3; (* extr x19, x22, x11, #63 *) + 0xba0e0264; (* adcs x4, x19, x14 *) + 0x8a04012e; (* and x14, x9, x4 *) + 0x93d6fef3; (* extr x19, x23, x22, #63 *) + 0xba0c0278; (* adcs x24, x19, x12 *) + 0x8a1801cc; (* and x12, x14, x24 *) + 0xf940200e; (* ldr x14, [x0, #64] *) + 0x93d7fe13; (* extr x19, x16, x23, #63 *) + 0x92402273; (* and x19, x19, #0x1ff *) + 0x9a1301d3; (* adc x19, x14, x19 *) + 0xd349fe6e; (* lsr x14, x19, #9 *) + 0xb277da73; (* orr x19, x19, #0xfffffffffffffe00 *) + 0xeb1f03ff; (* cmp xzr, xzr *) + 0xba0e015f; (* adcs xzr, x10, x14 *) + 0xba1f019f; (* adcs xzr, x12, xzr *) + 0xba1f027f; (* adcs xzr, x19, xzr *) + 0xba0e014a; (* adcs x10, x10, x14 *) + 0xba1f0231; (* adcs x17, x17, xzr *) + 0xba1f01ad; (* adcs x13, x13, xzr *) + 0xba1f00c6; (* adcs x6, x6, xzr *) + 0xba1f0042; (* adcs x2, x2, xzr *) + 0xba1f0069; (* adcs x9, x3, xzr *) + 0xba1f008c; (* adcs x12, x4, xzr *) + 0xba1f030e; (* adcs x14, x24, xzr *) + 0x9a1f0273; (* adc x19, x19, xzr *) + 0x92402273; (* and x19, x19, #0x1ff *) + 0xa900440a; (* stp x10, x17, [x0] *) + 0xa901180d; (* stp x13, x6, [x0, #16] *) + 0xa9022402; (* stp x2, x9, [x0, #32] *) + 0xa903380c; (* stp x12, x14, [x0, #48] *) + 0xf9002013; (* str x19, [x0, #64] *) +];; + +let bignum_sqr_p521_interm1_core_mc = + let charlist = List.concat_map + (fun op32 -> + [Char.chr (Int.logand op32 255); + Char.chr (Int.logand (Int.shift_right op32 8) 255); + Char.chr (Int.logand (Int.shift_right op32 16) 255); + Char.chr (Int.logand (Int.shift_right op32 24) 255)]) + bignum_sqr_p521_interm1_ops in + let byte_list = Bytes.init (List.length charlist) (fun i -> List.nth charlist i) in + define_word_list "bignum_sqr_p521_interm1_core_mc" (term_of_bytes byte_list);; + +let BIGNUM_SQR_P521_INTERM1_CORE_EXEC = + ARM_MK_EXEC_RULE bignum_sqr_p521_interm1_core_mc;; + +let equiv_input_states = new_definition + `!s1 s1' x z. + (equiv_input_states:(armstate#armstate)->int64->int64->bool) (s1,s1') x z <=> + (C_ARGUMENTS [z; x] s1 /\ + C_ARGUMENTS [z; x] s1' /\ + ?a. bignum_from_memory (x,9) s1 = a /\ + bignum_from_memory (x,9) s1' = a)`;; + +let equiv_output_states = new_definition + `!s1 s1' z. + (equiv_output_states:(armstate#armstate)->int64->bool) (s1,s1') z <=> + (?a. + bignum_from_memory (z,9) s1 = a /\ + bignum_from_memory (z,9) s1' = a)`;; + +let actions = [ + ("equal", 0, 1, 0, 1); ("insert", 1, 1, 1, 4); + ("equal", 1, 2, 4, 5); ("insert", 2, 2, 5, 7); + ("equal", 2, 3, 7, 8); ("insert", 3, 3, 8, 12); + ("equal", 3, 4, 12, 13); ("insert", 4, 4, 13, 17); ("equal", 4, 34, 17, 47); + ("delete", 34, 36, 47, 47); ("insert", 36, 36, 47, 67); + ("equal", 36, 37, 67, 68); ("delete", 37, 39, 68, 68); + ("insert", 39, 39, 68, 70); ("equal", 39, 51, 70, 82); + ("delete", 51, 53, 82, 82); ("insert", 53, 53, 82, 102); + ("equal", 53, 54, 102, 103); ("delete", 54, 56, 103, 103); + ("insert", 56, 56, 103, 105); ("equal", 56, 160, 105, 209); + ("delete", 160, 162, 209, 209); ("insert", 162, 162, 209, 218); + ("equal", 162, 190, 218, 246); ("delete", 190, 192, 246, 246); + ("insert", 192, 192, 246, 266); ("equal", 192, 193, 266, 267); + ("delete", 193, 195, 267, 267); ("insert", 195, 195, 267, 269); + ("equal", 195, 207, 269, 281); ("delete", 207, 209, 281, 281); + ("insert", 209, 209, 281, 288); ("equal", 209, 242, 288, 321); + ("delete", 242, 247, 321, 321); ("insert", 247, 247, 321, 362); + ("equal", 247, 248, 362, 363); ("delete", 248, 249, 363, 363); + ("insert", 249, 249, 363, 364); ("equal", 249, 250, 364, 365); + ("delete", 250, 251, 365, 365); ("insert", 251, 251, 365, 366); + ("equal", 251, 252, 366, 367); ("delete", 252, 253, 367, 367); + ("insert", 253, 253, 367, 368); ("equal", 253, 412, 368, 527) +];; + +let equiv_goal1 = mk_equiv_statement + `ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_sqr_p521_core_mc); + (word pc2,LENGTH bignum_sqr_p521_interm1_core_mc)]` + equiv_input_states + equiv_output_states + bignum_sqr_p521_core_mc 0 + `MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; X13; + X14; X15; X16; X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9)]` + bignum_sqr_p521_interm1_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)]`;; + +let _org_extra_word_CONV = !extra_word_CONV;; +extra_word_CONV := + [GEN_REWRITE_CONV I [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_LO; WORD_MUL64_HI; + WORD_SQR64_HI; WORD_SQR128_DIGIT0]] + @ (!extra_word_CONV);; + +let BIGNUM_SQR_P521_CORE_EQUIV1 = time prove(equiv_goal1, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; + ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_SQR_P521_CORE_EXEC; + fst BIGNUM_SQR_P521_INTERM1_CORE_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + REPEAT (FIRST_X_ASSUM BIGNUM_EXPAND_AND_DIGITIZE_TAC) THEN + ASM_PROPAGATE_DIGIT_EQS_FROM_EXPANDED_BIGNUM_TAC THEN + (* necessary to run ldr qs *) + COMBINE_READ_BYTES64_PAIRS_TAC THEN + + (* Start *) + EQUIV_STEPS_TAC actions + BIGNUM_SQR_P521_CORE_EXEC + BIGNUM_SQR_P521_INTERM1_CORE_EXEC THEN + + REPEAT_N 2 ENSURES_FINAL_STATE'_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; + BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,9) state`; + C_ARGUMENTS] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange left **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC; + + (** SUBGOAL 3. Maychange right **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0:armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC + ]);; + +extra_word_CONV := _org_extra_word_CONV;; + + + +(****************************************************************************** + The second program equivalence between the core part of intermediate + program and fully optimized program +******************************************************************************) + +let bignum_sqr_p521_neon_mc = + define_from_elf "bignum_sqr_p521_neon_mc" + "arm/p521/bignum_sqr_p521_neon.o";; + +let BIGNUM_SQR_P521_NEON_EXEC = + ARM_MK_EXEC_RULE bignum_sqr_p521_neon_mc;; + +let bignum_sqr_p521_neon_core_mc_def, + bignum_sqr_p521_neon_core_mc, + BIGNUM_SQR_P521_NEON_CORE_EXEC = + mk_sublist_of_mc "bignum_sqr_p521_neon_core_mc" + bignum_sqr_p521_neon_mc + (`12`,`LENGTH bignum_sqr_p521_neon_mc - 28`) + (fst BIGNUM_SQR_P521_NEON_EXEC);; + + +let equiv_goal2 = mk_equiv_statement + `ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_sqr_p521_interm1_core_mc); + (word pc2,LENGTH bignum_sqr_p521_neon_core_mc)]` + equiv_input_states + equiv_output_states + bignum_sqr_p521_interm1_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)]` + bignum_sqr_p521_neon_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)]`;; + + +(* Line numbers from the fully optimized prog. to the intermediate prog. + The script that prints this map is being privately maintained by aqjune-aws. *) + +let inst_map = [10;8;9;14;13;52;68;15;21;49;56;85;23;50;51;20;87;54;22;59;53;18;91;90;62;84;19;64;57;31;93;86;48;32;33;55;89;24;60;88;25;83;58;61;26;28;94;29;34;27;3;63;35;97;36;37;92;38;39;99;71;40;65;117;30;106;101;102;41;98;95;42;43;7;44;96;69;119;2;100;118;45;4;67;46;70;47;72;6;249;73;74;75;282;251;76;1;105;250;77;78;248;211;103;255;283;79;104;120;253;12;80;17;252;81;16;247;82;107;210;254;108;121;284;11;109;110;122;257;111;123;5;256;112;113;124;285;127;125;326;258;126;114;130;66;128;131;261;259;132;262;115;138;260;325;263;322;133;139;323;134;140;135;324;136;116;286;129;143;137;213;206;141;232;329;194;142;330;144;146;212;145;147;327;195;148;196;151;154;214;155;328;156;161;149;162;150;152;163;157;331;158;215;153;169;159;164;160;170;216;165;177;171;343;166;346;178;167;179;174;168;172;185;173;186;180;175;182;181;176;183;219;184;264;189;200;187;349;188;201;197;334;192;190;198;345;217;202;333;199;270;191;193;220;344;222;203;218;221;223;350;224;227;336;348;204;230;225;347;231;233;353;207;234;268;235;226;236;338;352;205;208;237;351;267;228;238;269;332;239;229;335;342;240;337;290;241;242;356;209;243;354;357;244;266;245;355;246;271;289;358;339;292;272;359;273;274;275;276;277;291;278;279;288;280;281;293;294;303;287;295;296;297;298;299;265;300;307;301;302;304;341;311;305;319;362;308;306;309;315;312;313;314;316;361;317;318;320;381;364;382;383;384;360;385;388;366;363;386;368;365;367;369;387;310;340;395;396;321;397;398;399;402;370;401;371;372;390;373;400;406;374;375;392;376;377;378;404;379;380;389;391;393;394;413;415;414;416;420;417;403;405;418;407;408;419;409;410;411;422;412;428;429;430;445;446;447;431;435;432;421;424;423;433;425;434;426;427;448;449;452;461;450;462;463;439;464;451;437;468;465;436;438;456;440;441;454;442;466;443;444;453;455;467;457;472;497;458;493;477;459;470;460;469;482;471;500;473;504;490;474;478;489;475;476;479;480;496;483;481;503;486;484;505;485;487;491;488;494;492;498;495;499;501;506;509;508;507;510;502;511;512;513;514;515;523;516;517;524;518;525;519;520;526;521;522;527];; + +(* (state number, (equation, fresh var)) *) +let state_to_abbrevs: (int * thm) list ref = ref [];; + +let BIGNUM_SQR_P521_CORE_EQUIV2 = time prove( + equiv_goal2, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; + ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_SQR_P521_INTERM1_CORE_EXEC; + fst BIGNUM_SQR_P521_NEON_CORE_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + REPEAT (FIRST_X_ASSUM BIGNUM_EXPAND_AND_DIGITIZE_TAC) THEN + ASM_PROPAGATE_DIGIT_EQS_FROM_EXPANDED_BIGNUM_TAC THEN + (* necessary to run ldr qs *) + COMBINE_READ_BYTES64_PAIRS_TAC THEN + + (* Left *) + ARM_STEPS'_AND_ABBREV_TAC BIGNUM_SQR_P521_INTERM1_CORE_EXEC + (1--(List.length inst_map)) state_to_abbrevs THEN + + (* Right *) + ARM_STEPS'_AND_REWRITE_TAC BIGNUM_SQR_P521_NEON_CORE_EXEC + (1--(List.length inst_map)) inst_map state_to_abbrevs THEN + + REPEAT_N 2 ENSURES_FINAL_STATE'_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; + BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,9) state`; + C_ARGUMENTS] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange left **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC; + + (** SUBGOAL 3. Maychange right **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0:armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC + ]);; + + +(****************************************************************************** + Use transitivity of two program equivalences to prove end-to-end + correctness +******************************************************************************) + +let equiv_goal = mk_equiv_statement + `ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_sqr_p521_core_mc); + (word pc2,LENGTH bignum_sqr_p521_neon_core_mc)]` + equiv_input_states + equiv_output_states + bignum_sqr_p521_core_mc 0 + `MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; X13; + X14; X15; X16; X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9)]` + bignum_sqr_p521_neon_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)]`;; + +let equiv_output_states_TRANS = prove( + `!s s2 s' + z. equiv_output_states (s,s') z /\ equiv_output_states (s',s2) z + ==> equiv_output_states (s,s2) z`, + MESON_TAC[equiv_output_states]);; + +let BIGNUM_SQR_P521_CORE_EQUIV = time prove(equiv_goal, + + REPEAT STRIP_TAC THEN + SUBGOAL_THEN + `?pc3. + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc:int64,LENGTH bignum_sqr_p521_core_mc); + (word pc3:int64,LENGTH bignum_sqr_p521_interm1_core_mc)] /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc3:int64,LENGTH bignum_sqr_p521_interm1_core_mc); + (word pc2:int64,LENGTH bignum_sqr_p521_neon_core_mc)] /\ + // Input buffers and the intermediate program don't alias + ALL (nonoverlapping + (word pc3:int64, LENGTH bignum_sqr_p521_interm1_core_mc)) + [x,8 * 9] /\ + 4 divides val (word pc3:int64)` + MP_TAC THENL [ + FIRST_X_ASSUM MP_TAC THEN + ASM_REWRITE_TAC + [ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_SQR_P521_INTERM1_CORE_EXEC; + fst BIGNUM_SQR_P521_NEON_CORE_EXEC; + GSYM CONJ_ASSOC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN POP_ASSUM_LIST (K ALL_TAC) THEN + FIND_HOLE_TAC; + + ALL_TAC + ] THEN + STRIP_TAC THEN + + ENSURES2_TRANS_TAC BIGNUM_SQR_P521_CORE_EQUIV1 BIGNUM_SQR_P521_CORE_EQUIV2 THEN + + (* break 'ALL nonoverlapping' in assumptions *) + RULE_ASSUM_TAC (REWRITE_RULE[ + ALLPAIRS;ALL; + fst BIGNUM_SQR_P521_CORE_EXEC; + fst BIGNUM_SQR_P521_NEON_CORE_EXEC; + fst BIGNUM_SQR_P521_INTERM1_CORE_EXEC; + NONOVERLAPPING_CLAUSES]) THEN + REPEAT SPLIT_FIRST_CONJ_ASSUM_TAC THEN + + MATCH_MP_TAC ENSURES2_WEAKEN THEN + REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[TAUT `(p /\ q /\ r) /\ p /\ q /\ r' <=> p /\ q /\ r /\ r'`] THEN + EXISTS_TAC + `write (memory :> bytelist + (word pc3,LENGTH bignum_sqr_p521_interm1_core_mc)) + bignum_sqr_p521_interm1_core_mc + (write PC (word pc3) s')` THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_SQR_P521_INTERM1_CORE_EXEC THENL [ + UNDISCH_TAC `equiv_input_states (s,s') x z` THEN + REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_SQR_P521_INTERM1_CORE_EXEC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_SQR_P521_INTERM1_CORE_EXEC); + + UNDISCH_TAC `equiv_input_states (s,s') x z` THEN + REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_SQR_P521_INTERM1_CORE_EXEC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_SQR_P521_INTERM1_CORE_EXEC); + ]; + + REPEAT GEN_TAC THEN STRIP_TAC THEN + ASM_REWRITE_TAC[] THEN ASM_MESON_TAC[equiv_output_states_TRANS]; + + SUBSUMED_MAYCHANGE_TAC + ]);; + + + +(****************************************************************************** + Inducing BIGNUM_SQR_P521_NEON_SUBROUTINE_CORRECT + from BIGNUM_SQR_P521_CORE_CORRECT +******************************************************************************) + +(* Prove BIGNUM_SQR_P384_CORE_CORRECT_N first *) + +let event_n_at_pc_goal = mk_eventually_n_at_pc_statement + `nonoverlapping + (word pc:int64, + LENGTH (APPEND bignum_sqr_p521_core_mc barrier_inst_bytes)) + (z:int64,8 * 9)` + [`z:int64`;`x:int64`] (*pc_mc_ofs*)0 + bignum_sqr_p521_core_mc (*pc_ofs*)0 + `\s0. C_ARGUMENTS [z;x] s0`;; + +let BIGNUM_SQR_P521_EVENTUALLY_N_AT_PC = time prove(event_n_at_pc_goal, + + REWRITE_TAC[LENGTH_APPEND;fst BIGNUM_SQR_P521_CORE_EXEC;BARRIER_INST_BYTES_LENGTH] THEN + REWRITE_TAC[eventually_n_at_pc;ALL;NONOVERLAPPING_CLAUSES;C_ARGUMENTS] THEN + SUBGOAL_THEN `4 divides (LENGTH bignum_sqr_p521_core_mc)` + (fun th -> REWRITE_TAC[MATCH_MP aligned_bytes_loaded_append th; + fst BIGNUM_SQR_P521_CORE_EXEC]) THENL [ + REWRITE_TAC[fst BIGNUM_SQR_P521_CORE_EXEC] THEN CONV_TAC NUM_DIVIDES_CONV; + ALL_TAC] THEN + REPEAT GEN_TAC THEN STRIP_TAC THEN + (* now start..! *) + X_GEN_TAC `s0:armstate` THEN GEN_TAC THEN STRIP_TAC THEN + (* eventually ==> eventually_n *) + PROVE_EVENTUALLY_IMPLIES_EVENTUALLY_N_TAC BIGNUM_SQR_P521_CORE_EXEC);; + + +let BIGNUM_SQR_P521_CORE_CORRECT_N = + prove_correct_n + BIGNUM_SQR_P521_EXEC + BIGNUM_SQR_P521_CORE_EXEC + BIGNUM_SQR_P521_CORE_CORRECT + BIGNUM_SQR_P521_EVENTUALLY_N_AT_PC;; + +(* This theorem is a copy of BIGNUM_SQR_P521_CORE_CORRECT, but with + - 'pc' replaced with 'pc2' + - bignum_sqr_p521_core_mc with bignum_sqr_p521_neon_core_mc + - The MAYCHANGE set replaced with the Neon version's one *) + +let BIGNUM_SQR_P521_NEON_CORE_CORRECT = time prove + (`!z x n pc2. + nonoverlapping (word pc2,LENGTH bignum_sqr_p521_neon_core_mc) (z,8 * 9) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc2) bignum_sqr_p521_neon_core_mc /\ + read PC s = word(pc2) /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,9) s = n) + (\s. read PC s = word (pc2 + LENGTH bignum_sqr_p521_neon_core_mc) /\ + (n < p_521 + ==> bignum_from_memory (z,9) s = n EXP 2 MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)])`, + + REPEAT GEN_TAC THEN + (* Prepare pc for the original program. This is going to be used + for preparing an initial state by 'overwriting' bignum_montsqr_p384_mc + at pc. *) + SUBGOAL_THEN + `?pc. + nonoverlapping (word pc, + LENGTH (APPEND bignum_sqr_p521_core_mc barrier_inst_bytes)) (z:int64,8 * 9) /\ + nonoverlapping (word pc, + LENGTH (APPEND bignum_sqr_p521_core_mc barrier_inst_bytes)) (x:int64,8 * 9) /\ + 4 divides val (word pc:int64)` MP_TAC THENL [ + REWRITE_TAC[fst BIGNUM_SQR_P521_CORE_EXEC;NONOVERLAPPING_CLAUSES;ALL; + LENGTH_APPEND;BARRIER_INST_BYTES_LENGTH] THEN + FIND_HOLE_TAC; + + (** SUBGOAL 2 **) + ALL_TAC + ] THEN + + REPEAT_N 2 STRIP_TAC THEN + + VCGEN_EQUIV_TAC BIGNUM_SQR_P521_CORE_EQUIV BIGNUM_SQR_P521_CORE_CORRECT_N + [fst BIGNUM_SQR_P521_CORE_EXEC; + fst BIGNUM_SQR_P521_NEON_CORE_EXEC] THEN + + (* unfold definitions that may block tactics *) + RULE_ASSUM_TAC (REWRITE_RULE + [NONOVERLAPPING_CLAUSES;fst BIGNUM_SQR_P521_EXEC; + fst BIGNUM_SQR_P521_NEON_EXEC]) THEN + REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Precond **) + X_GEN_TAC `s2:armstate` THEN REPEAT STRIP_TAC THEN + SUBGOAL_THEN `4 divides val (word pc2:int64)` ASSUME_TAC THENL + [ FIRST_ASSUM (fun th -> + MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] + THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN + EXISTS_TAC + `write (memory :> bytelist + (word pc,LENGTH (APPEND bignum_sqr_p521_core_mc barrier_inst_bytes))) + (APPEND bignum_sqr_p521_core_mc barrier_inst_bytes) + (write PC (word pc) s2)` THEN + (* Expand variables appearing in the equiv relation *) + REPEAT CONJ_TAC THEN + TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_SQR_P521_CORE_EXEC) THEN + (* Now has only one subgoal: the equivalence! *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + HINT_EXISTS_REFL_TAC THEN PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_SQR_P521_CORE_EXEC; + + (** SUBGOAL 2. Postcond **) + MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_SQR_P521_NEON_CORE_EXEC]; + + (** SUBGOAL 3. Frame **) + MESON_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] + ]);; + +let BIGNUM_SQR_P521_NEON_CORRECT = time prove + (`!z x n pc. + nonoverlapping (word pc,LENGTH bignum_sqr_p521_neon_mc) (z,8 * 9) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_sqr_p521_neon_mc /\ + read PC s = word(pc + 12) /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,9) s = n) + (\s. read PC s = word (pc + 12 + LENGTH bignum_sqr_p521_neon_core_mc) /\ + (n < p_521 + ==> bignum_from_memory (z,9) s = n EXP 2 MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)])`, + + ARM_SUB_LIST_OF_MC_TAC + BIGNUM_SQR_P521_NEON_CORE_CORRECT + bignum_sqr_p521_neon_core_mc_def + [fst BIGNUM_SQR_P521_NEON_CORE_EXEC; + fst BIGNUM_SQR_P521_NEON_EXEC]);; + +let BIGNUM_SQR_P521_NEON_SUBROUTINE_CORRECT = time prove + (`!z x n pc stackpointer returnaddress. + aligned 16 stackpointer /\ + nonoverlapping (z,8 * 9) (word_sub stackpointer (word 48),48) /\ + ALLPAIRS nonoverlapping + [(z,8 * 9); (word_sub stackpointer (word 48),48)] + [(word pc,LENGTH bignum_sqr_p521_neon_mc); (x,8 * 9)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_sqr_p521_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,9) s = n) + (\s. read PC s = returnaddress /\ + (n < p_521 + ==> bignum_from_memory (z,9) s = n EXP 2 MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(z,8 * 9); + memory :> bytes(word_sub stackpointer (word 48),48)])`, + let th = CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV) + (REWRITE_RULE [fst BIGNUM_SQR_P521_NEON_CORE_EXEC; + fst BIGNUM_SQR_P521_NEON_EXEC] + BIGNUM_SQR_P521_NEON_CORRECT) in + REWRITE_TAC[fst BIGNUM_SQR_P521_NEON_EXEC] THEN + ARM_ADD_RETURN_STACK_TAC + BIGNUM_SQR_P521_NEON_EXEC th + `[X19;X20;X21;X22;X23;X24]` 48);; diff --git a/arm/proofs/specifications.txt b/arm/proofs/specifications.txt index ebf5c1e9..e155fe3e 100644 --- a/arm/proofs/specifications.txt +++ b/arm/proofs/specifications.txt @@ -178,6 +178,7 @@ BIGNUM_MUL_P25519_SUBROUTINE_CORRECT BIGNUM_MUL_P256K1_ALT_SUBROUTINE_CORRECT BIGNUM_MUL_P256K1_SUBROUTINE_CORRECT BIGNUM_MUL_P521_ALT_SUBROUTINE_CORRECT +BIGNUM_MUL_P521_NEON_SUBROUTINE_CORRECT BIGNUM_MUL_P521_SUBROUTINE_CORRECT BIGNUM_MUL_SUBROUTINE_CORRECT BIGNUM_MUX16_SUBROUTINE_CORRECT @@ -224,6 +225,7 @@ BIGNUM_SQR_P25519_SUBROUTINE_CORRECT BIGNUM_SQR_P256K1_ALT_SUBROUTINE_CORRECT BIGNUM_SQR_P256K1_SUBROUTINE_CORRECT BIGNUM_SQR_P521_ALT_SUBROUTINE_CORRECT +BIGNUM_SQR_P521_NEON_SUBROUTINE_CORRECT BIGNUM_SQR_P521_SUBROUTINE_CORRECT BIGNUM_SQR_SUBROUTINE_CORRECT BIGNUM_SUB_P25519_SUBROUTINE_CORRECT diff --git a/benchmarks/benchmark.c b/benchmarks/benchmark.c index 610f6778..2a73e0f7 100644 --- a/benchmarks/benchmark.c +++ b/benchmarks/benchmark.c @@ -803,7 +803,9 @@ void call_bignum_montsqr_p256_neon(void) {} void call_bignum_montsqr_p384_neon(void) {} void call_bignum_montsqr_p521_neon(void) {} void call_bignum_mul_8_16_neon(void) {} +void call_bignum_mul_p521_neon(void) {} void call_bignum_sqr_8_16_neon(void) {} +void call_bignum_sqr_p521_neon(void) {} #else @@ -828,7 +830,9 @@ void call_bignum_montsqr_p256_neon(void) repeat(bignum_montsqr_p256_neon(b0,b1)) void call_bignum_montsqr_p384_neon(void) repeat(bignum_montsqr_p384_neon(b0,b1)) void call_bignum_montsqr_p521_neon(void) repeat(bignum_montsqr_p521_neon(b0,b1)) void call_bignum_mul_8_16_neon(void) repeat(bignum_mul_8_16_neon(b0,b1,b2)) +void call_bignum_mul_p521_neon(void) repeat(bignum_mul_p521_neon(b0,b1,b2)) void call_bignum_sqr_8_16_neon(void) repeat(bignum_sqr_8_16_neon(b0,b1)) +void call_bignum_sqr_p521_neon(void) repeat(bignum_sqr_p521_neon(b0,b1)) #endif @@ -1105,6 +1109,7 @@ int main(int argc, char *argv[]) timingtest(all,"bignum_mul_p256k1_alt",call_bignum_mul_p256k1_alt); timingtest(bmi,"bignum_mul_p521",call_bignum_mul_p521); timingtest(all,"bignum_mul_p521_alt",call_bignum_mul_p521_alt); + timingtest(arm,"bignum_mul_p521_neon", call_bignum_mul_p521_neon); timingtest(all,"bignum_muladd10 (32 -> 32)",call_bignum_muladd10__32); timingtest(all,"bignum_mux16 (4 -> 4)",call_bignum_mux16__4); timingtest(all,"bignum_mux16 (6 -> 6)",call_bignum_mux16__6); @@ -1155,6 +1160,7 @@ int main(int argc, char *argv[]) timingtest(all,"bignum_sqr_p256k1_alt",call_bignum_sqr_p256k1_alt); timingtest(bmi,"bignum_sqr_p521",call_bignum_sqr_p521); timingtest(all,"bignum_sqr_p521_alt",call_bignum_sqr_p521_alt); + timingtest(arm,"bignum_sqr_p521_neon", call_bignum_sqr_p521_neon); timingtest(bmi,"bignum_sqrt_p25519",call_bignum_sqrt_p25519); timingtest(all,"bignum_sqrt_p25519_alt",call_bignum_sqrt_p25519_alt); timingtest(all,"bignum_sub (4x4->4)",call_bignum_sub__4_4); diff --git a/include/s2n-bignum.h b/include/s2n-bignum.h index cfe65827..e8c0b5e1 100644 --- a/include/s2n-bignum.h +++ b/include/s2n-bignum.h @@ -643,6 +643,7 @@ extern void bignum_mul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S // Inputs x[9], y[9]; output z[9] extern void bignum_mul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9], uint64_t y[S2N_BIGNUM_STATIC 9]); extern void bignum_mul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9], uint64_t y[S2N_BIGNUM_STATIC 9]); +extern void bignum_mul_p521_neon(uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9], uint64_t y[S2N_BIGNUM_STATIC 9]); // Multiply bignum by 10 and add word: z := 10 * z + d // Inputs z[k], d; outputs function return (carry) and z[k] @@ -802,6 +803,7 @@ extern void bignum_sqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S // Input x[9]; output z[9] extern void bignum_sqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9]); extern void bignum_sqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_sqr_p521_neon (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9]); // Square root modulo p_25519 // Input x[4]; output function return (Legendre symbol) and z[4] diff --git a/tests/test.c b/tests/test.c index 08c79a90..f67e38bd 100644 --- a/tests/test.c +++ b/tests/test.c @@ -6143,7 +6143,7 @@ int test_bignum_montmul_p521_specific(const char *name, void (*f)(uint64_t *z, uint64_t *x, uint64_t *y)) { uint64_t t; - printf("Testing bignum_montmul_p521 with %d cases\n",tests); + printf("Testing %s with %d cases\n",name,tests); int c; for (t = 0; t < tests; ++t) @@ -6858,15 +6858,16 @@ int test_bignum_mul_p256k1_alt(void) return 0; } -int test_bignum_mul_p521(void) +int test_bignum_mul_p521_specific(const char *name, + void (*f)(uint64_t *z, uint64_t *x, uint64_t *y)) { uint64_t i, k; - printf("Testing bignum_mul_p521 with %d cases\n",tests); + printf("Testing %s with %d cases\n",name,tests); uint64_t c; for (i = 0; i < tests; ++i) { k = 9; random_bignum(k,b2); reference_mod(k,b0,b2,p_521); random_bignum(k,b2); reference_mod(k,b1,b2,p_521); - bignum_mul_p521(b2,b0,b1); + f(b2,b0,b1); reference_mul(2*k,b4,k,b0,k,b1); reference_copy(2*k,b3,k,p_521); reference_mod(2*k,b5,b4,b3); @@ -6892,38 +6893,21 @@ int test_bignum_mul_p521(void) return 0; } +int test_bignum_mul_p521(void) +{ return test_bignum_mul_p521_specific("bignum_mul_p521", bignum_mul_p521); +} + int test_bignum_mul_p521_alt(void) -{ uint64_t i, k; - printf("Testing bignum_mul_p521_alt with %d cases\n",tests); - uint64_t c; - for (i = 0; i < tests; ++i) - { k = 9; - random_bignum(k,b2); reference_mod(k,b0,b2,p_521); - random_bignum(k,b2); reference_mod(k,b1,b2,p_521); - bignum_mul_p521_alt(b2,b0,b1); - reference_mul(2*k,b4,k,b0,k,b1); - reference_copy(2*k,b3,k,p_521); - reference_mod(2*k,b5,b4,b3); - reference_copy(k,b3,2*k,b5); - c = reference_compare(k,b3,k,b2); +{ return test_bignum_mul_p521_specific("bignum_mul_p521_alt", bignum_mul_p521_alt); +} - if (c != 0) - { printf("### Disparity: [size %4"PRIu64"] " - "...0x%016"PRIx64" * ...0x%016"PRIx64" mod ....0x%016"PRIx64" = " - "...0x%016"PRIx64" not ...0x%016"PRIx64"\n", - k,b0[0],b1[0],p_521[0],b2[0],b3[0]); - return 1; - } - else if (VERBOSE) - { if (k == 0) printf("OK: [size %4"PRIu64"]\n",k); - else printf("OK: [size %4"PRIu64"] " - "...0x%016"PRIx64" * ...0x%016"PRIx64" mod ....0x%016"PRIx64" = " - "...0x%016"PRIx64"\n", - k,b0[0],b1[0],p_521[0],b2[0]); - } - } - printf("All OK\n"); - return 0; +int test_bignum_mul_p521_neon(void) { +#ifdef __x86_64__ + // Do not call the neon function to avoid a linking failure error. + return 1; +#else + return test_bignum_mul_p521_specific("bignum_mul_p521_neon", bignum_mul_p521_neon); +#endif } int test_bignum_muladd10(void) @@ -8180,14 +8164,14 @@ int test_bignum_sqr_p256k1_alt(void) return 0; } -int test_bignum_sqr_p521(void) +int test_bignum_sqr_p521_specific(const char *name, void (*f)(uint64_t*, uint64_t*)) { uint64_t i, k; - printf("Testing bignum_sqr_p521 with %d cases\n",tests); + printf("Testing %s with %d cases\n",name,tests); uint64_t c; for (i = 0; i < tests; ++i) { k = 9; random_bignum(k,b2); reference_mod(k,b0,b2,p_521); - bignum_sqr_p521(b2,b0); + f(b2,b0); reference_mul(2*k,b4,k,b0,k,b0); reference_copy(2*k,b3,k,p_521); reference_mod(2*k,b5,b4,b3); @@ -8213,37 +8197,21 @@ int test_bignum_sqr_p521(void) return 0; } +int test_bignum_sqr_p521(void) +{ return test_bignum_sqr_p521_specific("bignum_sqr_p521", bignum_sqr_p521) ; +} + int test_bignum_sqr_p521_alt(void) -{ uint64_t i, k; - printf("Testing bignum_sqr_p521_alt with %d cases\n",tests); - uint64_t c; - for (i = 0; i < tests; ++i) - { k = 9; - random_bignum(k,b2); reference_mod(k,b0,b2,p_521); - bignum_sqr_p521_alt(b2,b0); - reference_mul(2*k,b4,k,b0,k,b0); - reference_copy(2*k,b3,k,p_521); - reference_mod(2*k,b5,b4,b3); - reference_copy(k,b3,2*k,b5); +{ return test_bignum_sqr_p521_specific("bignum_sqr_p521_alt", bignum_sqr_p521_alt); +} - c = reference_compare(k,b3,k,b2); - if (c != 0) - { printf("### Disparity: [size %4"PRIu64"] " - "...0x%016"PRIx64" ^ 2 mod ....0x%016"PRIx64" = " - "...0x%016"PRIx64" not ...0x%016"PRIx64"\n", - k,b0[0],p_521[0],b2[0],b3[0]); - return 1; - } - else if (VERBOSE) - { if (k == 0) printf("OK: [size %4"PRIu64"]\n",k); - else printf("OK: [size %4"PRIu64"] " - "...0x%016"PRIx64" ^ 2 mod ....0x%016"PRIx64" = " - "...0x%016"PRIx64"\n", - k,b0[0],p_521[0],b2[0]); - } - } - printf("All OK\n"); - return 0; +int test_bignum_sqr_p521_neon(void) { +#ifdef __x86_64__ + // Do not call the neon function to avoid a linking failure error. + return 1; +#else + return test_bignum_sqr_p521_specific("bignum_sqr_p521_neon", bignum_sqr_p521_neon); +#endif } int test_bignum_sqrt_p25519(void) @@ -12549,7 +12517,9 @@ int main(int argc, char *argv[]) functionaltest(all,"bignum_montsqr_p384_neon", test_bignum_montsqr_p384_neon); functionaltest(all,"bignum_montsqr_p521_neon", test_bignum_montsqr_p521_neon); functionaltest(all,"bignum_mul_8_16_neon",test_bignum_mul_8_16_neon); + functionaltest(all,"bignum_mul_p521_neon",test_bignum_mul_p521_neon); functionaltest(all,"bignum_sqr_8_16_neon",test_bignum_sqr_8_16_neon); + functionaltest(all,"bignum_sqr_p521_neon", test_bignum_sqr_p521_neon); } if (extrastrigger) function_to_test = "_";