diff --git a/arm/Makefile b/arm/Makefile index 1264f450..47c2c430 100644 --- a/arm/Makefile +++ b/arm/Makefile @@ -117,22 +117,29 @@ BIGNUM_OBJ = curve25519/bignum_add_p25519.o \ curve25519/bignum_sqr_p25519_alt.o \ curve25519/bignum_sub_p25519.o \ fastmul/bignum_emontredc_8n.o \ + fastmul/bignum_emontredc_8n_neon.o \ fastmul/bignum_kmul_16_32.o \ + fastmul/bignum_kmul_16_32_neon.o \ fastmul/bignum_kmul_32_64.o \ + fastmul/bignum_kmul_32_64_neon.o \ fastmul/bignum_ksqr_16_32.o \ + fastmul/bignum_ksqr_16_32_neon.o \ fastmul/bignum_ksqr_32_64.o \ + fastmul/bignum_ksqr_32_64_neon.o \ fastmul/bignum_mul_4_8.o \ fastmul/bignum_mul_4_8_alt.o \ fastmul/bignum_mul_6_12.o \ fastmul/bignum_mul_6_12_alt.o \ fastmul/bignum_mul_8_16.o \ fastmul/bignum_mul_8_16_alt.o \ + fastmul/bignum_mul_8_16_neon.o \ fastmul/bignum_sqr_4_8.o \ fastmul/bignum_sqr_4_8_alt.o \ fastmul/bignum_sqr_6_12.o \ fastmul/bignum_sqr_6_12_alt.o \ fastmul/bignum_sqr_8_16.o \ fastmul/bignum_sqr_8_16_alt.o \ + fastmul/bignum_sqr_8_16_neon.o \ generic/bignum_add.o \ generic/bignum_amontifier.o \ generic/bignum_amontmul.o \ diff --git a/arm/fastmul/Makefile b/arm/fastmul/Makefile index e486ecb6..22754601 100644 --- a/arm/fastmul/Makefile +++ b/arm/fastmul/Makefile @@ -22,22 +22,29 @@ endif # List of object files OBJ = bignum_emontredc_8n.o \ + bignum_emontredc_8n_neon.o \ bignum_kmul_16_32.o \ + bignum_kmul_16_32_neon.o \ bignum_kmul_32_64.o \ + bignum_kmul_32_64_neon.o \ bignum_ksqr_16_32.o \ + bignum_ksqr_16_32_neon.o \ bignum_ksqr_32_64.o \ + bignum_ksqr_32_64_neon.o \ bignum_mul_4_8.o \ bignum_mul_4_8_alt.o \ bignum_mul_6_12.o \ bignum_mul_6_12_alt.o \ bignum_mul_8_16.o \ bignum_mul_8_16_alt.o \ + bignum_mul_8_16_neon.o \ bignum_sqr_4_8.o \ bignum_sqr_4_8_alt.o \ bignum_sqr_6_12.o \ bignum_sqr_6_12_alt.o \ bignum_sqr_8_16.o \ - bignum_sqr_8_16_alt.o + bignum_sqr_8_16_alt.o \ + bignum_sqr_8_16_neon.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/arm/fastmul/bignum_emontredc_8n_neon.S 
b/arm/fastmul/bignum_emontredc_8n_neon.S new file mode 100644 index 00000000..1fc7af39 --- /dev/null +++ b/arm/fastmul/bignum_emontredc_8n_neon.S @@ -0,0 +1,1093 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Extend Montgomery reduce in 8-digit blocks, results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// +// extern uint64_t bignum_emontredc_8n_neon +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// +// Functionally equivalent to bignum_emontredc (see that file for more detail). +// But in general assumes that the input k is a multiple of 8. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_neon) + .text + .balign 4 + + +S2N_BN_SYMBOL(bignum_emontredc_8n_neon): + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, #32 + lsr x0, x0, #2 + mov x26, x0 + subs x12, x0, #1 + bcc bignum_emontredc_8n_neon_end + + stp x3, xzr, [sp] + stp x26, xzr, [sp, #16] + mov x28, xzr + lsl x0, x12, #5 + +bignum_emontredc_8n_neon_outerloop: + ldp x3, xzr, [sp] + ldp x17, x19, [x1] + ldp x20, x21, [x1, #16] + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + ldr q21, [x2, #16] + + // Montgomery step 0 + + mul x4, x17, x3 +// NEON: Calculate x4 * (x10, x11) that does two 64x64->128-bit multiplications. 
+dup v0.2d, x4 +uzp2 v3.4s, v21.4s, v0.4s +xtn v4.2s, v0.2d +xtn v5.2s, v21.2d + mul x12, x4, x8 + adds x17, x17, x12 + umulh x12, x4, x8 + mul x13, x4, x9 +rev64 v1.4s, v21.4s +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s +uzp2 v16.4s, v0.4s, v0.4s +mul v0.4s, v1.4s, v0.4s +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull v1.2d, v16.2s, v3.2s +uaddlp v0.2d, v0.4s +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x14, v0.d[0] +mov x15, v0.d[1] + adcs x19, x19, x13 + umulh x13, x4, x9 + adcs x20, x20, x14 +usra v1.2d, v2.2d, #32 +mov x14, v1.d[0] + adcs x21, x21, x15 +mov x15, v1.d[1] + adc x22, xzr, xzr + adds x19, x19, x12 + mul x5, x19, x3 // hoisted from step 1 + adcs x20, x20, x13 + adcs x21, x21, x14 + adc x22, x22, x15 + + // Montgomery step 1 + +// NEON: Calculate x5 * (x10, x11) that does two 64x64->128-bit multiplications. +dup v0.2d, x5 +uzp2 v3.4s, v21.4s, v0.4s +xtn v4.2s, v0.2d +xtn v5.2s, v21.2d + + mul x12, x5, x8 + adds x19, x19, x12 + umulh x12, x5, x8 + mul x13, x5, x9 + +rev64 v1.4s, v21.4s +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s +uzp2 v16.4s, v0.4s, v0.4s +mul v0.4s, v1.4s, v0.4s +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull v1.2d, v16.2s, v3.2s +uaddlp v0.2d, v0.4s +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x14, v0.d[0] +mov x15, v0.d[1] + adcs x20, x20, x13 + umulh x13, x5, x9 + adcs x21, x21, x14 +usra v1.2d, v2.2d, #32 +mov x14, v1.d[0] + adcs x22, x22, x15 +mov x15, v1.d[1] + adc x23, xzr, xzr + adds x20, x20, x12 + mul x6, x20, x3 // hoisted from step 2 + +// NEON: For montgomery step 2, +// calculate x6 * (x10, x11) that does two 64x64->128-bit multiplications. 
+dup v0.2d, x6 +#define in1 v21 +#define in2 v0 +#define out_lo v0 +#define out_hi v1 +uzp2 v3.4s, in2.4s, in1.4s +xtn v4.2s, in1.2d +xtn v5.2s, in2.2d + + adcs x21, x21, x13 + adcs x22, x22, x14 + adc x23, x23, x15 + + stp x4, x5, [x1] + +// hoisted from maddloop_neon_firstitr +ldr q20, [x1] +// q21 will be loaded later. + +ldr q22, [x2, #32] +ldr q23, [x2, #48] + + // Montgomery step 2 + +rev64 v1.4s, in2.4s +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s +uzp2 v16.4s, in1.4s, in1.4s + + mul x12, x6, x8 + adds x20, x20, x12 + +mul v0.4s, v1.4s, in1.4s +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull out_hi.2d, v16.2s, v3.2s + + umulh x12, x6, x8 + mul x13, x6, x9 + +uaddlp v0.2d, v0.4s +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s +shl out_lo.2d, v0.2d, #32 + + adcs x21, x21, x13 + umulh x13, x6, x9 + +usra out_hi.2d, v7.2d, #32 +umlal out_lo.2d, v4.2s, v5.2s +mov x14, out_lo.d[0] +mov x15, out_lo.d[1] + +usra out_hi.2d, v2.2d, #32 +#undef in1 +#undef in2 +#undef out_lo +#undef out_hi + + adcs x22, x22, x14 + adcs x23, x23, x15 + +mov x14, v1.d[0] +mov x15, v1.d[1] + + adc x24, xzr, xzr + adds x21, x21, x12 + mul x7, x21, x3 + adcs x22, x22, x13 + adcs x23, x23, x14 + adc x24, x24, x15 + + stp x6, x7, [x1, #16] + +// hoisted from maddloop_neon_firstitr +ldr q21, [x1, #16] + +// pre-calculate 2mul+2umulhs in maddloop_neon_firstitr +// v25++v24 = hi and lo of (x4 * x8, x5 * x9) +#define in1 v20 +#define in2 v22 +#define out_lo v24 +#define out_hi v25 +uzp2 v3.4s, in2.4s, in1.4s +xtn v4.2s, in1.2d + + // Montgomery step 3 + + mul x12, x7, x8 + mul x13, x7, x9 + +xtn v5.2s, in2.2d +rev64 v1.4s, in2.4s +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s + + mul x14, x7, x10 + mul x15, x7, x11 + +uzp2 v16.4s, in1.4s, in1.4s +mul v0.4s, v1.4s, in1.4s +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull out_hi.2d, v16.2s, v3.2s +uaddlp v0.2d, v0.4s +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s + + adds x21, x21, x12 + 
umulh x12, x7, x8 + adcs x22, x22, x13 + umulh x13, x7, x9 + +shl out_lo.2d, v0.2d, #32 +usra out_hi.2d, v7.2d, #32 +umlal out_lo.2d, v4.2s, v5.2s +usra out_hi.2d, v2.2d, #32 +#undef in1 +#undef in2 +#undef out_lo +#undef out_hi + + adcs x23, x23, x14 + umulh x14, x7, x10 + adcs x24, x24, x15 + umulh x15, x7, x11 + +// v27++v26 = hi and lo of (x6 * x10, x7 * x11) +#define in1 v21 +#define in2 v23 +#define out_lo v26 +#define out_hi v27 +uzp2 v3.4s, in2.4s, in1.4s +xtn v4.2s, in1.2d +xtn v5.2s, in2.2d +rev64 v1.4s, in2.4s + +// hoisted from maddloop_neon_firstitr and maddloop_x0one + ldp x8, x9, [x2, #32] + ldp x10, x11, [x2, #48] + +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s +uzp2 v16.4s, in1.4s, in1.4s +mul v0.4s, v1.4s, in1.4s + + adc x25, xzr, xzr + adds x12, x22, x12 + adcs x13, x23, x13 + adcs x14, x24, x14 + adc x15, x25, x15 + +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull out_hi.2d, v16.2s, v3.2s +uaddlp v0.2d, v0.4s +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s +shl out_lo.2d, v0.2d, #32 +usra out_hi.2d, v7.2d, #32 +umlal out_lo.2d, v4.2s, v5.2s +usra out_hi.2d, v2.2d, #32 +#undef in1 +#undef in2 +#undef out_lo +#undef out_hi + + cbz x0, bignum_emontredc_8n_neon_madddone + mov x27, x0 + cmp x0, #32 + bne bignum_emontredc_8n_neon_maddloop_neon_firstitr + +bignum_emontredc_8n_neon_maddloop_x0one: + add x2, x2, #0x20 + add x1, x1, #0x20 + mul x17, x4, x8 + mul x22, x5, x9 + mul x23, x6, x10 + mul x24, x7, x11 + umulh x16, x4, x8 + adds x22, x22, x16 + umulh x16, x5, x9 + adcs x23, x23, x16 + umulh x16, x6, x10 + adcs x24, x24, x16 + umulh x16, x7, x11 + adc x25, x16, xzr + ldp x20, x21, [x1] + adds x12, x12, x20 + adcs x13, x13, x21 + ldp x20, x21, [x1, #16] + adcs x14, x14, x20 + adcs x15, x15, x21 + adc x16, xzr, xzr + adds x19, x22, x17 + adcs x22, x23, x22 + adcs x23, x24, x23 + adcs x24, x25, x24 + adc x25, xzr, x25 + adds x20, x22, x17 + adcs x21, x23, x19 + adcs x22, x24, x22 + adcs x23, x25, x23 + adcs x24, xzr, x24 
+ adc x25, xzr, x25 + adds x17, x17, x12 + adcs x19, x19, x13 + adcs x20, x20, x14 + adcs x21, x21, x15 + adcs x22, x22, x16 + adcs x23, x23, xzr + adcs x24, x24, xzr + adc x25, x25, xzr + subs x15, x6, x7 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x10 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x23, x23, x14 + eor x13, x13, x12 + adcs x24, x24, x13 + adc x25, x25, x12 + subs x15, x4, x5 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x9, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x19, x19, x14 + eor x13, x13, x12 + adcs x20, x20, x13 + adcs x21, x21, x12 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x5, x7 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x9 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x22, x22, x14 + eor x13, x13, x12 + adcs x23, x23, x13 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x4, x6 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x10, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x20, x20, x14 + eor x13, x13, x12 + adcs x21, x21, x13 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x4, x7 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv 
x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x21, x21, x14 + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x5, x6 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x10, x9 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x21, x21, x14 + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x13, x23, x12 + adcs x14, x24, x12 + adc x15, x25, x12 + mov x12, x22 + stp x17, x19, [x1] + stp x20, x21, [x1, #16] + sub x27, x27, #0x20 + b bignum_emontredc_8n_neon_madddone + + +bignum_emontredc_8n_neon_maddloop_neon_firstitr: + +mov x16, v25.d[0] //umulh x16,x4,x8 +mov x22, v24.d[1] //mul x22, x5, x9 + +mov x20, v25.d[1] //umulh x20,x5,x9 +mov x23, v26.d[0] //mul x23, x6, x10 + +mov x21, v27.d[0] //umulh x21,x6,x10 +mov x24, v26.d[1] //mul x24, x7, x11 + +mov x3, v27.d[1] //umulh x3,x7,x11 +mov x17, v24.d[0] //mul x17, x4, x8 + + adds x22,x22,x16 + adcs x23,x23,x20 + adcs x24,x24,x21 + adc x25,x3,xzr + +// pre-calculate the multiplications for the next iter. 
+// v25 ++ v24 = hi, lo of (x4 * x8, x5 * x9) +ldr q22, [x2, #64] +ldr q23, [x2, #80] + + add x2, x2, #32 + add x1, x1, #32 + +#define in1 v20 +#define in2 v22 +#define out_lo v24 +#define out_hi v25 +uzp2 v3.4s, in2.4s, in1.4s +xtn v4.2s, in1.2d +xtn v5.2s, in2.2d +rev64 v1.4s, in2.4s + + ldp x20,x21,[x1] + adds x12,x12,x20 + adcs x13,x13,x21 + ldp x20,x21,[x1,#16] + +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s +uzp2 v16.4s, in1.4s, in1.4s +mul v0.4s, v1.4s, in1.4s + + adcs x14,x14,x20 + adcs x15,x15,x21 + adc x16,xzr,xzr + adds x19,x22,x17 + +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull out_hi.2d, v16.2s, v3.2s +uaddlp v0.2d, v0.4s + + adcs x22,x23,x22 + adcs x23,x24,x23 + adcs x24,x25,x24 + adc x25,xzr,x25 + +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s +shl out_lo.2d, v0.2d, #32 +usra out_hi.2d, v7.2d, #32 + + adds x20,x22,x17 + adcs x21,x23,x19 + adcs x22,x24,x22 + adcs x23,x25,x23 + +umlal out_lo.2d, v4.2s, v5.2s +usra out_hi.2d, v2.2d, #32 +#undef in1 +#undef in2 +#undef out_lo +#undef out_hi + + adcs x24,xzr,x24 + adc x25,xzr,x25 + adds x17,x17,x12 + adcs x19,x19,x13 + +#define in1 v21 +#define in2 v23 +#define out_lo v26 +#define out_hi v27 +uzp2 v3.4s, in2.4s, in1.4s +xtn v4.2s, in1.2d +xtn v5.2s, in2.2d +rev64 v1.4s, in2.4s + + adcs x20,x20,x14 + adcs x21,x21,x15 + adcs x22,x22,x16 + adcs x23,x23,xzr + +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s +uzp2 v16.4s, in1.4s, in1.4s +mul v0.4s, v1.4s, in1.4s + + adcs x24,x24,xzr + adc x25,x25,xzr + subs x15,x6,x7 + cneg x15,x15,cc + +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull out_hi.2d, v16.2s, v3.2s +uaddlp v0.2d, v0.4s + + csetm x12,cc + subs x13,x11,x10 + cneg x13,x13,cc + mul x14,x15,x13 + +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s +shl out_lo.2d, v0.2d, #32 +usra out_hi.2d, v7.2d, #32 + + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + +umlal out_lo.2d, v4.2s, v5.2s +usra out_hi.2d, v2.2d, #32 +#undef in1 
+#undef in2 +#undef out_lo +#undef out_hi + + adcs x23,x23,x14 + eor x13,x13,x12 + adcs x24,x24,x13 + adc x25,x25,x12 + subs x15,x4,x5 + cneg x15,x15,cc + csetm x12,cc + subs x13,x9,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x19,x19,x14 + eor x13,x13,x12 + adcs x20,x20,x13 + adcs x21,x21,x12 + adcs x22,x22,x12 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + + stp x17,x19,[x1] + +mov x16, v25.d[0] // hi bits of (x4 * x8) +mov x26, v27.d[0] // hi bits of (x6 * x10) +mov x3, v25.d[1] // hi bits of (x5 * x9) +mov x17, v27.d[1] // hi bits of (x7 * x11) + + subs x15,x5,x7 + cneg x15,x15,cc + csetm x12,cc + subs x13,x11,x9 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x22,x22,x14 + eor x13,x13,x12 + adcs x23,x23,x13 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x4,x6 + cneg x15,x15,cc + csetm x12,cc + subs x13,x10,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x20,x20,x14 + eor x13,x13,x12 + adcs x21,x21,x13 + adcs x22,x22,x12 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x4,x7 + cneg x15,x15,cc + csetm x12,cc + subs x13,x11,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x21,x21,x14 + eor x13,x13,x12 + adcs x22,x22,x13 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x5,x6 + cneg x15,x15,cc + csetm x12,cc + subs x13,x10,x9 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x21,x21,x14 + + stp x20,x21,[x1,#16] +mov x20, v24.d[1] // lo bits of (x5 * x9) +mov x21, v26.d[0] // lo bits of (x6 * x10) + + eor x13,x13,x12 + adcs x22,x22,x13 + adcs x13,x23,x12 + adcs x14,x24,x12 + adc x15,x25,x12 + mov x12,x22 + +mov x24, v26.d[1] // lo bits of (x7 * x11) + + sub x27, x27, #32 
+ cmp x27, #32 + beq bignum_emontredc_8n_neon_maddloop_neon_last + + +bignum_emontredc_8n_neon_maddloop_neon: + ldp x8, x9, [x2, #32] + ldp x10, x11, [x2, #48] + +// pre-calculate the multiplications for the next iter. +// v25 ++ v24 = hi, lo of (x4 * x8, x5 * x9) +ldr q22, [x2, #64] +ldr q23, [x2, #80] + + add x2, x2, #32 + add x1, x1, #32 + + adds x22,x20,x16 + adcs x23,x21,x3 + adcs x24,x24,x26 + adc x25,x17,xzr +mov x17, v24.d[0] // lo bits of (x4 * x8) + +#define in1 v20 +#define in2 v22 +#define out_lo v24 +#define out_hi v25 +uzp2 v3.4s, in2.4s, in1.4s +xtn v4.2s, in1.2d +xtn v5.2s, in2.2d +rev64 v1.4s, in2.4s + + ldp x20,x21,[x1] + adds x12,x12,x20 + adcs x13,x13,x21 + ldp x20,x21,[x1,#16] + +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s +uzp2 v16.4s, in1.4s, in1.4s +mul v0.4s, v1.4s, in1.4s + + adcs x14,x14,x20 + adcs x15,x15,x21 + adc x16,xzr,xzr + adds x19,x22,x17 + +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull out_hi.2d, v16.2s, v3.2s +uaddlp v0.2d, v0.4s + + adcs x22,x23,x22 + adcs x23,x24,x23 + adcs x24,x25,x24 + adc x25,xzr,x25 + +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s +shl out_lo.2d, v0.2d, #32 +usra out_hi.2d, v7.2d, #32 + + adds x20,x22,x17 + adcs x21,x23,x19 + adcs x22,x24,x22 + adcs x23,x25,x23 + +umlal out_lo.2d, v4.2s, v5.2s +usra out_hi.2d, v2.2d, #32 +#undef in1 +#undef in2 +#undef out_lo +#undef out_hi + + adcs x24,xzr,x24 + adc x25,xzr,x25 + adds x17,x17,x12 + adcs x19,x19,x13 + +#define in1 v21 +#define in2 v23 +#define out_lo v26 +#define out_hi v27 +uzp2 v3.4s, in2.4s, in1.4s +xtn v4.2s, in1.2d +xtn v5.2s, in2.2d +rev64 v1.4s, in2.4s + + adcs x20,x20,x14 + adcs x21,x21,x15 + adcs x22,x22,x16 + adcs x23,x23,xzr + +umull v6.2d, v4.2s, v5.2s +umull v7.2d, v4.2s, v3.2s +uzp2 v16.4s, in1.4s, in1.4s +mul v0.4s, v1.4s, in1.4s + + adcs x24,x24,xzr + adc x25,x25,xzr + subs x15,x6,x7 + cneg x15,x15,cc + +movi v2.2d, #0x000000ffffffff +usra v7.2d, v6.2d, #32 +umull out_hi.2d, v16.2s, v3.2s +uaddlp v0.2d, 
v0.4s + + csetm x12,cc + subs x13,x11,x10 + cneg x13,x13,cc + mul x14,x15,x13 + +and v2.16b, v7.16b, v2.16b +umlal v2.2d, v16.2s, v5.2s +shl out_lo.2d, v0.2d, #32 +usra out_hi.2d, v7.2d, #32 + + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + +umlal out_lo.2d, v4.2s, v5.2s +usra out_hi.2d, v2.2d, #32 +#undef in1 +#undef in2 +#undef out_lo +#undef out_hi + + adcs x23,x23,x14 + eor x13,x13,x12 + adcs x24,x24,x13 + adc x25,x25,x12 + subs x15,x4,x5 + cneg x15,x15,cc + csetm x12,cc + subs x13,x9,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x19,x19,x14 + eor x13,x13,x12 + adcs x20,x20,x13 + adcs x21,x21,x12 + adcs x22,x22,x12 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + + stp x17,x19,[x1] + +mov x16, v25.d[0] // hi bits of (x4 * x8) +mov x26, v27.d[0] // hi bits of (x6 * x10) +mov x3, v25.d[1] // hi bits of (x5 * x9) +mov x17, v27.d[1] // hi bits of (x7 * x11) + + subs x15,x5,x7 + cneg x15,x15,cc + csetm x12,cc + subs x13,x11,x9 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x22,x22,x14 + eor x13,x13,x12 + adcs x23,x23,x13 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x4,x6 + cneg x15,x15,cc + csetm x12,cc + subs x13,x10,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x20,x20,x14 + eor x13,x13,x12 + adcs x21,x21,x13 + adcs x22,x22,x12 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x4,x7 + cneg x15,x15,cc + csetm x12,cc + subs x13,x11,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x21,x21,x14 + eor x13,x13,x12 + adcs x22,x22,x13 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x5,x6 + cneg x15,x15,cc + csetm x12,cc + subs x13,x10,x9 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + 
adds xzr,x12,#1 + eor x14,x14,x12 + adcs x21,x21,x14 + + stp x20,x21,[x1,#16] +mov x20, v24.d[1] // lo bits of (x5 * x9) +mov x21, v26.d[0] // lo bits of (x6 * x10) + + eor x13,x13,x12 + adcs x22,x22,x13 + adcs x13,x23,x12 + adcs x14,x24,x12 + adc x15,x25,x12 + mov x12,x22 + +mov x24, v26.d[1] // lo bits of (x7 * x11) + + sub x27, x27, #32 + cmp x27, #32 + bne bignum_emontredc_8n_neon_maddloop_neon + + +bignum_emontredc_8n_neon_maddloop_neon_last: + ldp x8, x9, [x2, #32] + ldp x10, x11, [x2, #48] + + add x2, x2, #32 + add x1, x1, #32 + + adds x22,x20,x16 + adcs x23,x21,x3 + adcs x24,x24,x26 + adc x25,x17,xzr +mov x17, v24.d[0] // lo bits of (x4 * x8) + + ldp x20,x21,[x1] + adds x12,x12,x20 + adcs x13,x13,x21 + ldp x20,x21,[x1,#16] + adcs x14,x14,x20 + adcs x15,x15,x21 + adc x16,xzr,xzr + adds x19,x22,x17 + adcs x22,x23,x22 + adcs x23,x24,x23 + adcs x24,x25,x24 + adc x25,xzr,x25 + adds x20,x22,x17 + adcs x21,x23,x19 + adcs x22,x24,x22 + adcs x23,x25,x23 + adcs x24,xzr,x24 + adc x25,xzr,x25 + adds x17,x17,x12 + adcs x19,x19,x13 + adcs x20,x20,x14 + adcs x21,x21,x15 + adcs x22,x22,x16 + adcs x23,x23,xzr + adcs x24,x24,xzr + adc x25,x25,xzr + subs x15,x6,x7 + cneg x15,x15,cc + csetm x12,cc + subs x13,x11,x10 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x23,x23,x14 + eor x13,x13,x12 + adcs x24,x24,x13 + adc x25,x25,x12 + subs x15,x4,x5 + cneg x15,x15,cc + csetm x12,cc + subs x13,x9,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x19,x19,x14 + eor x13,x13,x12 + adcs x20,x20,x13 + adcs x21,x21,x12 + adcs x22,x22,x12 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x5,x7 + cneg x15,x15,cc + csetm x12,cc + subs x13,x11,x9 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x22,x22,x14 + eor x13,x13,x12 + adcs x23,x23,x13 + adcs x24,x24,x12 + adc 
x25,x25,x12 + subs x15,x4,x6 + cneg x15,x15,cc + csetm x12,cc + subs x13,x10,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x20,x20,x14 + eor x13,x13,x12 + adcs x21,x21,x13 + adcs x22,x22,x12 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x4,x7 + cneg x15,x15,cc + csetm x12,cc + subs x13,x11,x8 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x21,x21,x14 + eor x13,x13,x12 + adcs x22,x22,x13 + adcs x23,x23,x12 + adcs x24,x24,x12 + adc x25,x25,x12 + subs x15,x5,x6 + cneg x15,x15,cc + csetm x12,cc + subs x13,x10,x9 + cneg x13,x13,cc + mul x14,x15,x13 + umulh x13,x15,x13 + cinv x12,x12,cc + adds xzr,x12,#1 + eor x14,x14,x12 + adcs x21,x21,x14 + eor x13,x13,x12 + adcs x22,x22,x13 + adcs x13,x23,x12 + adcs x14,x24,x12 + adc x15,x25,x12 + mov x12,x22 + stp x17,x19,[x1] + stp x20,x21,[x1,#16] + subs x27, x27, #64 + +bignum_emontredc_8n_neon_madddone: + ldp x17, x19, [x1, #32] + ldp x20, x21, [x1, #48] + ldp x26, xzr, [sp, #16] + adds xzr, x28, x28 + adcs x17, x17, x12 + adcs x19, x19, x13 + adcs x20, x20, x14 + adcs x21, x21, x15 + csetm x28, cs + stp x17, x19, [x1, #32] + stp x20, x21, [x1, #48] + sub x1, x1, x0 + sub x2, x2, x0 + add x1, x1, #32 + subs x26, x26, #1 + stp x26, xzr, [sp, #16] + bne bignum_emontredc_8n_neon_outerloop + neg x0, x28 + +bignum_emontredc_8n_neon_end: + add sp, sp, #32 + + ldp x27, x28, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + diff --git a/arm/fastmul/bignum_kmul_16_32_neon.S b/arm/fastmul/bignum_kmul_16_32_neon.S new file mode 100644 index 00000000..a3cb89bd --- /dev/null +++ b/arm/fastmul/bignum_kmul_16_32_neon.S @@ -0,0 +1,835 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] +// +// extern void bignum_kmul_16_32_neon +// (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], +// uint64_t t[static 32]) +// +// This is a Karatsuba-style function multiplying half-sized results +// internally and using temporary buffer t for intermediate results. +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32_neon) + .text + .balign 4 + +// Subroutine-safe copies of the output, inputs and temporary buffer pointers + +#define z x25 +#define x x26 +#define y x27 +#define t x28 + +// More variables for sign masks, with s also necessarily subroutine-safe + +#define s x29 +#define m x19 + +S2N_BN_SYMBOL(bignum_kmul_16_32_neon): + +// Save registers, including return address + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! + +// Move parameters into subroutine-safe places + + mov z, x0 + mov x, x1 + mov y, x2 + mov t, x3 + +// Compute L = x_lo * y_lo in bottom half of buffer (size 8 x 8 -> 16) + + bl bignum_kmul_16_32_neon_local_mul_8_16 + +// Compute absolute difference [t..] 
= |x_lo - x_hi| +// and the sign s = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) + + ldp x10, x11, [x] + ldp x8, x9, [x, #64] + subs x10, x10, x8 + sbcs x11, x11, x9 + ldp x12, x13, [x, #16] + ldp x8, x9, [x, #80] + sbcs x12, x12, x8 + sbcs x13, x13, x9 + ldp x14, x15, [x, #32] + ldp x8, x9, [x, #96] + sbcs x14, x14, x8 + sbcs x15, x15, x9 + ldp x16, x17, [x, #48] + ldp x8, x9, [x, #112] + sbcs x16, x16, x8 + sbcs x17, x17, x9 + csetm s, cc + adds xzr, s, s + eor x10, x10, s + adcs x10, x10, xzr + eor x11, x11, s + adcs x11, x11, xzr + stp x10, x11, [t] + eor x12, x12, s + adcs x12, x12, xzr + eor x13, x13, s + adcs x13, x13, xzr + stp x12, x13, [t, #16] + eor x14, x14, s + adcs x14, x14, xzr + eor x15, x15, s + adcs x15, x15, xzr + stp x14, x15, [t, #32] + eor x16, x16, s + adcs x16, x16, xzr + eor x17, x17, s + adcs x17, x17, xzr + stp x16, x17, [t, #48] + +// Compute H = x_hi * y_hi in top half of buffer (size 8 x 8 -> 16) + + add x0, z, #128 + add x1, x, #64 + add x2, y, #64 + bl bignum_kmul_16_32_neon_local_mul_8_16 + +// Compute the other absolute difference [t+8..] 
= |y_hi - y_lo| +// Collect the combined product sign bitmask (all 1s for negative) in s + + ldp x10, x11, [y] + ldp x8, x9, [y, #64] + subs x10, x8, x10 + sbcs x11, x9, x11 + ldp x12, x13, [y, #16] + ldp x8, x9, [y, #80] + sbcs x12, x8, x12 + sbcs x13, x9, x13 + ldp x14, x15, [y, #32] + ldp x8, x9, [y, #96] + sbcs x14, x8, x14 + sbcs x15, x9, x15 + ldp x16, x17, [y, #48] + ldp x8, x9, [y, #112] + sbcs x16, x8, x16 + sbcs x17, x9, x17 + csetm m, cc + adds xzr, m, m + eor x10, x10, m + adcs x10, x10, xzr + eor x11, x11, m + adcs x11, x11, xzr + stp x10, x11, [t, #64] + eor x12, x12, m + adcs x12, x12, xzr + eor x13, x13, m + adcs x13, x13, xzr + stp x12, x13, [t, #80] + eor x14, x14, m + adcs x14, x14, xzr + eor x15, x15, m + adcs x15, x15, xzr + stp x14, x15, [t, #96] + eor x16, x16, m + adcs x16, x16, xzr + eor x17, x17, m + adcs x17, x17, xzr + stp x16, x17, [t, #112] + eor s, s, m + +// Compute H' = H + L_top in place of H (it cannot overflow) +// First add 8-sized block then propagate carry through next 8 + + ldp x10, x11, [z, #128] + ldp x12, x13, [z, #64] + adds x10, x10, x12 + adcs x11, x11, x13 + stp x10, x11, [z, #128] + + ldp x10, x11, [z, #128+16] + ldp x12, x13, [z, #64+16] + adcs x10, x10, x12 + adcs x11, x11, x13 + stp x10, x11, [z, #128+16] + + ldp x10, x11, [z, #128+32] + ldp x12, x13, [z, #64+32] + adcs x10, x10, x12 + adcs x11, x11, x13 + stp x10, x11, [z, #128+32] + + ldp x10, x11, [z, #128+48] + ldp x12, x13, [z, #64+48] + adcs x10, x10, x12 + adcs x11, x11, x13 + stp x10, x11, [z, #128+48] + + ldp x10, x11, [z, #128+64] + adcs x10, x10, xzr + adcs x11, x11, xzr + stp x10, x11, [z, #128+64] + + ldp x10, x11, [z, #128+80] + adcs x10, x10, xzr + adcs x11, x11, xzr + stp x10, x11, [z, #128+80] + + ldp x10, x11, [z, #128+96] + adcs x10, x10, xzr + adcs x11, x11, xzr + stp x10, x11, [z, #128+96] + + ldp x10, x11, [z, #128+112] + adcs x10, x10, xzr + adcs x11, x11, xzr + stp x10, x11, [z, #128+112] + +// Compute M = |x_lo - x_hi| * |y_hi - y_lo| in 
[t+16...], size 16 + + add x0, t, #128 + mov x1, t + add x2, t, #64 + bl bignum_kmul_16_32_neon_local_mul_8_16 + +// Add the interlocking H' and L_bot terms, storing in registers x15..x0 +// Intercept the carry at the 8 + 16 = 24 position and store it in x. +// (Note that we no longer need the input x was pointing at.) + + ldp x0, x1, [z] + ldp x16, x17, [z, #128] + adds x0, x0, x16 + adcs x1, x1, x17 + ldp x2, x3, [z, #16] + ldp x16, x17, [z, #144] + adcs x2, x2, x16 + adcs x3, x3, x17 + ldp x4, x5, [z, #32] + ldp x16, x17, [z, #160] + adcs x4, x4, x16 + adcs x5, x5, x17 + ldp x6, x7, [z, #48] + ldp x16, x17, [z, #176] + adcs x6, x6, x16 + adcs x7, x7, x17 + ldp x8, x9, [z, #128] + ldp x16, x17, [z, #192] + adcs x8, x8, x16 + adcs x9, x9, x17 + ldp x10, x11, [z, #144] + ldp x16, x17, [z, #208] + adcs x10, x10, x16 + adcs x11, x11, x17 + ldp x12, x13, [z, #160] + ldp x16, x17, [z, #224] + adcs x12, x12, x16 + adcs x13, x13, x17 + ldp x14, x15, [z, #176] + ldp x16, x17, [z, #240] + adcs x14, x14, x16 + adcs x15, x15, x17 + + cset x, cs + +// Add the sign-adjusted mid-term cross product M + + cmn s, s + + ldp x16, x17, [t, #128] + eor x16, x16, s + adcs x0, x0, x16 + eor x17, x17, s + adcs x1, x1, x17 + stp x0, x1, [z, #64] + ldp x16, x17, [t, #144] + eor x16, x16, s + adcs x2, x2, x16 + eor x17, x17, s + adcs x3, x3, x17 + stp x2, x3, [z, #80] + ldp x16, x17, [t, #160] + eor x16, x16, s + adcs x4, x4, x16 + eor x17, x17, s + adcs x5, x5, x17 + stp x4, x5, [z, #96] + ldp x16, x17, [t, #176] + eor x16, x16, s + adcs x6, x6, x16 + eor x17, x17, s + adcs x7, x7, x17 + stp x6, x7, [z, #112] + ldp x16, x17, [t, #192] + eor x16, x16, s + adcs x8, x8, x16 + eor x17, x17, s + adcs x9, x9, x17 + stp x8, x9, [z, #128] + ldp x16, x17, [t, #208] + eor x16, x16, s + adcs x10, x10, x16 + eor x17, x17, s + adcs x11, x11, x17 + stp x10, x11, [z, #144] + ldp x16, x17, [t, #224] + eor x16, x16, s + adcs x12, x12, x16 + eor x17, x17, s + adcs x13, x13, x17 + stp x12, x13, [z, #160] + 
ldp x16, x17, [t, #240] + eor x16, x16, s + adcs x14, x14, x16 + eor x17, x17, s + adcs x15, x15, x17 + stp x14, x15, [z, #176] + +// Get the next digits effectively resulting so far starting at 24 + + adcs y, s, x + adc t, s, xzr + +// Now the final 8 digits of padding; the first one is special in using y +// and also in getting the carry chain started + + ldp x10, x11, [z, #192] + adds x10, x10, y + adcs x11, x11, t + stp x10, x11, [z, #192] + ldp x10, x11, [z, #208] + adcs x10, x10, t + adcs x11, x11, t + stp x10, x11, [z, #208] + ldp x10, x11, [z, #224] + adcs x10, x10, t + adcs x11, x11, t + stp x10, x11, [z, #224] + ldp x10, x11, [z, #240] + adcs x10, x10, t + adcs x11, x11, t + stp x10, x11, [z, #240] + +// Restore registers and return + + ldp x29, x30, [sp], #16 + ldp x27, x28, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +// ---------------------------------------------------------------------------- +// Local copy of bignum_mul_8_16_neon without the scratch register save/restore: +// x0 = z[16], x1 = x[8], x2 = y[8]; clobbers x1-x17, x19-x24, v0-v5 and flags +// ---------------------------------------------------------------------------- +bignum_kmul_16_32_neon_local_mul_8_16: + ldp x3, x4, [x1] + ldr q0, [x1] + ldp x7, x8, [x2] + ldr q1, [x2] + ldp x5, x6, [x1, #16] + ldr q2, [x1, #16] + ldp x9, x10, [x2, #16] + ldr q3, [x2, #16] + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + mov x11, v0.d[0] + mov x15, v0.d[1] + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, v1.d[0] + mov x17, v1.d[1] + ldr q0, [x1, #32] + ldr q1, [x2, #32] + ldr q2, [x1, #48] + ldr q3, [x2, #48] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh 
x19, x6, x10 + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 
+ adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x1, #32] + stp x11, x12, [x0] + ldp x7, x8, [x2, #32] + stp x13, x14, [x0, #16] + ldp x5, x6, [x1, #48] + stp x15, x16, [x0, #32] + ldp x9, x10, [x2, #48] + stp x17, x19, [x0, #48] + mov x11, v0.d[0] + mov x15, v0.d[1] + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, v1.d[0] + mov x17, v1.d[1] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + ldp x22, x21, [x0, #32] + adds x11, x11, x22 + adcs x12, x12, x21 + ldp x22, x21, [x0, #48] + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, xzr + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x19, x19, xzr + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, 
x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x22, x21, [x1] + subs x3, x3, x22 + sbcs x4, x4, x21 + ldp x22, x21, [x1, #16] + sbcs x5, x5, x22 + sbcs x6, x6, x21 + csetm x24, cc + stp x11, x12, [x0, #64] + ldp x22, x21, [x2] + subs x7, x22, x7 + sbcs x8, x21, x8 + ldp x22, x21, [x2, #16] + sbcs x9, x22, x9 + sbcs x10, x21, x10 + csetm x1, cc + stp x13, x14, [x0, #80] + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + stp x15, x16, [x0, #96] + eor x7, x7, x1 + subs x7, x7, x1 + eor x8, x8, x1 + sbcs x8, x8, x1 + eor x9, x9, x1 + sbcs x9, x9, x1 + eor x10, x10, x1 + sbc x10, x10, x1 + stp x17, x19, [x0, #112] + eor x1, x1, x24 + mul x11, 
x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + 
adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x0] + ldp x7, x8, [x0, #64] + adds x3, x3, x7 + adcs x4, x4, x8 + ldp x5, x6, [x0, #16] + ldp x9, x10, [x0, #80] + adcs x5, x5, x9 + adcs x6, x6, x10 + ldp x20, x21, [x0, #96] + adcs x7, x7, x20 + adcs x8, x8, x21 + ldp x22, x23, [x0, #112] + adcs x9, x9, x22 + adcs x10, x10, x23 + adcs x24, x1, xzr + adc x2, x1, xzr + cmn x1, #0x1 + eor x11, x11, x1 + adcs x3, x11, x3 + eor x12, x12, x1 + adcs x4, x12, x4 + eor x13, x13, x1 + adcs x5, x13, x5 + eor x14, x14, x1 + adcs x6, x14, x6 + eor x15, x15, x1 + adcs x7, x15, x7 + eor x16, x16, x1 + adcs x8, x16, x8 + eor x17, x17, x1 + adcs x9, x17, x9 + eor x19, x19, x1 + adcs x10, x19, x10 + adcs x20, x20, x24 + adcs x21, x21, x2 + adcs x22, x22, x2 + adc x23, x23, x2 + stp x3, x4, [x0, #32] + stp x5, x6, [x0, #48] + stp x7, x8, [x0, #64] + stp x9, x10, [x0, #80] + stp x20, x21, [x0, #96] + stp x22, x23, [x0, #112] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/arm/fastmul/bignum_kmul_32_64_neon.S b/arm/fastmul/bignum_kmul_32_64_neon.S new file mode 100644 index 00000000..ce17e8fb --- /dev/null +++ b/arm/fastmul/bignum_kmul_32_64_neon.S @@ -0,0 +1,1387 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] +// +// extern void bignum_kmul_32_64_neon +// (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], +// uint64_t t[static 96]) +// +// This is a Karatsuba-style function multiplying half-sized results +// internally and using temporary buffer t for intermediate results. +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64_neon) + .text + .balign 4 + +#define K 16 +#define L 8 // this is (K/2) + +#define z x19 +#define x x20 +#define y x21 +#define t x22 + +#define c x16 + +S2N_BN_SYMBOL(bignum_kmul_32_64_neon): + +// Save extra registers and return address, store parameters safely + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + stp x25, x26, [sp, -16]! + stp x27, x28, [sp, -16]! + stp x29, x30, [sp, -16]! + + mov z, x0 + mov x, x1 + mov y, x2 + mov t, x3 + +// Compute L = x_lo * y_lo in bottom half of z (size 16 x 16 -> 32) + + bl bignum_kmul_32_64_neon_local_kmul_16_32 + +// Compute H = x_hi * y_hi in top half of z (size 16 x 16 -> 32) + + add x0, z, #16*K + add x1, x, #8*K + add x2, y, #8*K + mov x3, t + bl bignum_kmul_32_64_neon_local_kmul_16_32 + +// Compute absolute difference [t..] = |x_hi - x_lo| (equal to |x_lo - x_hi|) +// and the sign x = sgn(x_hi - x_lo) as a bitmask (all 1s when x_hi < x_lo). +// Note that we overwrite the pointer x itself with this sign, +// which is safe since we no longer need it. 
+ + ldp x0, x1, [x, #128] + ldp x16, x17, [x] + subs x0, x0, x16 + sbcs x1, x1, x17 + + ldp x2, x3, [x, #144] + ldp x16, x17, [x, #16] + sbcs x2, x2, x16 + sbcs x3, x3, x17 + + ldp x4, x5, [x, #160] + ldp x16, x17, [x, #32] + sbcs x4, x4, x16 + sbcs x5, x5, x17 + + ldp x6, x7, [x, #176] + ldp x16, x17, [x, #48] + sbcs x6, x6, x16 + sbcs x7, x7, x17 + + ldp x8, x9, [x, #192] + ldp x16, x17, [x, #64] + sbcs x8, x8, x16 + sbcs x9, x9, x17 + + ldp x10, x11, [x, #208] + ldp x16, x17, [x, #80] + sbcs x10, x10, x16 + sbcs x11, x11, x17 + + ldp x12, x13, [x, #224] + ldp x16, x17, [x, #96] + sbcs x12, x12, x16 + sbcs x13, x13, x17 + + ldp x14, x15, [x, #240] + ldp x16, x17, [x, #112] + sbcs x14, x14, x16 + sbcs x15, x15, x17 + + sbc x, xzr, xzr + + adds xzr, x, x + + eor x0, x0, x + adcs x0, x0, xzr + eor x1, x1, x + adcs x1, x1, xzr + stp x0, x1, [t] + + eor x2, x2, x + adcs x2, x2, xzr + eor x3, x3, x + adcs x3, x3, xzr + stp x2, x3, [t, #16] + + eor x4, x4, x + adcs x4, x4, xzr + eor x5, x5, x + adcs x5, x5, xzr + stp x4, x5, [t, #32] + + eor x6, x6, x + adcs x6, x6, xzr + eor x7, x7, x + adcs x7, x7, xzr + stp x6, x7, [t, #48] + + eor x8, x8, x + adcs x8, x8, xzr + eor x9, x9, x + adcs x9, x9, xzr + stp x8, x9, [t, #64] + + eor x10, x10, x + adcs x10, x10, xzr + eor x11, x11, x + adcs x11, x11, xzr + stp x10, x11, [t, #80] + + eor x12, x12, x + adcs x12, x12, xzr + eor x13, x13, x + adcs x13, x13, xzr + stp x12, x13, [t, #96] + + eor x14, x14, x + adcs x14, x14, xzr + eor x15, x15, x + adc x15, x15, xzr + stp x14, x15, [t, #112] + +// Compute the other absolute difference [t+8*K..] = |y_lo - y_hi| +// Then XOR its sign mask with x to get the combined product sign bitmask +// y = sgn((x_lo - x_hi) * (y_hi - y_lo)) (all 1s for negative), overwriting y. 
+ + ldp x0, x1, [y] + ldp x16, x17, [y, #128] + subs x0, x0, x16 + sbcs x1, x1, x17 + + ldp x2, x3, [y, #16] + ldp x16, x17, [y, #144] + sbcs x2, x2, x16 + sbcs x3, x3, x17 + + ldp x4, x5, [y, #32] + ldp x16, x17, [y, #160] + sbcs x4, x4, x16 + sbcs x5, x5, x17 + + ldp x6, x7, [y, #48] + ldp x16, x17, [y, #176] + sbcs x6, x6, x16 + sbcs x7, x7, x17 + + ldp x8, x9, [y, #64] + ldp x16, x17, [y, #192] + sbcs x8, x8, x16 + sbcs x9, x9, x17 + + ldp x10, x11, [y, #80] + ldp x16, x17, [y, #208] + sbcs x10, x10, x16 + sbcs x11, x11, x17 + + ldp x12, x13, [y, #96] + ldp x16, x17, [y, #224] + sbcs x12, x12, x16 + sbcs x13, x13, x17 + + ldp x14, x15, [y, #112] + ldp x16, x17, [y, #240] + sbcs x14, x14, x16 + sbcs x15, x15, x17 + + sbc y, xzr, xzr + + adds xzr, y, y + + eor x0, x0, y + adcs x0, x0, xzr + eor x1, x1, y + adcs x1, x1, xzr + stp x0, x1, [t, #128] + + eor x2, x2, y + adcs x2, x2, xzr + eor x3, x3, y + adcs x3, x3, xzr + stp x2, x3, [t, #128+16] + + eor x4, x4, y + adcs x4, x4, xzr + eor x5, x5, y + adcs x5, x5, xzr + stp x4, x5, [t, #128+32] + + eor x6, x6, y + adcs x6, x6, xzr + eor x7, x7, y + adcs x7, x7, xzr + stp x6, x7, [t, #128+48] + + eor x8, x8, y + adcs x8, x8, xzr + eor x9, x9, y + adcs x9, x9, xzr + stp x8, x9, [t, #128+64] + + eor x10, x10, y + adcs x10, x10, xzr + eor x11, x11, y + adcs x11, x11, xzr + stp x10, x11, [t, #128+80] + + eor x12, x12, y + adcs x12, x12, xzr + eor x13, x13, y + adcs x13, x13, xzr + stp x12, x13, [t, #128+96] + + eor x14, x14, y + adcs x14, x14, xzr + eor x15, x15, y + adc x15, x15, xzr + stp x14, x15, [t, #128+112] + + eor y, y, x + +// Compute H' = H + L_top in place of H (it cannot overflow) + + ldp x0, x1, [z, #16*16] + ldp x2, x3, [z, #16*L] + adds x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*16] + + ldp x0, x1, [z, #16*17] + ldp x2, x3, [z, #16*9] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*17] + + ldp x0, x1, [z, #16*18] + ldp x2, x3, [z, #16*10] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, 
[z, #16*18] + + ldp x0, x1, [z, #16*19] + ldp x2, x3, [z, #16*11] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*19] + + ldp x0, x1, [z, #16*20] + ldp x2, x3, [z, #16*12] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*20] + + ldp x0, x1, [z, #16*21] + ldp x2, x3, [z, #16*13] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*21] + + ldp x0, x1, [z, #16*22] + ldp x2, x3, [z, #16*14] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*22] + + ldp x0, x1, [z, #16*23] + ldp x2, x3, [z, #16*15] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*23] + + ldp x0, x1, [z, #16*24] + adcs x0, x0, xzr + adcs x1, x1, xzr + stp x0, x1, [z, #16*24] + + ldp x0, x1, [z, #16*25] + adcs x0, x0, xzr + adcs x1, x1, xzr + stp x0, x1, [z, #16*25] + + ldp x0, x1, [z, #16*26] + adcs x0, x0, xzr + adcs x1, x1, xzr + stp x0, x1, [z, #16*26] + + ldp x0, x1, [z, #16*27] + adcs x0, x0, xzr + adcs x1, x1, xzr + stp x0, x1, [z, #16*27] + + ldp x0, x1, [z, #16*28] + adcs x0, x0, xzr + adcs x1, x1, xzr + stp x0, x1, [z, #16*28] + + ldp x0, x1, [z, #16*29] + adcs x0, x0, xzr + adcs x1, x1, xzr + stp x0, x1, [z, #16*29] + + ldp x0, x1, [z, #16*30] + adcs x0, x0, xzr + adcs x1, x1, xzr + stp x0, x1, [z, #16*30] + + ldp x0, x1, [z, #16*31] + adcs x0, x0, xzr + adc x1, x1, xzr + stp x0, x1, [z, #16*31] + +// Compute M = |x_lo - x_hi| * |y_hi - y_lo|, size 32, into [t+16*K..] + + add x0, t, #16*K + mov x1, t + add x2, t, #8*K + add x3, t, #32*K + bl bignum_kmul_32_64_neon_local_kmul_16_32 + +// Add the interlocking H' and L_bot terms, writing digits 16..47 of z. +// Intercept the carry at the 3k position and store it in x, which is +// free again since its sign mask has already been folded into y. 
+ + ldp x0, x1, [z, #16*16] + ldp x2, x3, [z] + adds x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*8] + + ldp x0, x1, [z, #16*17] + ldp x2, x3, [z, #16*1] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*9] + + ldp x0, x1, [z, #16*18] + ldp x2, x3, [z, #16*2] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*10] + + ldp x0, x1, [z, #16*19] + ldp x2, x3, [z, #16*3] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*11] + + ldp x0, x1, [z, #16*20] + ldp x2, x3, [z, #16*4] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*12] + + ldp x0, x1, [z, #16*21] + ldp x2, x3, [z, #16*5] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*13] + + ldp x0, x1, [z, #16*22] + ldp x2, x3, [z, #16*6] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*14] + + ldp x0, x1, [z, #16*23] + ldp x2, x3, [z, #16*7] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*15] + + ldp x0, x1, [z, #16*16] + ldp x2, x3, [z, #16*24] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*16] + + ldp x0, x1, [z, #16*17] + ldp x2, x3, [z, #16*25] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*17] + + ldp x0, x1, [z, #16*18] + ldp x2, x3, [z, #16*26] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*18] + + ldp x0, x1, [z, #16*19] + ldp x2, x3, [z, #16*27] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*19] + + ldp x0, x1, [z, #16*20] + ldp x2, x3, [z, #16*28] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*20] + + ldp x0, x1, [z, #16*21] + ldp x2, x3, [z, #16*29] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*21] + + ldp x0, x1, [z, #16*22] + ldp x2, x3, [z, #16*30] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*22] + + ldp x0, x1, [z, #16*23] + ldp x2, x3, [z, #16*31] + adcs x0, x0, x2 + adcs x1, x1, x3 + stp x0, x1, [z, #16*23] + + cset x, cs + +// Add the sign-adjusted mid-term cross product M (negated when y is all 1s) + + cmn y, y + + ldp x0, x1, [z, #128] + ldp x2, x3, [t, #128+128] + eor x2, x2, y + adcs 
x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #128] + + ldp x0, x1, [z, #144] + ldp x2, x3, [t, #128+144] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #144] + + ldp x0, x1, [z, #160] + ldp x2, x3, [t, #128+160] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #160] + + ldp x0, x1, [z, #176] + ldp x2, x3, [t, #128+176] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #176] + + ldp x0, x1, [z, #192] + ldp x2, x3, [t, #128+192] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #192] + + ldp x0, x1, [z, #208] + ldp x2, x3, [t, #128+208] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #208] + + ldp x0, x1, [z, #224] + ldp x2, x3, [t, #128+224] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #224] + + ldp x0, x1, [z, #240] + ldp x2, x3, [t, #128+240] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #240] + + ldp x0, x1, [z, #256] + ldp x2, x3, [t, #128+256] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #256] + + ldp x0, x1, [z, #272] + ldp x2, x3, [t, #128+272] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #272] + + ldp x0, x1, [z, #288] + ldp x2, x3, [t, #128+288] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #288] + + ldp x0, x1, [z, #304] + ldp x2, x3, [t, #128+304] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #304] + + ldp x0, x1, [z, #320] + ldp x2, x3, [t, #128+320] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #320] + + ldp x0, x1, [z, #336] + ldp x2, x3, [t, #128+336] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #336] + + ldp x0, x1, [z, #352] + ldp 
x2, x3, [t, #128+352] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #352] + + ldp x0, x1, [z, #368] + ldp x2, x3, [t, #128+368] + eor x2, x2, y + adcs x0, x0, x2 + eor x3, x3, y + adcs x1, x1, x3 + stp x0, x1, [z, #368] + +// Get the next digits effectively resulting so far starting at 3k +// [...,c,c,c,c,x] + + adcs x, y, x + adc c, y, xzr + +// Now propagate through the top quarter of the result + + ldp x0, x1, [z, #16*24] + adds x0, x0, x + adcs x1, x1, c + stp x0, x1, [z, #16*24] + + ldp x0, x1, [z, #16*25] + adcs x0, x0, c + adcs x1, x1, c + stp x0, x1, [z, #16*25] + + ldp x0, x1, [z, #16*26] + adcs x0, x0, c + adcs x1, x1, c + stp x0, x1, [z, #16*26] + + ldp x0, x1, [z, #16*27] + adcs x0, x0, c + adcs x1, x1, c + stp x0, x1, [z, #16*27] + + ldp x0, x1, [z, #16*28] + adcs x0, x0, c + adcs x1, x1, c + stp x0, x1, [z, #16*28] + + ldp x0, x1, [z, #16*29] + adcs x0, x0, c + adcs x1, x1, c + stp x0, x1, [z, #16*29] + + ldp x0, x1, [z, #16*30] + adcs x0, x0, c + adcs x1, x1, c + stp x0, x1, [z, #16*30] + + ldp x0, x1, [z, #16*31] + adcs x0, x0, c + adc x1, x1, c + stp x0, x1, [z, #16*31] + +// Restore and return + + ldp x29, x30, [sp], #16 + ldp x27, x28, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +// Local copy of bignum_kmul_16_32_neon (x0 = z, x1 = x, x2 = y, x3 = t) that +// only preserves x19-x23 and x30; x24-x29 are clobbered, which is fine as the +// main code does not rely on them. It calls the bignum_mul_8_16_neon copy below. + +bignum_kmul_32_64_neon_local_kmul_16_32: + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x30, [sp, -16]! 
+ mov x25, x0 + mov x26, x1 + mov x27, x2 + mov x28, x3 + bl bignum_kmul_32_64_neon_local_mul_8_16 + ldp x10, x11, [x26] + ldp x8, x9, [x26, #64] + subs x10, x10, x8 + sbcs x11, x11, x9 + ldp x12, x13, [x26, #16] + ldp x8, x9, [x26, #80] + sbcs x12, x12, x8 + sbcs x13, x13, x9 + ldp x14, x15, [x26, #32] + ldp x8, x9, [x26, #96] + sbcs x14, x14, x8 + sbcs x15, x15, x9 + ldp x16, x17, [x26, #48] + ldp x8, x9, [x26, #112] + sbcs x16, x16, x8 + sbcs x17, x17, x9 + csetm x29, cc + cmn x29, x29 + eor x10, x10, x29 + adcs x10, x10, xzr + eor x11, x11, x29 + adcs x11, x11, xzr + stp x10, x11, [x28] + eor x12, x12, x29 + adcs x12, x12, xzr + eor x13, x13, x29 + adcs x13, x13, xzr + stp x12, x13, [x28, #16] + eor x14, x14, x29 + adcs x14, x14, xzr + eor x15, x15, x29 + adcs x15, x15, xzr + stp x14, x15, [x28, #32] + eor x16, x16, x29 + adcs x16, x16, xzr + eor x17, x17, x29 + adcs x17, x17, xzr + stp x16, x17, [x28, #48] + add x0, x25, #0x80 + add x1, x26, #0x40 + add x2, x27, #0x40 + bl bignum_kmul_32_64_neon_local_mul_8_16 + ldp x10, x11, [x27] + ldp x8, x9, [x27, #64] + subs x10, x8, x10 + sbcs x11, x9, x11 + ldp x12, x13, [x27, #16] + ldp x8, x9, [x27, #80] + sbcs x12, x8, x12 + sbcs x13, x9, x13 + ldp x14, x15, [x27, #32] + ldp x8, x9, [x27, #96] + sbcs x14, x8, x14 + sbcs x15, x9, x15 + ldp x16, x17, [x27, #48] + ldp x8, x9, [x27, #112] + sbcs x16, x8, x16 + sbcs x17, x9, x17 + csetm x19, cc + cmn x19, x19 + eor x10, x10, x19 + adcs x10, x10, xzr + eor x11, x11, x19 + adcs x11, x11, xzr + stp x10, x11, [x28, #64] + eor x12, x12, x19 + adcs x12, x12, xzr + eor x13, x13, x19 + adcs x13, x13, xzr + stp x12, x13, [x28, #80] + eor x14, x14, x19 + adcs x14, x14, xzr + eor x15, x15, x19 + adcs x15, x15, xzr + stp x14, x15, [x28, #96] + eor x16, x16, x19 + adcs x16, x16, xzr + eor x17, x17, x19 + adcs x17, x17, xzr + stp x16, x17, [x28, #112] + eor x29, x29, x19 + ldp x10, x11, [x25, #128] + ldp x12, x13, [x25, #64] + adds x10, x10, x12 + adcs x11, x11, x13 + stp x10, x11, 
[x25, #128] + ldp x10, x11, [x25, #144] + ldp x12, x13, [x25, #80] + adcs x10, x10, x12 + adcs x11, x11, x13 + stp x10, x11, [x25, #144] + ldp x10, x11, [x25, #160] + ldp x12, x13, [x25, #96] + adcs x10, x10, x12 + adcs x11, x11, x13 + stp x10, x11, [x25, #160] + ldp x10, x11, [x25, #176] + ldp x12, x13, [x25, #112] + adcs x10, x10, x12 + adcs x11, x11, x13 + stp x10, x11, [x25, #176] + ldp x10, x11, [x25, #192] + adcs x10, x10, xzr + adcs x11, x11, xzr + stp x10, x11, [x25, #192] + ldp x10, x11, [x25, #208] + adcs x10, x10, xzr + adcs x11, x11, xzr + stp x10, x11, [x25, #208] + ldp x10, x11, [x25, #224] + adcs x10, x10, xzr + adcs x11, x11, xzr + stp x10, x11, [x25, #224] + ldp x10, x11, [x25, #240] + adcs x10, x10, xzr + adcs x11, x11, xzr + stp x10, x11, [x25, #240] + add x0, x28, #0x80 + mov x1, x28 + add x2, x28, #0x40 + bl bignum_kmul_32_64_neon_local_mul_8_16 + ldp x0, x1, [x25] + ldp x16, x17, [x25, #128] + adds x0, x0, x16 + adcs x1, x1, x17 + ldp x2, x3, [x25, #16] + ldp x16, x17, [x25, #144] + adcs x2, x2, x16 + adcs x3, x3, x17 + ldp x4, x5, [x25, #32] + ldp x16, x17, [x25, #160] + adcs x4, x4, x16 + adcs x5, x5, x17 + ldp x6, x7, [x25, #48] + ldp x16, x17, [x25, #176] + adcs x6, x6, x16 + adcs x7, x7, x17 + ldp x8, x9, [x25, #128] + ldp x16, x17, [x25, #192] + adcs x8, x8, x16 + adcs x9, x9, x17 + ldp x10, x11, [x25, #144] + ldp x16, x17, [x25, #208] + adcs x10, x10, x16 + adcs x11, x11, x17 + ldp x12, x13, [x25, #160] + ldp x16, x17, [x25, #224] + adcs x12, x12, x16 + adcs x13, x13, x17 + ldp x14, x15, [x25, #176] + ldp x16, x17, [x25, #240] + adcs x14, x14, x16 + adcs x15, x15, x17 + cset x26, cs + cmn x29, x29 + ldp x16, x17, [x28, #128] + eor x16, x16, x29 + adcs x0, x0, x16 + eor x17, x17, x29 + adcs x1, x1, x17 + stp x0, x1, [x25, #64] + ldp x16, x17, [x28, #144] + eor x16, x16, x29 + adcs x2, x2, x16 + eor x17, x17, x29 + adcs x3, x3, x17 + stp x2, x3, [x25, #80] + ldp x16, x17, [x28, #160] + eor x16, x16, x29 + adcs x4, x4, x16 + eor x17, x17, 
x29 + adcs x5, x5, x17 + stp x4, x5, [x25, #96] + ldp x16, x17, [x28, #176] + eor x16, x16, x29 + adcs x6, x6, x16 + eor x17, x17, x29 + adcs x7, x7, x17 + stp x6, x7, [x25, #112] + ldp x16, x17, [x28, #192] + eor x16, x16, x29 + adcs x8, x8, x16 + eor x17, x17, x29 + adcs x9, x9, x17 + stp x8, x9, [x25, #128] + ldp x16, x17, [x28, #208] + eor x16, x16, x29 + adcs x10, x10, x16 + eor x17, x17, x29 + adcs x11, x11, x17 + stp x10, x11, [x25, #144] + ldp x16, x17, [x28, #224] + eor x16, x16, x29 + adcs x12, x12, x16 + eor x17, x17, x29 + adcs x13, x13, x17 + stp x12, x13, [x25, #160] + ldp x16, x17, [x28, #240] + eor x16, x16, x29 + adcs x14, x14, x16 + eor x17, x17, x29 + adcs x15, x15, x17 + stp x14, x15, [x25, #176] + adcs x27, x29, x26 + adc x28, x29, xzr + ldp x10, x11, [x25, #192] + adds x10, x10, x27 + adcs x11, x11, x28 + stp x10, x11, [x25, #192] + ldp x10, x11, [x25, #208] + adcs x10, x10, x28 + adcs x11, x11, x28 + stp x10, x11, [x25, #208] + ldp x10, x11, [x25, #224] + adcs x10, x10, x28 + adcs x11, x11, x28 + stp x10, x11, [x25, #224] + ldp x10, x11, [x25, #240] + adcs x10, x10, x28 + adcs x11, x11, x28 + stp x10, x11, [x25, #240] + ldp x23, x30, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +bignum_kmul_32_64_neon_local_mul_8_16: + ldp x3, x4, [x1] + ldr q0, [x1] + ldp x7, x8, [x2] + ldr q1, [x2] + ldp x5, x6, [x1, #16] + ldr q2, [x1, #16] + ldp x9, x10, [x2, #16] + ldr q3, [x2, #16] + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + mov x11, v0.d[0] + mov x15, v0.d[1] + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, v1.d[0] + mov x17, v1.d[1] + ldr q0, [x1, #32] + ldr q1, [x2, #32] + ldr q2, [x1, #48] + ldr q3, [x2, #48] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh 
x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 
+ eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x1, #32] + stp x11, x12, [x0] + ldp x7, x8, [x2, #32] + stp x13, x14, [x0, #16] + ldp x5, x6, [x1, #48] + stp x15, x16, [x0, #32] + ldp x9, x10, [x2, #48] + stp x17, x19, [x0, #48] + mov x11, v0.d[0] + mov x15, v0.d[1] + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, v1.d[0] + mov x17, v1.d[1] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + ldp x22, x21, [x0, #32] + adds x11, x11, x22 + adcs x12, x12, x21 + ldp x22, x21, [x0, #48] + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, xzr + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x19, x19, xzr + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, 
x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x22, x21, [x1] + subs x3, x3, x22 + sbcs x4, x4, x21 + ldp x22, x21, [x1, #16] + sbcs x5, x5, x22 + sbcs x6, x6, x21 + csetm x24, cc + stp x11, x12, [x0, #64] + ldp x22, x21, [x2] + subs x7, x22, x7 + sbcs x8, x21, x8 + ldp x22, x21, [x2, #16] + sbcs x9, x22, x9 + sbcs x10, x21, x10 + csetm x1, cc + stp x13, x14, [x0, #80] + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + stp x15, x16, [x0, #96] + eor x7, x7, x1 + subs x7, x7, x1 + eor x8, x8, x1 + sbcs x8, x8, x1 + eor x9, x9, x1 + sbcs x9, x9, x1 + eor x10, 
x10, x1 + sbc x10, x10, x1 + stp x17, x19, [x0, #112] + eor x1, x1, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + 
adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x0] + ldp x7, x8, [x0, #64] + adds x3, x3, x7 + adcs x4, x4, x8 + ldp x5, x6, [x0, #16] + ldp x9, x10, [x0, #80] + adcs x5, x5, x9 + adcs x6, x6, x10 + ldp x20, x21, [x0, #96] + adcs x7, x7, x20 + adcs x8, x8, x21 + ldp x22, x23, [x0, #112] + adcs x9, x9, x22 + adcs x10, x10, x23 + adcs x24, x1, xzr + adc x2, x1, xzr + cmn x1, #0x1 + eor x11, x11, x1 + adcs x3, x11, x3 + eor x12, x12, x1 + adcs x4, x12, x4 + eor x13, x13, x1 + adcs x5, x13, x5 + eor x14, x14, x1 + adcs x6, x14, x6 + eor x15, x15, x1 + adcs x7, x15, x7 + eor x16, x16, x1 + adcs x8, x16, x8 + eor x17, x17, x1 + adcs x9, x17, x9 + eor x19, x19, x1 + adcs x10, x19, x10 + adcs x20, x20, x24 + adcs x21, x21, x2 + adcs x22, x22, x2 + adc x23, x23, x2 + stp x3, x4, [x0, #32] + stp x5, x6, [x0, #48] + stp x7, x8, [x0, #64] + stp x9, x10, [x0, #80] + stp x20, x21, [x0, #96] + stp x22, x23, [x0, #112] + ret + + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/arm/fastmul/bignum_ksqr_16_32_neon.S b/arm/fastmul/bignum_ksqr_16_32_neon.S new file mode 100644 index 00000000..bc7fca06 --- /dev/null +++ b/arm/fastmul/bignum_ksqr_16_32_neon.S @@ -0,0 +1,658 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[16]; output z[32]; temporary buffer t[>=24]
+//
+//    extern void bignum_ksqr_16_32_neon
+//     (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]);
+//
+// This is a Karatsuba-style function squaring half-sized results
+// and using temporary buffer t for intermediate results.
+//
+// Standard ARM ABI: X0 = z, X1 = x, X2 = t
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_16_32_neon)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_16_32_neon)
+        .text
+        .balign 4
+
+// Copies of the output, input and temporary pointers (local routine leaves x23-x25 alone)
+
+#define z x23
+#define x x24
+#define t x25
+
+// Sign mask for |x_lo - x_hi|; dead before the next call, which reuses x19 as scratch
+
+#define s x19
+
+
+S2N_BN_SYMBOL(bignum_ksqr_16_32_neon):
+
+// Save registers, including return address
+
+        stp     x19, x20, [sp, #-16]!
+        stp     x21, x22, [sp, #-16]!
+        stp     x23, x24, [sp, #-16]!
+        stp     x25, x30, [sp, #-16]!
+
+// Move parameters into subroutine-safe places
+
+        mov     z, x0
+        mov     x, x1
+        mov     t, x2
+
+// Compute L = x_lo^2 in bottom half of output z (size 8 x 8 -> 16); x0, x1 still hold z, x
+
+        bl      bignum_ksqr_16_32_neon_local_sqr_8_16
+
+// Compute absolute difference [t..] = |x_lo - x_hi|
+
+        ldp     x10, x11, [x]
+        ldp     x8, x9, [x, #64]
+        subs    x10, x10, x8
+        sbcs    x11, x11, x9
+        ldp     x12, x13, [x, #16]
+        ldp     x8, x9, [x, #80]
+        sbcs    x12, x12, x8
+        sbcs    x13, x13, x9
+        ldp     x14, x15, [x, #32]
+        ldp     x8, x9, [x, #96]
+        sbcs    x14, x14, x8
+        sbcs    x15, x15, x9
+        ldp     x16, x17, [x, #48]
+        ldp     x8, x9, [x, #112]
+        sbcs    x16, x16, x8
+        sbcs    x17, x17, x9
+        csetm   s, cc                   // s = all-ones if the subtraction borrowed, else 0
+        adds    xzr, s, s               // carry flag := (s != 0), the +1 of the negation ~v + 1
+        eor     x10, x10, s
+        adcs    x10, x10, xzr
+        eor     x11, x11, s
+        adcs    x11, x11, xzr
+        stp     x10, x11, [t]
+        eor     x12, x12, s
+        adcs    x12, x12, xzr
+        eor     x13, x13, s
+        adcs    x13, x13, xzr
+        stp     x12, x13, [t, #16]
+        eor     x14, x14, s
+        adcs    x14, x14, xzr
+        eor     x15, x15, s
+        adcs    x15, x15, xzr
+        stp     x14, x15, [t, #32]
+        eor     x16, x16, s
+        adcs    x16, x16, xzr
+        eor     x17, x17, s
+        adcs    x17, x17, xzr
+        stp     x16, x17, [t, #48]
+
+// Compute H = x_hi^2 in top half of output z (size 8 x 8 -> 16)
+
+        add     x0, z, #128
+        add     x1, x, #64
+        bl      bignum_ksqr_16_32_neon_local_sqr_8_16
+
+// Compute H' = H + L_top in place of H (it cannot overflow)
+// First add 8-sized block then propagate carry through next 8
+
+        ldp     x10, x11, [z, #128]
+        ldp     x12, x13, [z, #64]
+        adds    x10, x10, x12
+        adcs    x11, x11, x13
+        stp     x10, x11, [z, #128]
+
+        ldp     x10, x11, [z, #128+16]
+        ldp     x12, x13, [z, #64+16]
+        adcs    x10, x10, x12
+        adcs    x11, x11, x13
+        stp     x10, x11, [z, #128+16]
+
+        ldp     x10, x11, [z, #128+32]
+        ldp     x12, x13, [z, #64+32]
+        adcs    x10, x10, x12
+        adcs    x11, x11, x13
+        stp     x10, x11, [z, #128+32]
+
+        ldp     x10, x11, [z, #128+48]
+        ldp     x12, x13, [z, #64+48]
+        adcs    x10, x10, x12
+        adcs    x11, x11, x13
+        stp     x10, x11, [z, #128+48]
+
+        ldp     x10, x11, [z, #128+64]
+        adcs    x10, x10, xzr
+        adcs    x11, x11, xzr
+        stp     x10, x11, [z, #128+64]
+
+        ldp     x10, x11, [z, #128+80]
+        adcs    x10, x10, xzr
+        adcs    x11, x11, xzr
+        stp     x10, x11, [z, #128+80]
+
+        ldp     x10, x11, [z, #128+96]
+        adcs    x10, x10, xzr
+        adcs    x11, x11, xzr
+        stp     x10, x11, [z, #128+96]
+
+        ldp     x10, x11, [z, #128+112]
+        adcs    x10, x10, xzr
+        adcs    x11, x11, xzr
+        stp     x10, x11, [z, #128+112]
+
+// Compute M = |x_lo - x_hi|^2 in [t+8...], size 16 (input is the 8 words at [t...])
+
+        add     x0, t, #64
+        mov     x1, t
+        bl      bignum_ksqr_16_32_neon_local_sqr_8_16
+
+// Add the interlocking H' and L_bot terms, storing in registers x15..x0
+// Intercept the carry at the 8 + 16 = 24 position and store it in x.
+// (Note that we no longer need the input x was pointing at.)
+
+        ldp     x0, x1, [z]
+        ldp     x16, x17, [z, #128]
+        adds    x0, x0, x16
+        adcs    x1, x1, x17
+        ldp     x2, x3, [z, #16]
+        ldp     x16, x17, [z, #144]
+        adcs    x2, x2, x16
+        adcs    x3, x3, x17
+        ldp     x4, x5, [z, #32]
+        ldp     x16, x17, [z, #160]
+        adcs    x4, x4, x16
+        adcs    x5, x5, x17
+        ldp     x6, x7, [z, #48]
+        ldp     x16, x17, [z, #176]
+        adcs    x6, x6, x16
+        adcs    x7, x7, x17
+        ldp     x8, x9, [z, #128]
+        ldp     x16, x17, [z, #192]
+        adcs    x8, x8, x16
+        adcs    x9, x9, x17
+        ldp     x10, x11, [z, #144]
+        ldp     x16, x17, [z, #208]
+        adcs    x10, x10, x16
+        adcs    x11, x11, x17
+        ldp     x12, x13, [z, #160]
+        ldp     x16, x17, [z, #224]
+        adcs    x12, x12, x16
+        adcs    x13, x13, x17
+        ldp     x14, x15, [z, #176]
+        ldp     x16, x17, [z, #240]
+        adcs    x14, x14, x16
+        adcs    x15, x15, x17
+        cset    x, cs                   // x = carry out of the 16-word sum (0 or 1)
+
+// Subtract the mid-term cross product M
+
+        ldp     x16, x17, [t, #64]
+        subs    x0, x0, x16
+        sbcs    x1, x1, x17
+        stp     x0, x1, [z, #64]
+        ldp     x16, x17, [t, #80]
+        sbcs    x2, x2, x16
+        sbcs    x3, x3, x17
+        stp     x2, x3, [z, #80]
+        ldp     x16, x17, [t, #96]
+        sbcs    x4, x4, x16
+        sbcs    x5, x5, x17
+        stp     x4, x5, [z, #96]
+        ldp     x16, x17, [t, #112]
+        sbcs    x6, x6, x16
+        sbcs    x7, x7, x17
+        stp     x6, x7, [z, #112]
+        ldp     x16, x17, [t, #128]
+        sbcs    x8, x8, x16
+        sbcs    x9, x9, x17
+        stp     x8, x9, [z, #128]
+        ldp     x16, x17, [t, #144]
+        sbcs    x10, x10, x16
+        sbcs    x11, x11, x17
+        stp     x10, x11, [z, #144]
+        ldp     x16, x17, [t, #160]
+        sbcs    x12, x12, x16
+        sbcs    x13, x13, x17
+        stp     x12, x13, [z, #160]
+        ldp     x16, x17, [t, #176]
+        sbcs    x14, x14, x16
+        sbcs    x15, x15, x17
+        stp     x14, x15, [z, #176]
+
+// Get the next digits effectively resulting so far starting at 24
+
+        sbcs    x, x, xzr               // x = (carry from the addition) - (borrow from subtracting M)
+        csetm   t, cc                   // t = all-ones if that went negative, else 0
+
+// Now the final 8 digits of padding; the first one is special in using x
+// and also in getting the carry chain started
+
+        ldp     x10, x11, [z, #192]
+        adds    x10, x10, x
+        adcs    x11, x11, t
+        stp     x10, x11, [z, #192]
+        ldp     x10, x11, [z, #208]
+        adcs    x10, x10, t
+        adcs    x11, x11, t
+        stp     x10, x11, [z, #208]
+        ldp     x10, x11, [z, #224]
+        adcs    x10, x10, t
+        adcs    x11, x11, t
+        stp     x10, x11, [z, #224]
+        ldp     x10, x11, [z, #240]
+        adcs    x10, x10, t
+        adcs    x11, x11, t
+        stp     x10, x11, [z, #240]
+
+// Restore registers and return
+
+        ldp     x25, x30, [sp], #16
+        ldp     x23, x24, [sp], #16
+        ldp     x21, x22, [sp], #16
+        ldp     x19, x20, [sp], #16
+
+        ret
+
+// -----------------------------------------------------------------------------
+// Local 8x8->16 squaring routine, shared to reduce code size. Effectively
+// the same as bignum_sqr_8_16_neon without the scratch register preservation.
+// -----------------------------------------------------------------------------
+
+bignum_ksqr_16_32_neon_local_sqr_8_16:
+// Load registers: x0 = output (16 words), x1 = input (8 words, in x2..x9 and q20..q23)
+        ldp     x2, x3, [x1]
+ldr q20, [x1]
+        ldp     x4, x5, [x1, #16]
+ldr q21, [x1, #16]
+        ldp     x6, x7, [x1, #32]
+ldr q22, [x1, #32]
+        ldp     x8, x9, [x1, #48]
+ldr q23, [x1, #48]
+movi v30.2d, #0xffffffff                // low-32-bit mask in each 64-bit lane
+
+        mul     x17, x2, x4
+        mul     x14, x3, x5
+
+// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8
+// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3)
+ext v1.16b, v20.16b, v20.16b, #8
+        umulh   x20, x2, x4
+shrn v2.2s, v20.2d, #32
+        subs    x21, x2, x3
+zip1 v0.2s, v20.2s, v1.2s
+        cneg    x21, x21, cc  // cc = lo, ul, last
+umull v5.2d, v2.2s, v2.2s
+        csetm   x11, cc  // cc = lo, ul, last
+umull v6.2d, v2.2s, v0.2s
+        subs    x12, x5, x4
+umull v3.2d, v0.2s, v0.2s
+        cneg    x12, x12, cc  // cc = lo, ul, last
+mov v1.16b, v6.16b
+        mul     x13, x21, x12
+usra v1.2d, v3.2d, #32
+        umulh   x12, x21, x12
+and v4.16b, v1.16b, v30.16b
+        cinv    x11, x11, cc  // cc = lo, ul, last
+add v4.2d, v4.2d, v6.2d
+        eor     x13, x13, x11
+usra v5.2d, v4.2d, #32
+        eor     x12, x12, x11
+sli v3.2d, v4.2d, #32
+        adds    x19, x17, x20
+usra v5.2d, v1.2d, #32
+        adc     x20, x20, xzr
+        // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5)
+        ext     v1.16b, v21.16b, v21.16b, #8
+        umulh   x21, x3, x5
+        shrn    v2.2s, v21.2d, #32
+        adds    x19, x19, x14
+        zip1    v0.2s, v21.2s, v1.2s
+        adcs    x20, x20, x21
+        adc     x21, x21, xzr
+        adds    x20, x20, x14
+        adc     x21, x21, xzr
+        cmn     x11, #0x1               // carry := (x11 is all-ones), the +1 when negating the product
+        adcs    x19, x19, x13
+mov x13, v3.d[1] // mul x13, x3, x3
+        adcs    x20, x20, x12
+mov x14, v5.d[1] // umulh x14, x3, x3
+        adc     x21, x21, x11
+mov x12, v3.d[0] // mul x12, x2, x2
+        adds    x17, x17, x17
+mov x11, v5.d[0] // umulh x11, x2, x2
+        adcs    x19, x19, x19
+        umull   v5.2d, v2.2s, v2.2s
+        adcs    x20, x20, x20
+        umull   v6.2d, v2.2s, v0.2s
+        adcs    x21, x21, x21
+        umull   v3.2d, v0.2s, v0.2s
+        adc     x10, xzr, xzr
+        mov     v1.16b, v6.16b
+
+        mul     x15, x2, x3
+        usra    v1.2d, v3.2d, #32
+        umulh   x16, x2, x3
+        and     v4.16b, v1.16b, v30.16b
+        adds    x11, x11, x15
+        add     v4.2d, v4.2d, v6.2d
+        adcs    x13, x13, x16
+        usra    v5.2d, v4.2d, #32
+        adc     x14, x14, xzr
+        sli     v3.2d, v4.2d, #32
+        adds    x11, x11, x15
+        usra    v5.2d, v1.2d, #32
+        adcs    x13, x13, x16
+        adc     x14, x14, xzr
+        stp     x12, x11, [x0]
+        mov     x11, v5.d[0] // umulh x11, x4, x4
+        adds    x17, x17, x13
+        mov     x13, v3.d[1] // mul x13, x5, x5
+        adcs    x19, x19, x14
+        mov     x14, v5.d[1] // umulh x14, x5, x5
+        adcs    x20, x20, xzr
+        mov     x12, v3.d[0] // mul x12, x4, x4
+        adcs    x21, x21, xzr
+// NEON: prepare muls in the upper half (squares of x6, x7 from v22 into v3 / v5)
+ext v1.16b, v22.16b, v22.16b, #8
+        adc     x10, x10, xzr
+shrn v2.2s, v22.2d, #32
+        stp     x17, x19, [x0, #16]
+zip1 v0.2s, v22.2s, v1.2s
+        mul     x15, x4, x5
+umull v5.2d, v2.2s, v2.2s
+        umulh   x16, x4, x5
+umull v6.2d, v2.2s, v0.2s
+        adds    x11, x11, x15
+umull v3.2d, v0.2s, v0.2s
+        adcs    x13, x13, x16
+mov v1.16b, v6.16b
+        adc     x14, x14, xzr
+usra v1.2d, v3.2d, #32
+        adds    x11, x11, x15
+and v4.16b, v1.16b, v30.16b
+        adcs    x13, x13, x16
+add v4.2d, v4.2d, v6.2d
+        adc     x14, x14, xzr
+usra v5.2d, v4.2d, #32
+        adds    x12, x12, x20
+sli v3.2d, v4.2d, #32
+        adcs    x11, x11, x21
+usra v5.2d, v1.2d, #32
+        stp     x12, x11, [x0, #32]
+        // NEON: prepare muls in the upper half (squares of x8, x9 from v23 into v18 / v16)
+        ext     v1.16b, v23.16b, v23.16b, #8
+        adcs    x13, x13, x10
+        shrn    v2.2s, v23.2d, #32
+        adc     x14, x14, xzr
+        zip1    v0.2s, v23.2s, v1.2s
+        stp     x13, x14, [x0, #48]
+
+// Scalar (interleaved with NEON): square the upper half with a slight variant of the previous block
+        mul     x17, x6, x8
+        umull   v16.2d, v2.2s, v2.2s
+        mul     x14, x7, x9
+        umull   v6.2d, v2.2s, v0.2s
+        umulh   x20, x6, x8
+        umull   v18.2d, v0.2s, v0.2s
+        subs    x21, x6, x7
+        cneg    x21, x21, cc  // cc = lo, ul, last
+        mov     v1.16b, v6.16b
+        csetm   x11, cc  // cc = lo, ul, last
+        subs    x12, x9, x8
+        cneg    x12, x12, cc  // cc = lo, ul, last
+        usra    v1.2d, v18.2d, #32
+        mul     x13, x21, x12
+        and     v4.16b, v1.16b, v30.16b
+        umulh   x12, x21, x12
+        add     v4.2d, v4.2d, v6.2d
+        cinv    x11, x11, cc  // cc = lo, ul, last
+        eor     x13, x13, x11
+        eor     x12, x12, x11
+        usra    v16.2d, v4.2d, #32
+        adds    x19, x17, x20
+        adc     x20, x20, xzr
+        sli     v18.2d, v4.2d, #32
+        umulh   x21, x7, x9
+        adds    x19, x19, x14
+        adcs    x20, x20, x21
+        adc     x21, x21, xzr
+        adds    x20, x20, x14
+mov x14, v5.d[1] // umulh x14, x7, x7 (same NEON sequence as the lower half, applied to v22)
+        adc     x21, x21, xzr
+        cmn     x11, #0x1
+        adcs    x19, x19, x13
+mov x13, v3.d[1] // mul x13, x7, x7
+        adcs    x20, x20, x12
+mov x12, v3.d[0] // mul x12, x6, x6
+        adc     x21, x21, x11
+mov x11, v5.d[0] // umulh x11, x6, x6
+        adds    x17, x17, x17
+        adcs    x19, x19, x19
+        usra    v16.2d, v1.2d, #32
+        adcs    x20, x20, x20
+        adcs    x21, x21, x21
+        adc     x10, xzr, xzr
+// NEON: two mul+umulhs for the next stage (x4 * x8 and x5 * x9, from v21 and v23)
+uzp2 v17.4s, v21.4s, v23.4s
+        mul     x15, x6, x7
+xtn v4.2s, v23.2d
+        umulh   x16, x6, x7
+        mov     x22, v16.d[0] // umulh x22, x8, x8
+        adds    x11, x11, x15
+        adcs    x13, x13, x16
+xtn v5.2s, v21.2d
+        adc     x14, x14, xzr
+        adds    x11, x11, x15
+rev64 v1.4s, v21.4s
+        adcs    x13, x13, x16
+        adc     x14, x14, xzr
+        stp     x12, x11, [x0, #64]
+        adds    x17, x17, x13
+        mov     x13, v18.d[1] // mul x13, x9, x9
+        adcs    x19, x19, x14
+        mov     x14, v16.d[1] // umulh x14, x9, x9
+        adcs    x20, x20, xzr
+        mov     x12, v18.d[0] // mul x12, x8, x8
+        adcs    x21, x21, xzr
+        adc     x10, x10, xzr
+umull v6.2d, v4.2s, v5.2s
+        stp     x17, x19, [x0, #80]
+umull v7.2d, v4.2s, v17.2s
+        mul     x15, x8, x9
+uzp2 v16.4s, v23.4s, v23.4s
+        umulh   x16, x8, x9
+mul v0.4s, v1.4s, v23.4s
+        adds    x11, x22, x15
+        adcs    x13, x13, x16
+usra v7.2d, v6.2d, #32
+        adc     x14, x14, xzr
+        adds    x11, x11, x15
+umull v1.2d, v16.2s, v17.2s
+        adcs    x13, x13, x16
+        adc     x14, x14, xzr
+uaddlp v0.2d, v0.4s
+        adds    x12, x12, x20
+        adcs    x11, x11, x21
+and v2.16b, v7.16b, v30.16b
+umlal v2.2d, v16.2s, v5.2s
+shl v0.2d, v0.2d, #32
+usra v1.2d, v7.2d, #32
+umlal v0.2d, v4.2s, v5.2s
+mov x16, v0.d[1] // mul x16, x5, x9
+mov x15, v0.d[0] // mul x15, x4, x8
+usra v1.2d, v2.2d, #32
+mov x20, v1.d[0] // umulh x20, x4, x8
+mov x21, v1.d[1] // umulh x21, x5, x9
+        stp     x12, x11, [x0, #96]
+        adcs    x13, x13, x10
+        adc     x14, x14, xzr
+        stp     x13, x14, [x0, #112]
+
+// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0]
+
+        mul     x10, x2, x6
+        mul     x14, x3, x7
+        umulh   x17, x2, x6
+        adds    x14, x14, x17
+        umulh   x17, x3, x7
+        adcs    x15, x15, x17
+        adcs    x16, x16, x20
+        adc     x17, x21, xzr
+        adds    x11, x14, x10
+        adcs    x14, x15, x14
+        adcs    x15, x16, x15
+        adcs    x16, x17, x16
+        adc     x17, xzr, x17
+        adds    x12, x14, x10
+        adcs    x13, x15, x11
+        adcs    x14, x16, x14
+        adcs    x15, x17, x15
+        adcs    x16, xzr, x16
+        adc     x17, xzr, x17
+        subs    x22, x4, x5
+        cneg    x22, x22, cc  // cc = lo, ul, last
+        csetm   x19, cc  // cc = lo, ul, last
+        subs    x20, x9, x8
+        cneg    x20, x20, cc  // cc = lo, ul, last
+        mul     x21, x22, x20
+        umulh   x20, x22, x20
+        cinv    x19, x19, cc  // cc = lo, ul, last
+        cmn     x19, #0x1
+        eor     x21, x21, x19
+        adcs    x15, x15, x21
+        eor     x20, x20, x19
+        adcs    x16, x16, x20
+        adc     x17, x17, x19
+        subs    x22, x2, x3
+        cneg    x22, x22, cc  // cc = lo, ul, last
+        csetm   x19, cc  // cc = lo, ul, last
+        subs    x20, x7, x6
+        cneg    x20, x20, cc  // cc = lo, ul, last
+        mul     x21, x22, x20
+        umulh   x20, x22, x20
+        cinv    x19, x19, cc  // cc = lo, ul, last
+        cmn     x19, #0x1
+        eor     x21, x21, x19
+        adcs    x11, x11, x21
+        eor     x20, x20, x19
+        adcs    x12, x12, x20
+        adcs    x13, x13, x19
+        adcs    x14, x14, x19
+        adcs    x15, x15, x19
+        adcs    x16, x16, x19
+        adc     x17, x17, x19
+        subs    x22, x3, x5
+        cneg    x22, x22, cc  // cc = lo, ul, last
+        csetm   x19, cc  // cc = lo, ul, last
+        subs    x20, x9, x7
+        cneg    x20, x20, cc  // cc = lo, ul, last
+        mul     x21, x22, x20
+        umulh   x20, x22, x20
+        cinv    x19, x19, cc  // cc = lo, ul, last
+        cmn     x19, #0x1
+        eor     x21, x21, x19
+        adcs    x14, x14, x21
+        eor     x20, x20, x19
+        adcs    x15, x15, x20
+        adcs    x16, x16, x19
+        adc     x17, x17, x19
+        subs    x22, x2, x4
+        cneg    x22, x22, cc  // cc = lo, ul, last
+        csetm   x19, cc  // cc = lo, ul, last
+        subs    x20, x8, x6
+        cneg    x20, x20, cc  // cc = lo, ul, last
+        mul     x21, x22, x20
+        umulh   x20, x22, x20
+        cinv    x19, x19, cc  // cc = lo, ul, last
+        cmn     x19, #0x1
+        eor     x21, x21, x19
+        adcs    x12, x12, x21
+        eor     x20, x20, x19
+        adcs    x13, x13, x20
+        adcs    x14, x14, x19
+        adcs    x15, x15, x19
+        adcs    x16, x16, x19
+        adc     x17, x17, x19
+        subs    x22, x2, x5
+        cneg    x22, x22, cc  // cc = lo, ul, last
+        csetm   x19, cc  // cc = lo, ul, last
+        subs    x20, x9, x6
+        cneg    x20, x20, cc  // cc = lo, ul, last
+        mul     x21, x22, x20
+        umulh   x20, x22, x20
+        cinv    x19, x19, cc  // cc = lo, ul, last
+        cmn     x19, #0x1
+        eor     x21, x21, x19
+        adcs    x13, x13, x21
+        eor     x20, x20, x19
+        adcs    x14, x14, x20
+        adcs    x15, x15, x19
+        adcs    x16, x16, x19
+        adc     x17, x17, x19
+        subs    x22, x3, x4
+        cneg    x22, x22, cc  // cc = lo, ul, last
+        csetm   x19, cc  // cc = lo, ul, last
+        subs    x20, x8, x7
+        cneg    x20, x20, cc  // cc = lo, ul, last
+        mul     x21, x22, x20
+        umulh   x20, x22, x20
+        cinv    x19, x19, cc  // cc = lo, ul, last
+        cmn     x19, #0x1
+        eor     x21, x21, x19
+        adcs    x13, x13, x21
+        eor     x20, x20, x19
+        adcs    x14, x14, x20
+        adcs    x15, x15, x19
+        adcs    x16, x16, x19
+        adc     x17, x17, x19
+        adds    x10, x10, x10
+        adcs    x11, x11, x11
+        adcs    x12, x12, x12
+        adcs    x13, x13, x13
+        adcs    x14, x14, x14
+        adcs    x15, x15, x15
+        adcs    x16, x16, x16
+        adcs    x17, x17, x17
+        adc     x19, xzr, xzr           // x19 = top carry c of the doubled cross-product
+
+// Add it back to the buffer (at word offset 4, i.e. byte offset 32, of the output)
+
+        ldp     x2, x3, [x0, #32]
+        adds    x10, x10, x2
+        adcs    x11, x11, x3
+        stp     x10, x11, [x0, #32]
+
+        ldp     x2, x3, [x0, #48]
+        adcs    x12, x12, x2
+        adcs    x13, x13, x3
+        stp     x12, x13, [x0, #48]
+
+        ldp     x2, x3, [x0, #64]
+        adcs    x14, x14, x2
+        adcs    x15, x15, x3
+        stp     x14, x15, [x0, #64]
+
+        ldp     x2, x3, [x0, #80]
+        adcs    x16, x16, x2
+        adcs    x17, x17, x3
+        stp     x16, x17, [x0, #80]
+
+        ldp     x2, x3, [x0, #96]
+        adcs    x2, x2, x19
+        adcs    x3, x3, xzr
+        stp     x2, x3, [x0, #96]
+
+        ldp     x2, x3, [x0, #112]
+        adcs    x2, x2, xzr
+        adc     x3, x3, xzr
+        stp     x2, x3, [x0, #112]
+
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/arm/fastmul/bignum_ksqr_32_64_neon.S b/arm/fastmul/bignum_ksqr_32_64_neon.S
new file mode 100644
index 00000000..83e611c5
--- /dev/null
+++ b/arm/fastmul/bignum_ksqr_32_64_neon.S
@@ -0,0 +1,1075 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[32]; output z[64]; temporary buffer t[>=72]
+//
+//    extern void bignum_ksqr_32_64_neon
+//     (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]);
+//
+// This is a Karatsuba-style function squaring half-sized results
+// and using temporary buffer t for intermediate results.
+//
+// Standard ARM ABI: X0 = z, X1 = x, X2 = t
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_32_64_neon)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_32_64_neon)
+        .text
+        .balign 4
+
+#define K 16
+#define L 8 // (K/2)
+
+#define z x19
+#define x x20
+#define t x21
+
+#define c x16 // borrow / sign mask (all-ones or zero) for the carry chains
+
+
+S2N_BN_SYMBOL(bignum_ksqr_32_64_neon):
+
+// Save extra registers and return address, store parameters safely
+
+        stp     x19, x20, [sp, #-16]!
+        stp     x21, x30, [sp, #-16]!
+
+        mov     z, x0
+        mov     x, x1
+        mov     t, x2
+
+// Compute L = x_lo^2 in bottom half of output z (size 16 x 16 -> 32); x0, x1, x2 still hold z, x, t
+
+        bl      bignum_ksqr_32_64_neon_local_ksqr_16_32
+
+// Compute H = x_hi^2 in top half of output z (size 16 x 16 -> 32), again with [t...] as scratch
+
+        add     x0, z, #16*K
+        add     x1, x, #8*K
+        mov     x2, t
+        bl      bignum_ksqr_32_64_neon_local_ksqr_16_32
+
+// Compute H' = H + L_top in place of H (it cannot overflow)
+
+        ldp     x0, x1, [z, #16*16]
+        ldp     x2, x3, [z, #16*8]
+        adds    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*16]
+
+        ldp     x0, x1, [z, #16*17]
+        ldp     x2, x3, [z, #16*9]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*17]
+
+        ldp     x0, x1, [z, #16*18]
+        ldp     x2, x3, [z, #16*10]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*18]
+
+        ldp     x0, x1, [z, #16*19]
+        ldp     x2, x3, [z, #16*11]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*19]
+
+        ldp     x0, x1, [z, #16*20]
+        ldp     x2, x3, [z, #16*12]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*20]
+
+        ldp     x0, x1, [z, #16*21]
+        ldp     x2, x3, [z, #16*13]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*21]
+
+        ldp     x0, x1, [z, #16*22]
+        ldp     x2, x3, [z, #16*14]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*22]
+
+        ldp     x0, x1, [z, #16*23]
+        ldp     x2, x3, [z, #16*15]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*23]
+
+        ldp     x0, x1, [z, #16*24]
+        adcs    x0, x0, xzr
+        adcs    x1, x1, xzr
+        stp     x0, x1, [z, #16*24]
+
+        ldp     x0, x1, [z, #16*25]
+        adcs    x0, x0, xzr
+        adcs    x1, x1, xzr
+        stp     x0, x1, [z, #16*25]
+
+        ldp     x0, x1, [z, #16*26]
+        adcs    x0, x0, xzr
+        adcs    x1, x1, xzr
+        stp     x0, x1, [z, #16*26]
+
+        ldp     x0, x1, [z, #16*27]
+        adcs    x0, x0, xzr
+        adcs    x1, x1, xzr
+        stp     x0, x1, [z, #16*27]
+
+        ldp     x0, x1, [z, #16*28]
+        adcs    x0, x0, xzr
+        adcs    x1, x1, xzr
+        stp     x0, x1, [z, #16*28]
+
+        ldp     x0, x1, [z, #16*29]
+        adcs    x0, x0, xzr
+        adcs    x1, x1, xzr
+        stp     x0, x1, [z, #16*29]
+
+        ldp     x0, x1, [z, #16*30]
+        adcs    x0, x0, xzr
+        adcs    x1, x1, xzr
+        stp     x0, x1, [z, #16*30]
+
+        ldp     x0, x1, [z, #16*31]
+        adcs    x0, x0, xzr
+        adc     x1, x1, xzr
+        stp     x0, x1, [z, #16*31]
+
+// Compute absolute difference [t..] = |x_lo - x_hi| (as x_hi - x_lo, negated on borrow)
+
+        ldp     x0, x1, [x, #128]
+        ldp     x16, x17, [x]
+        subs    x0, x0, x16
+        sbcs    x1, x1, x17
+
+        ldp     x2, x3, [x, #144]
+        ldp     x16, x17, [x, #16]
+        sbcs    x2, x2, x16
+        sbcs    x3, x3, x17
+
+        ldp     x4, x5, [x, #160]
+        ldp     x16, x17, [x, #32]
+        sbcs    x4, x4, x16
+        sbcs    x5, x5, x17
+
+        ldp     x6, x7, [x, #176]
+        ldp     x16, x17, [x, #48]
+        sbcs    x6, x6, x16
+        sbcs    x7, x7, x17
+
+        ldp     x8, x9, [x, #192]
+        ldp     x16, x17, [x, #64]
+        sbcs    x8, x8, x16
+        sbcs    x9, x9, x17
+
+        ldp     x10, x11, [x, #208]
+        ldp     x16, x17, [x, #80]
+        sbcs    x10, x10, x16
+        sbcs    x11, x11, x17
+
+        ldp     x12, x13, [x, #224]
+        ldp     x16, x17, [x, #96]
+        sbcs    x12, x12, x16
+        sbcs    x13, x13, x17
+
+        ldp     x14, x15, [x, #240]
+        ldp     x16, x17, [x, #112]
+        sbcs    x14, x14, x16
+        sbcs    x15, x15, x17
+
+        sbc     c, xzr, xzr             // c = all-ones if the subtraction borrowed, else 0
+
+        adds    xzr, c, c               // carry flag := (c != 0), the +1 of the negation ~v + 1
+
+        eor     x0, x0, c
+        adcs    x0, x0, xzr
+        eor     x1, x1, c
+        adcs    x1, x1, xzr
+        stp     x0, x1, [t]
+
+        eor     x2, x2, c
+        adcs    x2, x2, xzr
+        eor     x3, x3, c
+        adcs    x3, x3, xzr
+        stp     x2, x3, [t, #16]
+
+        eor     x4, x4, c
+        adcs    x4, x4, xzr
+        eor     x5, x5, c
+        adcs    x5, x5, xzr
+        stp     x4, x5, [t, #32]
+
+        eor     x6, x6, c
+        adcs    x6, x6, xzr
+        eor     x7, x7, c
+        adcs    x7, x7, xzr
+        stp     x6, x7, [t, #48]
+
+        eor     x8, x8, c
+        adcs    x8, x8, xzr
+        eor     x9, x9, c
+        adcs    x9, x9, xzr
+        stp     x8, x9, [t, #64]
+
+        eor     x10, x10, c
+        adcs    x10, x10, xzr
+        eor     x11, x11, c
+        adcs    x11, x11, xzr
+        stp     x10, x11, [t, #80]
+
+        eor     x12, x12, c
+        adcs    x12, x12, xzr
+        eor     x13, x13, c
+        adcs    x13, x13, xzr
+        stp     x12, x13, [t, #96]
+
+        eor     x14, x14, c
+        adcs    x14, x14, xzr
+        eor     x15, x15, c
+        adc     x15, x15, xzr
+        stp     x14, x15, [t, #112]
+
+// Compute M = |x_lo - x_hi|^2, size 32, in [t+16...], with [t+48...] as the nested temporary
+
+        add     x0, t, #8*K
+        mov     x1, t
+        add     x2, t, #24*K
+        bl      bignum_ksqr_32_64_neon_local_ksqr_16_32
+
+// Add the interlocking H' and L_bot terms
+// Intercept the carry at the 3k position and store it in x.
+// (Note that we no longer need the input x was pointing at.)
+
+        ldp     x0, x1, [z, #16*16]
+        ldp     x2, x3, [z]
+        adds    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*8]
+
+        ldp     x0, x1, [z, #16*17]
+        ldp     x2, x3, [z, #16*1]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*9]
+
+        ldp     x0, x1, [z, #16*18]
+        ldp     x2, x3, [z, #16*2]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*10]
+
+        ldp     x0, x1, [z, #16*19]
+        ldp     x2, x3, [z, #16*3]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*11]
+
+        ldp     x0, x1, [z, #16*20]
+        ldp     x2, x3, [z, #16*4]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*12]
+
+        ldp     x0, x1, [z, #16*21]
+        ldp     x2, x3, [z, #16*5]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*13]
+
+        ldp     x0, x1, [z, #16*22]
+        ldp     x2, x3, [z, #16*6]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*14]
+
+        ldp     x0, x1, [z, #16*23]
+        ldp     x2, x3, [z, #16*7]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*15]
+
+        ldp     x0, x1, [z, #16*16]
+        ldp     x2, x3, [z, #16*24]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*16]
+
+        ldp     x0, x1, [z, #16*17]
+        ldp     x2, x3, [z, #16*25]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*17]
+
+        ldp     x0, x1, [z, #16*18]
+        ldp     x2, x3, [z, #16*26]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*18]
+
+        ldp     x0, x1, [z, #16*19]
+        ldp     x2, x3, [z, #16*27]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*19]
+
+        ldp     x0, x1, [z, #16*20]
+        ldp     x2, x3, [z, #16*28]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*20]
+
+        ldp     x0, x1, [z, #16*21]
+        ldp     x2, x3, [z, #16*29]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*21]
+
+        ldp     x0, x1, [z, #16*22]
+        ldp     x2, x3, [z, #16*30]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*22]
+
+        ldp     x0, x1, [z, #16*23]
+        ldp     x2, x3, [z, #16*31]
+        adcs    x0, x0, x2
+        adcs    x1, x1, x3
+        stp     x0, x1, [z, #16*23]
+
+        cset    x, cs                   // x = carry out of the 32-word sum (0 or 1)
+
+// Subtract the mid-term cross product M (held at byte offset 16*L = 8*K of t)
+
+        ldp     x0, x1, [z, #16*L]
+        ldp     x2, x3, [t, #16*L]
+        subs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*L]
+
+        ldp     x0, x1, [z, #16*9]
+        ldp     x2, x3, [t, #16*9]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*9]
+
+        ldp     x0, x1, [z, #16*10]
+        ldp     x2, x3, [t, #16*10]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*10]
+
+        ldp     x0, x1, [z, #16*11]
+        ldp     x2, x3, [t, #16*11]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*11]
+
+        ldp     x0, x1, [z, #16*12]
+        ldp     x2, x3, [t, #16*12]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*12]
+
+        ldp     x0, x1, [z, #16*13]
+        ldp     x2, x3, [t, #16*13]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*13]
+
+        ldp     x0, x1, [z, #16*14]
+        ldp     x2, x3, [t, #16*14]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*14]
+
+        ldp     x0, x1, [z, #16*15]
+        ldp     x2, x3, [t, #16*15]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*15]
+
+        ldp     x0, x1, [z, #16*16]
+        ldp     x2, x3, [t, #16*16]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*16]
+
+        ldp     x0, x1, [z, #16*17]
+        ldp     x2, x3, [t, #16*17]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*17]
+
+        ldp     x0, x1, [z, #16*18]
+        ldp     x2, x3, [t, #16*18]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*18]
+
+        ldp     x0, x1, [z, #16*19]
+        ldp     x2, x3, [t, #16*19]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*19]
+
+        ldp     x0, x1, [z, #16*20]
+        ldp     x2, x3, [t, #16*20]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*20]
+
+        ldp     x0, x1, [z, #16*21]
+        ldp     x2, x3, [t, #16*21]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*21]
+
+        ldp     x0, x1, [z, #16*22]
+        ldp     x2, x3, [t, #16*22]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*22]
+
+        ldp     x0, x1, [z, #16*23]
+        ldp     x2, x3, [t, #16*23]
+        sbcs    x0, x0, x2
+        sbcs    x1, x1, x3
+        stp     x0, x1, [z, #16*23]
+
+// Get the next digits effectively resulting so far starting at 3k
+// [...,c,c,c,c,x]
+
+        sbcs    x, x, xzr               // x = (carry from the addition) - (borrow from subtracting M)
+        csetm   c, cc                   // c = all-ones if that went negative, else 0
+
+// Now propagate through the top quarter of the result
+
+        ldp     x0, x1, [z, #16*24]
+        adds    x0, x0, x
+        adcs    x1, x1, c
+        stp     x0, x1, [z, #16*24]
+
+        ldp     x0, x1, [z, #16*25]
+        adcs    x0, x0, c
+        adcs    x1, x1, c
+        stp     x0, x1, [z, #16*25]
+
+        ldp     x0, x1, [z, #16*26]
+        adcs    x0, x0, c
+        adcs    x1, x1, c
+        stp     x0, x1, [z, #16*26]
+
+        ldp     x0, x1, [z, #16*27]
+        adcs    x0, x0, c
+        adcs    x1, x1, c
+        stp     x0, x1, [z, #16*27]
+
+        ldp     x0, x1, [z, #16*28]
+        adcs    x0, x0, c
+        adcs    x1, x1, c
+        stp     x0, x1, [z, #16*28]
+
+        ldp     x0, x1, [z, #16*29]
+        adcs    x0, x0, c
+        adcs    x1, x1, c
+        stp     x0, x1, [z, #16*29]
+
+        ldp     x0, x1, [z, #16*30]
+        adcs    x0, x0, c
+        adcs    x1, x1, c
+        stp     x0, x1, [z, #16*30]
+
+        ldp     x0, x1, [z, #16*31]
+        adcs    x0, x0, c
+        adc     x1, x1, c
+        stp     x0, x1, [z, #16*31]
+
+// Restore
+
+        ldp     x21, x30, [sp], #16
+        ldp     x19, x20, [sp], #16
+
+        ret
+
+// Local copy of bignum_ksqr_16_32_neon (x0 = z[32], x1 = x[16], x2 = t[24]); saves x19-x25, x30.
+// It calls the local 8x8->16 squaring routine defined after it three times.
+
+bignum_ksqr_32_64_neon_local_ksqr_16_32:
+        stp     x19, x20, [sp, #-16]!
+        stp     x21, x22, [sp, #-16]!
+        stp     x23, x24, [sp, #-16]!
+        stp     x25, x30, [sp, #-16]!
+        mov     x23, x0                 // x23 = output pointer
+        mov     x24, x1                 // x24 = input pointer
+        mov     x25, x2                 // x25 = temporary buffer pointer
+        bl      bignum_ksqr_32_64_neon_local_sqr_8_16   // low input half squared into output words 0..15
+        ldp     x10, x11, [x24]         // 8-word difference of the two input halves
+        ldp     x8, x9, [x24, #64]
+        subs    x10, x10, x8
+        sbcs    x11, x11, x9
+        ldp     x12, x13, [x24, #16]
+        ldp     x8, x9, [x24, #80]
+        sbcs    x12, x12, x8
+        sbcs    x13, x13, x9
+        ldp     x14, x15, [x24, #32]
+        ldp     x8, x9, [x24, #96]
+        sbcs    x14, x14, x8
+        sbcs    x15, x15, x9
+        ldp     x16, x17, [x24, #48]
+        ldp     x8, x9, [x24, #112]
+        sbcs    x16, x16, x8
+        sbcs    x17, x17, x9
+        csetm   x19, cc                 // x19 = all-ones if the subtraction borrowed, else 0
+        cmn     x19, x19                // carry flag := (x19 != 0), the +1 of the negation ~v + 1
+        eor     x10, x10, x19
+        adcs    x10, x10, xzr
+        eor     x11, x11, x19
+        adcs    x11, x11, xzr
+        stp     x10, x11, [x25]         // absolute difference goes to the first 8 words of the temporary
+        eor     x12, x12, x19
+        adcs    x12, x12, xzr
+        eor     x13, x13, x19
+        adcs    x13, x13, xzr
+        stp     x12, x13, [x25, #16]
+        eor     x14, x14, x19
+        adcs    x14, x14, xzr
+        eor     x15, x15, x19
+        adcs    x15, x15, xzr
+        stp     x14, x15, [x25, #32]
+        eor     x16, x16, x19
+        adcs    x16, x16, xzr
+        eor     x17, x17, x19
+        adcs    x17, x17, xzr
+        stp     x16, x17, [x25, #48]
+        add     x0, x23, #0x80
+        add     x1, x24, #0x40
+        bl      bignum_ksqr_32_64_neon_local_sqr_8_16   // high input half squared into output words 16..31
+        ldp     x10, x11, [x23, #128]   // add output words 8..15 into 16..23, then ripple the carry
+        ldp     x12, x13, [x23, #64]
+        adds    x10, x10, x12
+        adcs    x11, x11, x13
+        stp     x10, x11, [x23, #128]
+        ldp     x10, x11, [x23, #144]
+        ldp     x12, x13, [x23, #80]
+        adcs    x10, x10, x12
+        adcs    x11, x11, x13
+        stp     x10, x11, [x23, #144]
+        ldp     x10, x11, [x23, #160]
+        ldp     x12, x13, [x23, #96]
+        adcs    x10, x10, x12
+        adcs    x11, x11, x13
+        stp     x10, x11, [x23, #160]
+        ldp     x10, x11, [x23, #176]
+        ldp     x12, x13, [x23, #112]
+        adcs    x10, x10, x12
+        adcs    x11, x11, x13
+        stp     x10, x11, [x23, #176]
+        ldp     x10, x11, [x23, #192]
+        adcs    x10, x10, xzr
+        adcs    x11, x11, xzr
+        stp     x10, x11, [x23, #192]
+        ldp     x10, x11, [x23, #208]
+        adcs    x10, x10, xzr
+        adcs    x11, x11, xzr
+        stp     x10, x11, [x23, #208]
+        ldp     x10, x11, [x23, #224]
+        adcs    x10, x10, xzr
+        adcs    x11, x11, xzr
+        stp     x10, x11, [x23, #224]
+        ldp     x10, x11, [x23, #240]
+        adcs    x10, x10, xzr
+        adcs    x11, x11, xzr
+        stp     x10, x11, [x23, #240]
+        add     x0, x25, #0x40
+        mov     x1, x25
+        bl      bignum_ksqr_32_64_neon_local_sqr_8_16   // squared difference into temporary words 8..23
+        ldp     x0, x1, [x23]           // 16-word sum of output words 0..7 / 16..23 and 16..31, kept in x0..x15
+        ldp     x16, x17, [x23, #128]
+        adds    x0, x0, x16
+        adcs    x1, x1, x17
+        ldp     x2, x3, [x23, #16]
+        ldp     x16, x17, [x23, #144]
+        adcs    x2, x2, x16
+        adcs    x3, x3, x17
+        ldp     x4, x5, [x23, #32]
+        ldp     x16, x17, [x23, #160]
+        adcs    x4, x4, x16
+        adcs    x5, x5, x17
+        ldp     x6, x7, [x23, #48]
+        ldp     x16, x17, [x23, #176]
+        adcs    x6, x6, x16
+        adcs    x7, x7, x17
+        ldp     x8, x9, [x23, #128]
+        ldp     x16, x17, [x23, #192]
+        adcs    x8, x8, x16
+        adcs    x9, x9, x17
+        ldp     x10, x11, [x23, #144]
+        ldp     x16, x17, [x23, #208]
+        adcs    x10, x10, x16
+        adcs    x11, x11, x17
+        ldp     x12, x13, [x23, #160]
+        ldp     x16, x17, [x23, #224]
+        adcs    x12, x12, x16
+        adcs    x13, x13, x17
+        ldp     x14, x15, [x23, #176]
+        ldp     x16, x17, [x23, #240]
+        adcs    x14, x14, x16
+        adcs    x15, x15, x17
+        cset    x24, cs                 // x24 = carry out of the 16-word sum (input pointer no longer needed)
+        ldp     x16, x17, [x25, #64]    // subtract the squared difference, storing to output words 8..23
+        subs    x0, x0, x16
+        sbcs    x1, x1, x17
+        stp     x0, x1, [x23, #64]
+        ldp     x16, x17, [x25, #80]
+        sbcs    x2, x2, x16
+        sbcs    x3, x3, x17
+        stp     x2, x3, [x23, #80]
+        ldp     x16, x17, [x25, #96]
+        sbcs    x4, x4, x16
+        sbcs    x5, x5, x17
+        stp     x4, x5, [x23, #96]
+        ldp     x16, x17, [x25, #112]
+        sbcs    x6, x6, x16
+        sbcs    x7, x7, x17
+        stp     x6, x7, [x23, #112]
+        ldp     x16, x17, [x25, #128]
+        sbcs    x8, x8, x16
+        sbcs    x9, x9, x17
+        stp     x8, x9, [x23, #128]
+        ldp     x16, x17, [x25, #144]
+        sbcs    x10, x10, x16
+        sbcs    x11, x11, x17
+        stp     x10, x11, [x23, #144]
+        ldp     x16, x17, [x25, #160]
+        sbcs    x12, x12, x16
+        sbcs    x13, x13, x17
+        stp     x12, x13, [x23, #160]
+        ldp     x16, x17, [x25, #176]
+        sbcs    x14, x14, x16
+        sbcs    x15, x15, x17
+        stp     x14, x15, [x23, #176]
+        sbcs    x24, x24, xzr           // x24 = (carry from the addition) - (borrow from the subtraction)
+        csetm   x25, cc                 // x25 = all-ones if that went negative, else 0
+        ldp     x10, x11, [x23, #192]   // propagate x24 and its sign extension x25 through words 24..31
+        adds    x10, x10, x24
+        adcs    x11, x11, x25
+        stp     x10, x11, [x23, #192]
+        ldp     x10, x11, [x23, #208]
+        adcs    x10, x10, x25
+        adcs    x11, x11, x25
+        stp     x10, x11, [x23, #208]
+        ldp     x10, x11, [x23, #224]
+        adcs    x10, x10, x25
+        adcs    x11, x11, x25
+        stp     x10, x11, [x23, #224]
+        ldp     x10, x11, [x23, #240]
+        adcs    x10, x10, x25
+        adcs    x11, x11, x25
+        stp     x10, x11, [x23, #240]
+        ldp     x25, x30, [sp], #16
+        ldp     x23, x24, [sp], #16
+        ldp     x21, x22, [sp], #16
+        ldp     x19, x20, [sp], #16
+        ret
+
+bignum_ksqr_32_64_neon_local_sqr_8_16:
+// Load registers.
+ ldp x2, x3, [x1] +ldr q20, [x1] + ldp x4, x5, [x1, #16] +ldr q21, [x1, #16] + ldp x6, x7, [x1, #32] +ldr q22, [x1, #32] + ldp x8, x9, [x1, #48] +ldr q23, [x1, #48] +movi v30.2d, #0xffffffff + + mul x17, x2, x4 + mul x14, x3, x5 + +// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 +// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) +ext v1.16b, v20.16b, v20.16b, #8 + umulh x20, x2, x4 +shrn v2.2s, v20.2d, #32 + subs x21, x2, x3 +zip1 v0.2s, v20.2s, v1.2s + cneg x21, x21, cc // cc = lo, ul, last +umull v5.2d, v2.2s, v2.2s + csetm x11, cc // cc = lo, ul, last +umull v6.2d, v2.2s, v0.2s + subs x12, x5, x4 +umull v3.2d, v0.2s, v0.2s + cneg x12, x12, cc // cc = lo, ul, last +mov v1.16b, v6.16b + mul x13, x21, x12 +usra v1.2d, v3.2d, #32 + umulh x12, x21, x12 +and v4.16b, v1.16b, v30.16b + cinv x11, x11, cc // cc = lo, ul, last +add v4.2d, v4.2d, v6.2d + eor x13, x13, x11 +usra v5.2d, v4.2d, #32 + eor x12, x12, x11 +sli v3.2d, v4.2d, #32 + adds x19, x17, x20 +usra v5.2d, v1.2d, #32 + adc x20, x20, xzr + // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) + ext v1.16b, v21.16b, v21.16b, #8 + umulh x21, x3, x5 + shrn v2.2s, v21.2d, #32 + adds x19, x19, x14 + zip1 v0.2s, v21.2s, v1.2s + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x3, x3 + adcs x20, x20, x12 +mov x14, v5.d[1] // umulh x14, x3, x3 + adc x21, x21, x11 +mov x12, v3.d[0] // mul x12, x2, x2 + adds x17, x17, x17 +mov x11, v5.d[0] // umulh x11, x2, x2 + adcs x19, x19, x19 + umull v5.2d, v2.2s, v2.2s + adcs x20, x20, x20 + umull v6.2d, v2.2s, v0.2s + adcs x21, x21, x21 + umull v3.2d, v0.2s, v0.2s + adc x10, xzr, xzr + mov v1.16b, v6.16b + + mul x15, x2, x3 + usra v1.2d, v3.2d, #32 + umulh x16, x2, x3 + and v4.16b, v1.16b, v30.16b + adds x11, x11, x15 + add v4.2d, v4.2d, v6.2d + adcs x13, x13, x16 + usra v5.2d, v4.2d, #32 + adc x14, x14, xzr + sli v3.2d, v4.2d, #32 
+ adds x11, x11, x15 + usra v5.2d, v1.2d, #32 + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [x0] + mov x11, v5.d[0] // umulh x11, x4, x4 + adds x17, x17, x13 + mov x13, v3.d[1] // mul x13, x5, x5 + adcs x19, x19, x14 + mov x14, v5.d[1] // umulh x14, x5, x5 + adcs x20, x20, xzr + mov x12, v3.d[0] // mul x12, x4, x4 + adcs x21, x21, xzr +// NEON: prepare muls in the upper half +ext v1.16b, v22.16b, v22.16b, #8 + adc x10, x10, xzr +shrn v2.2s, v22.2d, #32 + stp x17, x19, [x0, #16] +zip1 v0.2s, v22.2s, v1.2s + mul x15, x4, x5 +umull v5.2d, v2.2s, v2.2s + umulh x16, x4, x5 +umull v6.2d, v2.2s, v0.2s + adds x11, x11, x15 +umull v3.2d, v0.2s, v0.2s + adcs x13, x13, x16 +mov v1.16b, v6.16b + adc x14, x14, xzr +usra v1.2d, v3.2d, #32 + adds x11, x11, x15 +and v4.16b, v1.16b, v30.16b + adcs x13, x13, x16 +add v4.2d, v4.2d, v6.2d + adc x14, x14, xzr +usra v5.2d, v4.2d, #32 + adds x12, x12, x20 +sli v3.2d, v4.2d, #32 + adcs x11, x11, x21 +usra v5.2d, v1.2d, #32 + stp x12, x11, [x0, #32] + // NEON: prepare muls in the upper half + ext v1.16b, v23.16b, v23.16b, #8 + adcs x13, x13, x10 + shrn v2.2s, v23.2d, #32 + adc x14, x14, xzr + zip1 v0.2s, v23.2s, v1.2s + stp x13, x14, [x0, #48] + +// Scalar: square the upper half with a slight variant of the previous block + mul x17, x6, x8 + umull v16.2d, v2.2s, v2.2s + mul x14, x7, x9 + umull v6.2d, v2.2s, v0.2s + umulh x20, x6, x8 + umull v18.2d, v0.2s, v0.2s + subs x21, x6, x7 + cneg x21, x21, cc // cc = lo, ul, last + mov v1.16b, v6.16b + csetm x11, cc // cc = lo, ul, last + subs x12, x9, x8 + cneg x12, x12, cc // cc = lo, ul, last + usra v1.2d, v18.2d, #32 + mul x13, x21, x12 + and v4.16b, v1.16b, v30.16b + umulh x12, x21, x12 + add v4.2d, v4.2d, v6.2d + cinv x11, x11, cc // cc = lo, ul, last + eor x13, x13, x11 + eor x12, x12, x11 + usra v16.2d, v4.2d, #32 + adds x19, x17, x20 + adc x20, x20, xzr + sli v18.2d, v4.2d, #32 + umulh x21, x7, x9 + adds x19, x19, x14 + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 
+mov x14, v5.d[1] // umulh x14, x7, x7 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x7, x7 + adcs x20, x20, x12 +mov x12, v3.d[0] // mul x12, x6, x6 + adc x21, x21, x11 +mov x11, v5.d[0] // umulh x11, x6, x6 + adds x17, x17, x17 + adcs x19, x19, x19 + usra v16.2d, v1.2d, #32 + adcs x20, x20, x20 + adcs x21, x21, x21 + adc x10, xzr, xzr +// NEON: two mul+umulhs for the next stage +uzp2 v17.4s, v21.4s, v23.4s + mul x15, x6, x7 +xtn v4.2s, v23.2d + umulh x16, x6, x7 + mov x22, v16.d[0] // umulh x22, x8, x8 + adds x11, x11, x15 + adcs x13, x13, x16 +xtn v5.2s, v21.2d + adc x14, x14, xzr + adds x11, x11, x15 +rev64 v1.4s, v21.4s + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [x0, #64] + adds x17, x17, x13 + mov x13, v18.d[1] // mul x13, x9, x9 + adcs x19, x19, x14 + mov x14, v16.d[1] // umulh x14, x9, x9 + adcs x20, x20, xzr + mov x12, v18.d[0] // mul x12, x8, x8 + adcs x21, x21, xzr + adc x10, x10, xzr +umull v6.2d, v4.2s, v5.2s + stp x17, x19, [x0, #80] +umull v7.2d, v4.2s, v17.2s + mul x15, x8, x9 +uzp2 v16.4s, v23.4s, v23.4s + umulh x16, x8, x9 +mul v0.4s, v1.4s, v23.4s + adds x11, x22, x15 + adcs x13, x13, x16 +usra v7.2d, v6.2d, #32 + adc x14, x14, xzr + adds x11, x11, x15 +umull v1.2d, v16.2s, v17.2s + adcs x13, x13, x16 + adc x14, x14, xzr +uaddlp v0.2d, v0.4s + adds x12, x12, x20 + adcs x11, x11, x21 +and v2.16b, v7.16b, v30.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x16, v0.d[1] // mul x16, x5, x9 +mov x15, v0.d[0] // mul x15, x4, x8 +usra v1.2d, v2.2d, #32 +mov x20, v1.d[0] // umulh x20, x4, x8 +mov x21, v1.d[1] // umulh x21, x5, x9 + stp x12, x11, [x0, #96] + adcs x13, x13, x10 + adc x14, x14, xzr + stp x13, x14, [x0, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + + mul x10, x2, x6 + mul x14, x3, x7 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + adcs x16, x16, x20 + adc x17, x21, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, 
xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x8 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x7, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x7 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x8, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, 
x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x8, x7 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + adds x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adc x19, xzr, xzr + +// Add it back to the buffer + + ldp x2, x3, [x0, #32] + adds x10, x10, x2 + adcs x11, x11, x3 + stp x10, x11, [x0, #32] + + ldp x2, x3, [x0, #48] + adcs x12, x12, x2 + adcs x13, x13, x3 + stp x12, x13, [x0, #48] + + ldp x2, x3, [x0, #64] + adcs x14, x14, x2 + adcs x15, x15, x3 + stp x14, x15, [x0, #64] + + ldp x2, x3, [x0, #80] + adcs x16, x16, x2 + adcs x17, x17, x3 + stp x16, x17, [x0, #80] + + ldp x2, x3, [x0, #96] + adcs x2, x2, x19 + adcs x3, x3, xzr + stp x2, x3, [x0, #96] + + ldp x2, x3, [x0, #112] + adcs x2, x2, xzr + adc x3, x3, xzr + stp x2, x3, [x0, #112] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/fastmul/bignum_mul_8_16_neon.S b/arm/fastmul/bignum_mul_8_16_neon.S new file mode 100644 index 00000000..c52349d2 --- /dev/null +++ b/arm/fastmul/bignum_mul_8_16_neon.S @@ -0,0 +1,509 @@ +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_mul_8_16_neon): + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ ldp x3, x4, [x1] + ldr q0, [x1] + ldp x7, x8, [x2] + ldr q1, [x2] + ldp x5, x6, [x1, #16] + ldr q2, [x1, #16] + ldp x9, x10, [x2, #16] + ldr q3, [x2, #16] + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + mov x11, v0.d[0] // mul x11, x3, x7 + mov x15, v0.d[1] // mul x15, x4, x8 + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, v1.d[0] // mul x16, x5, x9 + mov x17, v1.d[1] // mul x17, x6, x10 + ldr q0, [x1, #32] + ldr q1, [x2, #32] + ldr q2, [x1, #48] + ldr q3, [x2, #48] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, 
x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x1, #32] + stp x11, x12, [x0] + ldp x7, x8, [x2, #32] + stp x13, x14, [x0, #16] + ldp x5, x6, [x1, #48] + stp x15, x16, [x0, #32] + ldp x9, x10, [x2, #48] + stp x17, x19, [x0, #48] + mov x11, v0.d[0] // mul x11, x3, x7 + mov x15, v0.d[1] // mul x15, x4, x8 + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, v1.d[0] // mul x16, x5, x9 + mov x17, v1.d[1] // mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + ldp x22, x21, [x0, #32] 
+ adds x11, x11, x22 + adcs x12, x12, x21 + ldp x22, x21, [x0, #48] + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, xzr + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x19, x19, xzr + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, 
x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x22, x21, [x1] + subs x3, x3, x22 + sbcs x4, x4, x21 + ldp x22, x21, [x1, #16] + sbcs x5, x5, x22 + sbcs x6, x6, x21 + csetm x24, cc + stp x11, x12, [x0, #64] + ldp x22, x21, [x2] + subs x7, x22, x7 + sbcs x8, x21, x8 + ldp x22, x21, [x2, #16] + sbcs x9, x22, x9 + sbcs x10, x21, x10 + csetm x1, cc + stp x13, x14, [x0, #80] + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + stp x15, x16, [x0, #96] + eor x7, x7, x1 + subs x7, x7, x1 + eor x8, x8, x1 + sbcs x8, x8, x1 + eor x9, x9, x1 + sbcs x9, x9, x1 + eor x10, x10, x1 + sbc x10, x10, x1 + stp x17, x19, [x0, #112] + eor x1, x1, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + 
umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x0] + ldp x7, x8, [x0, #64] + adds x3, x3, x7 + adcs x4, x4, x8 + ldp x5, x6, [x0, #16] + ldp x9, x10, [x0, #80] + adcs x5, x5, x9 + adcs x6, x6, x10 + ldp x20, x21, [x0, #96] + adcs x7, x7, x20 + adcs x8, x8, x21 + ldp x22, x23, [x0, #112] + adcs x9, x9, x22 + adcs x10, x10, x23 + adcs x24, x1, xzr + adc x2, x1, xzr + cmn x1, #0x1 + eor x11, x11, x1 + adcs x3, x11, x3 + eor x12, x12, x1 + adcs x4, x12, x4 + eor x13, x13, x1 + adcs x5, x13, x5 + eor x14, x14, x1 + adcs x6, x14, x6 + eor x15, x15, x1 + adcs x7, x15, x7 + eor x16, x16, x1 + adcs x8, x16, x8 + eor x17, x17, x1 + adcs x9, x17, x9 + eor x19, x19, x1 + adcs x10, x19, x10 + adcs x20, x20, x24 + adcs x21, x21, x2 + adcs x22, x22, x2 + adc x23, x23, x2 + stp x3, x4, [x0, #32] + stp x5, x6, [x0, #48] + stp x7, x8, [x0, #64] + stp x9, x10, [x0, #80] + stp x20, x21, 
[x0, #96] + stp x22, x23, [x0, #112] + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + diff --git a/arm/fastmul/bignum_sqr_8_16_neon.S b/arm/fastmul/bignum_sqr_8_16_neon.S new file mode 100644 index 00000000..920d8cad --- /dev/null +++ b/arm/fastmul/bignum_sqr_8_16_neon.S @@ -0,0 +1,423 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16_neon (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_neon) + .text + .balign 4 + + +S2N_BN_SYMBOL(bignum_sqr_8_16_neon): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + +// Load registers. 
+ ldp x2, x3, [x1] +ldr q20, [x1] + ldp x4, x5, [x1, #16] +ldr q21, [x1, #16] + ldp x6, x7, [x1, #32] +ldr q22, [x1, #32] + ldp x8, x9, [x1, #48] +ldr q23, [x1, #48] +movi v30.2d, #0xffffffff + + mul x17, x2, x4 + mul x14, x3, x5 + +// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 +// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) +ext v1.16b, v20.16b, v20.16b, #8 + umulh x20, x2, x4 +shrn v2.2s, v20.2d, #32 + subs x21, x2, x3 +zip1 v0.2s, v20.2s, v1.2s + cneg x21, x21, cc // cc = lo, ul, last +umull v5.2d, v2.2s, v2.2s + csetm x11, cc // cc = lo, ul, last +umull v6.2d, v2.2s, v0.2s + subs x12, x5, x4 +umull v3.2d, v0.2s, v0.2s + cneg x12, x12, cc // cc = lo, ul, last +mov v1.16b, v6.16b + mul x13, x21, x12 +usra v1.2d, v3.2d, #32 + umulh x12, x21, x12 +and v4.16b, v1.16b, v30.16b + cinv x11, x11, cc // cc = lo, ul, last +add v4.2d, v4.2d, v6.2d + eor x13, x13, x11 +usra v5.2d, v4.2d, #32 + eor x12, x12, x11 +sli v3.2d, v4.2d, #32 + adds x19, x17, x20 +usra v5.2d, v1.2d, #32 + adc x20, x20, xzr + // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) + ext v1.16b, v21.16b, v21.16b, #8 + umulh x21, x3, x5 + shrn v2.2s, v21.2d, #32 + adds x19, x19, x14 + zip1 v0.2s, v21.2s, v1.2s + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x3, x3 + adcs x20, x20, x12 +mov x14, v5.d[1] // umulh x14, x3, x3 + adc x21, x21, x11 +mov x12, v3.d[0] // mul x12, x2, x2 + adds x17, x17, x17 +mov x11, v5.d[0] // umulh x11, x2, x2 + adcs x19, x19, x19 + umull v5.2d, v2.2s, v2.2s + adcs x20, x20, x20 + umull v6.2d, v2.2s, v0.2s + adcs x21, x21, x21 + umull v3.2d, v0.2s, v0.2s + adc x10, xzr, xzr + mov v1.16b, v6.16b + + mul x15, x2, x3 + usra v1.2d, v3.2d, #32 + umulh x16, x2, x3 + and v4.16b, v1.16b, v30.16b + adds x11, x11, x15 + add v4.2d, v4.2d, v6.2d + adcs x13, x13, x16 + usra v5.2d, v4.2d, #32 + adc x14, x14, xzr + sli v3.2d, v4.2d, #32 
+ adds x11, x11, x15 + usra v5.2d, v1.2d, #32 + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [x0] + mov x11, v5.d[0] // umulh x11, x4, x4 + adds x17, x17, x13 + mov x13, v3.d[1] // mul x13, x5, x5 + adcs x19, x19, x14 + mov x14, v5.d[1] // umulh x14, x5, x5 + adcs x20, x20, xzr + mov x12, v3.d[0] // mul x12, x4, x4 + adcs x21, x21, xzr +// NEON: prepare muls in the upper half +ext v1.16b, v22.16b, v22.16b, #8 + adc x10, x10, xzr +shrn v2.2s, v22.2d, #32 + stp x17, x19, [x0, #16] +zip1 v0.2s, v22.2s, v1.2s + mul x15, x4, x5 +umull v5.2d, v2.2s, v2.2s + umulh x16, x4, x5 +umull v6.2d, v2.2s, v0.2s + adds x11, x11, x15 +umull v3.2d, v0.2s, v0.2s + adcs x13, x13, x16 +mov v1.16b, v6.16b + adc x14, x14, xzr +usra v1.2d, v3.2d, #32 + adds x11, x11, x15 +and v4.16b, v1.16b, v30.16b + adcs x13, x13, x16 +add v4.2d, v4.2d, v6.2d + adc x14, x14, xzr +usra v5.2d, v4.2d, #32 + adds x12, x12, x20 +sli v3.2d, v4.2d, #32 + adcs x11, x11, x21 +usra v5.2d, v1.2d, #32 + stp x12, x11, [x0, #32] + // NEON: prepare muls in the upper half + ext v1.16b, v23.16b, v23.16b, #8 + adcs x13, x13, x10 + shrn v2.2s, v23.2d, #32 + adc x14, x14, xzr + zip1 v0.2s, v23.2s, v1.2s + stp x13, x14, [x0, #48] + +// Scalar: square the upper half with a slight variant of the previous block + mul x17, x6, x8 + umull v16.2d, v2.2s, v2.2s + mul x14, x7, x9 + umull v6.2d, v2.2s, v0.2s + umulh x20, x6, x8 + umull v18.2d, v0.2s, v0.2s + subs x21, x6, x7 + cneg x21, x21, cc // cc = lo, ul, last + mov v1.16b, v6.16b + csetm x11, cc // cc = lo, ul, last + subs x12, x9, x8 + cneg x12, x12, cc // cc = lo, ul, last + usra v1.2d, v18.2d, #32 + mul x13, x21, x12 + and v4.16b, v1.16b, v30.16b + umulh x12, x21, x12 + add v4.2d, v4.2d, v6.2d + cinv x11, x11, cc // cc = lo, ul, last + eor x13, x13, x11 + eor x12, x12, x11 + usra v16.2d, v4.2d, #32 + adds x19, x17, x20 + adc x20, x20, xzr + sli v18.2d, v4.2d, #32 + umulh x21, x7, x9 + adds x19, x19, x14 + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 
+mov x14, v5.d[1] // umulh x14, x7, x7 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x7, x7 + adcs x20, x20, x12 +mov x12, v3.d[0] // mul x12, x6, x6 + adc x21, x21, x11 +mov x11, v5.d[0] // umulh x11, x6, x6 + adds x17, x17, x17 + adcs x19, x19, x19 + usra v16.2d, v1.2d, #32 + adcs x20, x20, x20 + adcs x21, x21, x21 + adc x10, xzr, xzr +// NEON: two mul+umulhs for the next stage +uzp2 v17.4s, v21.4s, v23.4s + mul x15, x6, x7 +xtn v4.2s, v23.2d + umulh x16, x6, x7 + mov x22, v16.d[0] // umulh x22, x8, x8 + adds x11, x11, x15 + adcs x13, x13, x16 +xtn v5.2s, v21.2d + adc x14, x14, xzr + adds x11, x11, x15 +rev64 v1.4s, v21.4s + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [x0, #64] + adds x17, x17, x13 + mov x13, v18.d[1] // mul x13, x9, x9 + adcs x19, x19, x14 + mov x14, v16.d[1] // umulh x14, x9, x9 + adcs x20, x20, xzr + mov x12, v18.d[0] // mul x12, x8, x8 + adcs x21, x21, xzr + adc x10, x10, xzr +umull v6.2d, v4.2s, v5.2s + stp x17, x19, [x0, #80] +umull v7.2d, v4.2s, v17.2s + mul x15, x8, x9 +uzp2 v16.4s, v23.4s, v23.4s + umulh x16, x8, x9 +mul v0.4s, v1.4s, v23.4s + adds x11, x22, x15 + adcs x13, x13, x16 +usra v7.2d, v6.2d, #32 + adc x14, x14, xzr + adds x11, x11, x15 +umull v1.2d, v16.2s, v17.2s + adcs x13, x13, x16 + adc x14, x14, xzr +uaddlp v0.2d, v0.4s + adds x12, x12, x20 + adcs x11, x11, x21 +and v2.16b, v7.16b, v30.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x16, v0.d[1] // mul x16, x5, x9 +mov x15, v0.d[0] // mul x15, x4, x8 +usra v1.2d, v2.2d, #32 +mov x20, v1.d[0] // umulh x20, x4, x8 +mov x21, v1.d[1] // umulh x21, x5, x9 + stp x12, x11, [x0, #96] + adcs x13, x13, x10 + adc x14, x14, xzr + stp x13, x14, [x0, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + + mul x10, x2, x6 + mul x14, x3, x7 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + adcs x16, x16, x20 + adc x17, x21, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, 
xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x8 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x7, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x7 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x8, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, 
x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x8, x7 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + adds x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adc x19, xzr, xzr + +// Add it back to the buffer + + ldp x2, x3, [x0, #32] + adds x10, x10, x2 + adcs x11, x11, x3 + stp x10, x11, [x0, #32] + + ldp x2, x3, [x0, #48] + adcs x12, x12, x2 + adcs x13, x13, x3 + stp x12, x13, [x0, #48] + + ldp x2, x3, [x0, #64] + adcs x14, x14, x2 + adcs x15, x15, x3 + stp x14, x15, [x0, #64] + + ldp x2, x3, [x0, #80] + adcs x16, x16, x2 + adcs x17, x17, x3 + stp x16, x17, [x0, #80] + + ldp x2, x3, [x0, #96] + adcs x2, x2, x19 + adcs x3, x3, xzr + stp x2, x3, [x0, #96] + + ldp x2, x3, [x0, #112] + adcs x2, x2, xzr + adc x3, x3, xzr + stp x2, x3, [x0, #112] + + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/arm/proofs/bignum_emontredc_8n_neon.ml b/arm/proofs/bignum_emontredc_8n_neon.ml new file mode 100644 index 00000000..b7d99376 --- /dev/null +++ b/arm/proofs/bignum_emontredc_8n_neon.ml @@ -0,0 +1,2936 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC + *) + +(* ========================================================================= *) +(* Extended Montgomery reduction of arbitrary bignum. 
*) +(* ========================================================================= *) + +(**** print_literal_from_elf "arm/fastmul/bignum_emontredc_8n_neon.o";; + ****) + +let bignum_emontredc_8n_neon_mc = + define_assert_from_elf "bignum_emontredc_8n_neon_mc" "arm/fastmul/bignum_emontredc_8n_neon.o" +[ + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf5bf5; (* arm_STP X21 X22 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf63f7; (* arm_STP X23 X24 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf6bf9; (* arm_STP X25 X26 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf73fb; (* arm_STP X27 X28 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xd10083ff; (* arm_SUB SP SP (rvalue (word 32)) *) + 0xd342fc00; (* arm_LSR X0 X0 2 *) + 0xaa0003fa; (* arm_MOV X26 X0 *) + 0xf100040c; (* arm_SUBS X12 X0 (rvalue (word 1)) *) + 0x54006a63; (* arm_BCC (word 3404) *) + 0xa9007fe3; (* arm_STP X3 XZR SP (Immediate_Offset (iword (&0))) *) + 0xa9017ffa; (* arm_STP X26 XZR SP (Immediate_Offset (iword (&16))) *) + 0xaa1f03fc; (* arm_MOV X28 XZR *) + 0xd37be980; (* arm_LSL X0 X12 5 *) + 0xa9407fe3; (* arm_LDP X3 XZR SP (Immediate_Offset (iword (&0))) *) + 0xa9404c31; (* arm_LDP X17 X19 X1 (Immediate_Offset (iword (&0))) *) + 0xa9415434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0xa9402448; (* arm_LDP X8 X9 X2 (Immediate_Offset (iword (&0))) *) + 0xa9412c4a; (* arm_LDP X10 X11 X2 (Immediate_Offset (iword (&16))) *) + 0x3dc00455; (* arm_LDR Q21 X2 (Immediate_Offset (word 16)) *) + 0x9b037e24; (* arm_MUL X4 X17 X3 *) + 0x4e080c80; (* arm_DUP_GEN Q0 X4 *) + 0x4e805aa3; (* arm_UZP2 Q3 Q21 Q0 32 *) + 0x0ea12804; (* arm_XTN Q4 Q0 32 *) + 0x0ea12aa5; (* arm_XTN Q5 Q21 32 *) + 0x9b087c8c; (* arm_MUL X12 X4 X8 *) + 0xab0c0231; (* arm_ADDS X17 X17 X12 *) + 0x9bc87c8c; (* arm_UMULH X12 X4 X8 *) + 0x9b097c8d; (* arm_MUL X13 X4 X9 *) + 0x4ea00aa1; (* arm_REV64_VEC Q1 Q21 32 *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; 
(* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x4e805810; (* arm_UZP2 Q16 Q0 Q0 32 *) + 0x4ea09c20; (* arm_MUL_VEC Q0 Q1 Q0 32 *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c201; (* arm_UMULL_VEC Q1 Q16 Q3 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x6f6014e1; (* arm_USRA_VEC Q1 Q7 32 64 128 *) + 0x2ea58080; (* arm_UMLAL_VEC Q0 Q4 Q5 32 *) + 0x4e083c0e; (* arm_UMOV X14 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x9bc97c8d; (* arm_UMULH X13 X4 X9 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0x6f601441; (* arm_USRA_VEC Q1 Q2 32 64 128 *) + 0x4e083c2e; (* arm_UMOV X14 Q1 0 8 *) + 0xba0f02b5; (* arm_ADCS X21 X21 X15 *) + 0x4e183c2f; (* arm_UMOV X15 Q1 1 8 *) + 0x9a1f03f6; (* arm_ADC X22 XZR XZR *) + 0xab0c0273; (* arm_ADDS X19 X19 X12 *) + 0x9b037e65; (* arm_MUL X5 X19 X3 *) + 0xba0d0294; (* arm_ADCS X20 X20 X13 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0x9a0f02d6; (* arm_ADC X22 X22 X15 *) + 0x4e080ca0; (* arm_DUP_GEN Q0 X5 *) + 0x4e805aa3; (* arm_UZP2 Q3 Q21 Q0 32 *) + 0x0ea12804; (* arm_XTN Q4 Q0 32 *) + 0x0ea12aa5; (* arm_XTN Q5 Q21 32 *) + 0x9b087cac; (* arm_MUL X12 X5 X8 *) + 0xab0c0273; (* arm_ADDS X19 X19 X12 *) + 0x9bc87cac; (* arm_UMULH X12 X5 X8 *) + 0x9b097cad; (* arm_MUL X13 X5 X9 *) + 0x4ea00aa1; (* arm_REV64_VEC Q1 Q21 32 *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; (* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x4e805810; (* arm_UZP2 Q16 Q0 Q0 32 *) + 0x4ea09c20; (* arm_MUL_VEC Q0 Q1 Q0 32 *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c201; (* arm_UMULL_VEC Q1 Q16 Q3 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 
64 *) + 0x6f6014e1; (* arm_USRA_VEC Q1 Q7 32 64 128 *) + 0x2ea58080; (* arm_UMLAL_VEC Q0 Q4 Q5 32 *) + 0x4e083c0e; (* arm_UMOV X14 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0xba0d0294; (* arm_ADCS X20 X20 X13 *) + 0x9bc97cad; (* arm_UMULH X13 X5 X9 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0x6f601441; (* arm_USRA_VEC Q1 Q2 32 64 128 *) + 0x4e083c2e; (* arm_UMOV X14 Q1 0 8 *) + 0xba0f02d6; (* arm_ADCS X22 X22 X15 *) + 0x4e183c2f; (* arm_UMOV X15 Q1 1 8 *) + 0x9a1f03f7; (* arm_ADC X23 XZR XZR *) + 0xab0c0294; (* arm_ADDS X20 X20 X12 *) + 0x9b037e86; (* arm_MUL X6 X20 X3 *) + 0x4e080cc0; (* arm_DUP_GEN Q0 X6 *) + 0x4e955803; (* arm_UZP2 Q3 Q0 Q21 32 *) + 0x0ea12aa4; (* arm_XTN Q4 Q21 32 *) + 0x0ea12805; (* arm_XTN Q5 Q0 32 *) + 0xba0d02b5; (* arm_ADCS X21 X21 X13 *) + 0xba0e02d6; (* arm_ADCS X22 X22 X14 *) + 0x9a0f02f7; (* arm_ADC X23 X23 X15 *) + 0xa9001424; (* arm_STP X4 X5 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc00034; (* arm_LDR Q20 X1 (Immediate_Offset (word 0)) *) + 0x3dc00856; (* arm_LDR Q22 X2 (Immediate_Offset (word 32)) *) + 0x3dc00c57; (* arm_LDR Q23 X2 (Immediate_Offset (word 48)) *) + 0x4ea00801; (* arm_REV64_VEC Q1 Q0 32 *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; (* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x4e955ab0; (* arm_UZP2 Q16 Q21 Q21 32 *) + 0x9b087ccc; (* arm_MUL X12 X6 X8 *) + 0xab0c0294; (* arm_ADDS X20 X20 X12 *) + 0x4eb59c20; (* arm_MUL_VEC Q0 Q1 Q21 32 *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c201; (* arm_UMULL_VEC Q1 Q16 Q3 32 *) + 0x9bc87ccc; (* arm_UMULH X12 X6 X8 *) + 0x9b097ccd; (* arm_MUL X13 X6 X9 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0xba0d02b5; (* arm_ADCS X21 X21 X13 *) + 0x9bc97ccd; (* arm_UMULH X13 X6 X9 *) + 0x6f6014e1; (* arm_USRA_VEC Q1 Q7 32 64 128 *) + 0x2ea58080; (* arm_UMLAL_VEC Q0 Q4 
Q5 32 *) + 0x4e083c0e; (* arm_UMOV X14 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0x6f601441; (* arm_USRA_VEC Q1 Q2 32 64 128 *) + 0xba0e02d6; (* arm_ADCS X22 X22 X14 *) + 0xba0f02f7; (* arm_ADCS X23 X23 X15 *) + 0x4e083c2e; (* arm_UMOV X14 Q1 0 8 *) + 0x4e183c2f; (* arm_UMOV X15 Q1 1 8 *) + 0x9a1f03f8; (* arm_ADC X24 XZR XZR *) + 0xab0c02b5; (* arm_ADDS X21 X21 X12 *) + 0x9b037ea7; (* arm_MUL X7 X21 X3 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0e02f7; (* arm_ADCS X23 X23 X14 *) + 0x9a0f0318; (* arm_ADC X24 X24 X15 *) + 0xa9011c26; (* arm_STP X6 X7 X1 (Immediate_Offset (iword (&16))) *) + 0x3dc00435; (* arm_LDR Q21 X1 (Immediate_Offset (word 16)) *) + 0x4e945ac3; (* arm_UZP2 Q3 Q22 Q20 32 *) + 0x0ea12a84; (* arm_XTN Q4 Q20 32 *) + 0x9b087cec; (* arm_MUL X12 X7 X8 *) + 0x9b097ced; (* arm_MUL X13 X7 X9 *) + 0x0ea12ac5; (* arm_XTN Q5 Q22 32 *) + 0x4ea00ac1; (* arm_REV64_VEC Q1 Q22 32 *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; (* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x9b0a7cee; (* arm_MUL X14 X7 X10 *) + 0x9b0b7cef; (* arm_MUL X15 X7 X11 *) + 0x4e945a90; (* arm_UZP2 Q16 Q20 Q20 32 *) + 0x4eb49c20; (* arm_MUL_VEC Q0 Q1 Q20 32 *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c219; (* arm_UMULL_VEC Q25 Q16 Q3 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0xab0c02b5; (* arm_ADDS X21 X21 X12 *) + 0x9bc87cec; (* arm_UMULH X12 X7 X8 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0x9bc97ced; (* arm_UMULH X13 X7 X9 *) + 0x4f605418; (* arm_SHL_VEC Q24 Q0 32 64 *) + 0x6f6014f9; (* arm_USRA_VEC Q25 Q7 32 64 128 *) + 0x2ea58098; (* arm_UMLAL_VEC Q24 Q4 Q5 32 *) + 0x6f601459; (* arm_USRA_VEC Q25 Q2 32 64 128 *) + 0xba0e02f7; (* arm_ADCS X23 X23 X14 *) + 0x9bca7cee; (* arm_UMULH X14 X7 X10 *) + 0xba0f0318; (* arm_ADCS X24 X24 X15 *) + 0x9bcb7cef; (* arm_UMULH X15 X7 X11 *) + 0x4e955ae3; (* arm_UZP2 Q3 
Q23 Q21 32 *) + 0x0ea12aa4; (* arm_XTN Q4 Q21 32 *) + 0x0ea12ae5; (* arm_XTN Q5 Q23 32 *) + 0x4ea00ae1; (* arm_REV64_VEC Q1 Q23 32 *) + 0xa9422448; (* arm_LDP X8 X9 X2 (Immediate_Offset (iword (&32))) *) + 0xa9432c4a; (* arm_LDP X10 X11 X2 (Immediate_Offset (iword (&48))) *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; (* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x4e955ab0; (* arm_UZP2 Q16 Q21 Q21 32 *) + 0x4eb59c20; (* arm_MUL_VEC Q0 Q1 Q21 32 *) + 0x9a1f03f9; (* arm_ADC X25 XZR XZR *) + 0xab0c02cc; (* arm_ADDS X12 X22 X12 *) + 0xba0d02ed; (* arm_ADCS X13 X23 X13 *) + 0xba0e030e; (* arm_ADCS X14 X24 X14 *) + 0x9a0f032f; (* arm_ADC X15 X25 X15 *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c21b; (* arm_UMULL_VEC Q27 Q16 Q3 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f60541a; (* arm_SHL_VEC Q26 Q0 32 64 *) + 0x6f6014fb; (* arm_USRA_VEC Q27 Q7 32 64 128 *) + 0x2ea5809a; (* arm_UMLAL_VEC Q26 Q4 Q5 32 *) + 0x6f60145b; (* arm_USRA_VEC Q27 Q2 32 64 128 *) + 0xb4005120; (* arm_CBZ X0 (word 2596) *) + 0xaa0003fb; (* arm_MOV X27 X0 *) + 0xf100801f; (* arm_CMP X0 (rvalue (word 32)) *) + 0x540011c1; (* arm_BNE (word 568) *) + 0x91008042; (* arm_ADD X2 X2 (rvalue (word 32)) *) + 0x91008021; (* arm_ADD X1 X1 (rvalue (word 32)) *) + 0x9b087c91; (* arm_MUL X17 X4 X8 *) + 0x9b097cb6; (* arm_MUL X22 X5 X9 *) + 0x9b0a7cd7; (* arm_MUL X23 X6 X10 *) + 0x9b0b7cf8; (* arm_MUL X24 X7 X11 *) + 0x9bc87c90; (* arm_UMULH X16 X4 X8 *) + 0xab1002d6; (* arm_ADDS X22 X22 X16 *) + 0x9bc97cb0; (* arm_UMULH X16 X5 X9 *) + 0xba1002f7; (* arm_ADCS X23 X23 X16 *) + 0x9bca7cd0; (* arm_UMULH X16 X6 X10 *) + 0xba100318; (* arm_ADCS X24 X24 X16 *) + 0x9bcb7cf0; (* arm_UMULH X16 X7 X11 *) + 0x9a1f0219; (* arm_ADC X25 X16 XZR *) + 0xa9405434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&0))) *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 
0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xa9415434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0x9a1f03f0; (* arm_ADC X16 XZR XZR *) + 0xab1102d3; (* arm_ADDS X19 X22 X17 *) + 0xba1602f6; (* arm_ADCS X22 X23 X22 *) + 0xba170317; (* arm_ADCS X23 X24 X23 *) + 0xba180338; (* arm_ADCS X24 X25 X24 *) + 0x9a1903f9; (* arm_ADC X25 XZR X25 *) + 0xab1102d4; (* arm_ADDS X20 X22 X17 *) + 0xba1302f5; (* arm_ADCS X21 X23 X19 *) + 0xba160316; (* arm_ADCS X22 X24 X22 *) + 0xba170337; (* arm_ADCS X23 X25 X23 *) + 0xba1803f8; (* arm_ADCS X24 XZR X24 *) + 0x9a1903f9; (* arm_ADC X25 XZR X25 *) + 0xab0c0231; (* arm_ADDS X17 X17 X12 *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xba0f02b5; (* arm_ADCS X21 X21 X15 *) + 0xba1002d6; (* arm_ADCS X22 X22 X16 *) + 0xba1f02f7; (* arm_ADCS X23 X23 XZR *) + 0xba1f0318; (* arm_ADCS X24 X24 XZR *) + 0x9a1f0339; (* arm_ADC X25 X25 XZR *) + 0xeb0700cf; (* arm_SUBS X15 X6 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb0a016d; (* arm_SUBS X13 X11 X10 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02f7; (* arm_ADCS X23 X23 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d0318; (* arm_ADCS X24 X24 X13 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb05008f; (* arm_SUBS X15 X4 X5 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08012d; (* arm_SUBS X13 X9 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* 
arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d0294; (* arm_ADCS X20 X20 X13 *) + 0xba0c02b5; (* arm_ADCS X21 X21 X12 *) + 0xba0c02d6; (* arm_ADCS X22 X22 X12 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb0700af; (* arm_SUBS X15 X5 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb09016d; (* arm_SUBS X13 X11 X9 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02d6; (* arm_ADCS X22 X22 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02f7; (* arm_ADCS X23 X23 X13 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb06008f; (* arm_SUBS X15 X4 X6 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08014d; (* arm_SUBS X13 X10 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02b5; (* arm_ADCS X21 X21 X13 *) + 0xba0c02d6; (* arm_ADCS X22 X22 X12 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb07008f; (* arm_SUBS X15 X4 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08016d; (* arm_SUBS X13 X11 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 
Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb0600af; (* arm_SUBS X15 X5 X6 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb09014d; (* arm_SUBS X13 X10 X9 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0c02ed; (* arm_ADCS X13 X23 X12 *) + 0xba0c030e; (* arm_ADCS X14 X24 X12 *) + 0x9a0c032f; (* arm_ADC X15 X25 X12 *) + 0xaa1603ec; (* arm_MOV X12 X22 *) + 0xa9004c31; (* arm_STP X17 X19 X1 (Immediate_Offset (iword (&0))) *) + 0xa9015434; (* arm_STP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0xd100837b; (* arm_SUB X27 X27 (rvalue (word 32)) *) + 0x140001f9; (* arm_B (word 2020) *) + 0x4e083f30; (* arm_UMOV X16 Q25 0 8 *) + 0x4e183f16; (* arm_UMOV X22 Q24 1 8 *) + 0x4e183f34; (* arm_UMOV X20 Q25 1 8 *) + 0x4e083f57; (* arm_UMOV X23 Q26 0 8 *) + 0x4e083f75; (* arm_UMOV X21 Q27 0 8 *) + 0x4e183f58; (* arm_UMOV X24 Q26 1 8 *) + 0x4e183f63; (* arm_UMOV X3 Q27 1 8 *) + 0x4e083f11; (* arm_UMOV X17 Q24 0 8 *) + 0xab1002d6; (* arm_ADDS X22 X22 X16 *) + 0xba1402f7; (* arm_ADCS X23 X23 X20 *) + 0xba150318; (* arm_ADCS X24 X24 X21 *) + 0x9a1f0079; (* arm_ADC X25 X3 XZR *) + 0x3dc01056; (* arm_LDR Q22 X2 (Immediate_Offset (word 64)) *) 
+ 0x3dc01457; (* arm_LDR Q23 X2 (Immediate_Offset (word 80)) *) + 0x91008042; (* arm_ADD X2 X2 (rvalue (word 32)) *) + 0x91008021; (* arm_ADD X1 X1 (rvalue (word 32)) *) + 0x4e945ac3; (* arm_UZP2 Q3 Q22 Q20 32 *) + 0x0ea12a84; (* arm_XTN Q4 Q20 32 *) + 0x0ea12ac5; (* arm_XTN Q5 Q22 32 *) + 0x4ea00ac1; (* arm_REV64_VEC Q1 Q22 32 *) + 0xa9405434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&0))) *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xa9415434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; (* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x4e945a90; (* arm_UZP2 Q16 Q20 Q20 32 *) + 0x4eb49c20; (* arm_MUL_VEC Q0 Q1 Q20 32 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0x9a1f03f0; (* arm_ADC X16 XZR XZR *) + 0xab1102d3; (* arm_ADDS X19 X22 X17 *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c219; (* arm_UMULL_VEC Q25 Q16 Q3 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0xba1602f6; (* arm_ADCS X22 X23 X22 *) + 0xba170317; (* arm_ADCS X23 X24 X23 *) + 0xba180338; (* arm_ADCS X24 X25 X24 *) + 0x9a1903f9; (* arm_ADC X25 XZR X25 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f605418; (* arm_SHL_VEC Q24 Q0 32 64 *) + 0x6f6014f9; (* arm_USRA_VEC Q25 Q7 32 64 128 *) + 0xab1102d4; (* arm_ADDS X20 X22 X17 *) + 0xba1302f5; (* arm_ADCS X21 X23 X19 *) + 0xba160316; (* arm_ADCS X22 X24 X22 *) + 0xba170337; (* arm_ADCS X23 X25 X23 *) + 0x2ea58098; (* arm_UMLAL_VEC Q24 Q4 Q5 32 *) + 0x6f601459; (* arm_USRA_VEC Q25 Q2 32 64 128 *) + 0xba1803f8; (* arm_ADCS X24 XZR X24 *) + 0x9a1903f9; (* arm_ADC X25 XZR X25 *) + 0xab0c0231; (* arm_ADDS X17 X17 X12 *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x4e955ae3; (* arm_UZP2 Q3 Q23 Q21 32 *) + 0x0ea12aa4; (* arm_XTN Q4 Q21 32 *) + 0x0ea12ae5; (* arm_XTN Q5 Q23 32 *) + 0x4ea00ae1; (* 
arm_REV64_VEC Q1 Q23 32 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xba0f02b5; (* arm_ADCS X21 X21 X15 *) + 0xba1002d6; (* arm_ADCS X22 X22 X16 *) + 0xba1f02f7; (* arm_ADCS X23 X23 XZR *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; (* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x4e955ab0; (* arm_UZP2 Q16 Q21 Q21 32 *) + 0x4eb59c20; (* arm_MUL_VEC Q0 Q1 Q21 32 *) + 0xba1f0318; (* arm_ADCS X24 X24 XZR *) + 0x9a1f0339; (* arm_ADC X25 X25 XZR *) + 0xeb0700cf; (* arm_SUBS X15 X6 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c21b; (* arm_UMULL_VEC Q27 Q16 Q3 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb0a016d; (* arm_SUBS X13 X11 X10 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f60541a; (* arm_SHL_VEC Q26 Q0 32 64 *) + 0x6f6014fb; (* arm_USRA_VEC Q27 Q7 32 64 128 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0x2ea5809a; (* arm_UMLAL_VEC Q26 Q4 Q5 32 *) + 0x6f60145b; (* arm_USRA_VEC Q27 Q2 32 64 128 *) + 0xba0e02f7; (* arm_ADCS X23 X23 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d0318; (* arm_ADCS X24 X24 X13 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb05008f; (* arm_SUBS X15 X4 X5 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08012d; (* arm_SUBS X13 X9 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e0273; (* 
arm_ADCS X19 X19 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d0294; (* arm_ADCS X20 X20 X13 *) + 0xba0c02b5; (* arm_ADCS X21 X21 X12 *) + 0xba0c02d6; (* arm_ADCS X22 X22 X12 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xa9004c31; (* arm_STP X17 X19 X1 (Immediate_Offset (iword (&0))) *) + 0x4e083f30; (* arm_UMOV X16 Q25 0 8 *) + 0x4e083f7a; (* arm_UMOV X26 Q27 0 8 *) + 0x4e183f23; (* arm_UMOV X3 Q25 1 8 *) + 0x4e183f71; (* arm_UMOV X17 Q27 1 8 *) + 0xeb0700af; (* arm_SUBS X15 X5 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb09016d; (* arm_SUBS X13 X11 X9 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02d6; (* arm_ADCS X22 X22 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02f7; (* arm_ADCS X23 X23 X13 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb06008f; (* arm_SUBS X15 X4 X6 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08014d; (* arm_SUBS X13 X10 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02b5; (* arm_ADCS X21 X21 X13 *) + 0xba0c02d6; (* arm_ADCS X22 X22 X12 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb07008f; (* arm_SUBS X15 X4 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 
Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08016d; (* arm_SUBS X13 X11 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb0600af; (* arm_SUBS X15 X5 X6 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb09014d; (* arm_SUBS X13 X10 X9 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0xa9015434; (* arm_STP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0x4e183f14; (* arm_UMOV X20 Q24 1 8 *) + 0x4e083f55; (* arm_UMOV X21 Q26 0 8 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0c02ed; (* arm_ADCS X13 X23 X12 *) + 0xba0c030e; (* arm_ADCS X14 X24 X12 *) + 0x9a0c032f; (* arm_ADC X15 X25 X12 *) + 0xaa1603ec; (* arm_MOV X12 X22 *) + 0x4e183f58; (* arm_UMOV X24 Q26 1 8 *) + 0xd100837b; (* arm_SUB X27 X27 (rvalue (word 32)) *) + 0xf100837f; (* arm_CMP X27 (rvalue (word 32)) *) + 0x540016e0; (* arm_BEQ (word 732) *) + 0xa9422448; (* arm_LDP X8 X9 X2 (Immediate_Offset (iword (&32))) *) + 0xa9432c4a; (* arm_LDP X10 X11 X2 (Immediate_Offset (iword (&48))) *) + 0x3dc01056; (* arm_LDR Q22 X2 (Immediate_Offset (word 64)) *) + 0x3dc01457; (* arm_LDR Q23 X2 (Immediate_Offset (word 80)) *) + 0x91008042; (* arm_ADD X2 X2 (rvalue 
(word 32)) *) + 0x91008021; (* arm_ADD X1 X1 (rvalue (word 32)) *) + 0xab100296; (* arm_ADDS X22 X20 X16 *) + 0xba0302b7; (* arm_ADCS X23 X21 X3 *) + 0xba1a0318; (* arm_ADCS X24 X24 X26 *) + 0x9a1f0239; (* arm_ADC X25 X17 XZR *) + 0x4e083f11; (* arm_UMOV X17 Q24 0 8 *) + 0x4e945ac3; (* arm_UZP2 Q3 Q22 Q20 32 *) + 0x0ea12a84; (* arm_XTN Q4 Q20 32 *) + 0x0ea12ac5; (* arm_XTN Q5 Q22 32 *) + 0x4ea00ac1; (* arm_REV64_VEC Q1 Q22 32 *) + 0xa9405434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&0))) *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xa9415434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; (* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x4e945a90; (* arm_UZP2 Q16 Q20 Q20 32 *) + 0x4eb49c20; (* arm_MUL_VEC Q0 Q1 Q20 32 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0x9a1f03f0; (* arm_ADC X16 XZR XZR *) + 0xab1102d3; (* arm_ADDS X19 X22 X17 *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c219; (* arm_UMULL_VEC Q25 Q16 Q3 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0xba1602f6; (* arm_ADCS X22 X23 X22 *) + 0xba170317; (* arm_ADCS X23 X24 X23 *) + 0xba180338; (* arm_ADCS X24 X25 X24 *) + 0x9a1903f9; (* arm_ADC X25 XZR X25 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f605418; (* arm_SHL_VEC Q24 Q0 32 64 *) + 0x6f6014f9; (* arm_USRA_VEC Q25 Q7 32 64 128 *) + 0xab1102d4; (* arm_ADDS X20 X22 X17 *) + 0xba1302f5; (* arm_ADCS X21 X23 X19 *) + 0xba160316; (* arm_ADCS X22 X24 X22 *) + 0xba170337; (* arm_ADCS X23 X25 X23 *) + 0x2ea58098; (* arm_UMLAL_VEC Q24 Q4 Q5 32 *) + 0x6f601459; (* arm_USRA_VEC Q25 Q2 32 64 128 *) + 0xba1803f8; (* arm_ADCS X24 XZR X24 *) + 0x9a1903f9; (* arm_ADC X25 XZR X25 *) + 0xab0c0231; (* arm_ADDS X17 X17 X12 *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x4e955ae3; (* arm_UZP2 Q3 Q23 Q21 
32 *) + 0x0ea12aa4; (* arm_XTN Q4 Q21 32 *) + 0x0ea12ae5; (* arm_XTN Q5 Q23 32 *) + 0x4ea00ae1; (* arm_REV64_VEC Q1 Q23 32 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xba0f02b5; (* arm_ADCS X21 X21 X15 *) + 0xba1002d6; (* arm_ADCS X22 X22 X16 *) + 0xba1f02f7; (* arm_ADCS X23 X23 XZR *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0x2ea3c087; (* arm_UMULL_VEC Q7 Q4 Q3 32 *) + 0x4e955ab0; (* arm_UZP2 Q16 Q21 Q21 32 *) + 0x4eb59c20; (* arm_MUL_VEC Q0 Q1 Q21 32 *) + 0xba1f0318; (* arm_ADCS X24 X24 XZR *) + 0x9a1f0339; (* arm_ADC X25 X25 XZR *) + 0xeb0700cf; (* arm_SUBS X15 X6 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0x6f00e5e2; (* arm_MOVI Q2 (word 4294967295) *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x2ea3c21b; (* arm_UMULL_VEC Q27 Q16 Q3 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb0a016d; (* arm_SUBS X13 X11 X10 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x4e221ce2; (* arm_AND_VEC Q2 Q7 Q2 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f60541a; (* arm_SHL_VEC Q26 Q0 32 64 *) + 0x6f6014fb; (* arm_USRA_VEC Q27 Q7 32 64 128 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0x2ea5809a; (* arm_UMLAL_VEC Q26 Q4 Q5 32 *) + 0x6f60145b; (* arm_USRA_VEC Q27 Q2 32 64 128 *) + 0xba0e02f7; (* arm_ADCS X23 X23 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d0318; (* arm_ADCS X24 X24 X13 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb05008f; (* arm_SUBS X15 X4 X5 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08012d; (* arm_SUBS X13 X9 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 
0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d0294; (* arm_ADCS X20 X20 X13 *) + 0xba0c02b5; (* arm_ADCS X21 X21 X12 *) + 0xba0c02d6; (* arm_ADCS X22 X22 X12 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xa9004c31; (* arm_STP X17 X19 X1 (Immediate_Offset (iword (&0))) *) + 0x4e083f30; (* arm_UMOV X16 Q25 0 8 *) + 0x4e083f7a; (* arm_UMOV X26 Q27 0 8 *) + 0x4e183f23; (* arm_UMOV X3 Q25 1 8 *) + 0x4e183f71; (* arm_UMOV X17 Q27 1 8 *) + 0xeb0700af; (* arm_SUBS X15 X5 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb09016d; (* arm_SUBS X13 X11 X9 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02d6; (* arm_ADCS X22 X22 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02f7; (* arm_ADCS X23 X23 X13 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb06008f; (* arm_SUBS X15 X4 X6 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08014d; (* arm_SUBS X13 X10 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02b5; (* arm_ADCS X21 X21 X13 *) + 0xba0c02d6; (* arm_ADCS X22 X22 X12 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 
0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb07008f; (* arm_SUBS X15 X4 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08016d; (* arm_SUBS X13 X11 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb0600af; (* arm_SUBS X15 X5 X6 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb09014d; (* arm_SUBS X13 X10 X9 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0xa9015434; (* arm_STP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0x4e183f14; (* arm_UMOV X20 Q24 1 8 *) + 0x4e083f55; (* arm_UMOV X21 Q26 0 8 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0c02ed; (* arm_ADCS X13 X23 X12 *) + 0xba0c030e; (* arm_ADCS X14 X24 X12 *) + 0x9a0c032f; (* arm_ADC X15 X25 X12 *) + 0xaa1603ec; (* arm_MOV X12 X22 *) + 0x4e183f58; (* arm_UMOV X24 Q26 1 8 *) + 0xd100837b; (* arm_SUB X27 X27 (rvalue (word 32)) *) + 0xf100837f; (* arm_CMP X27 (rvalue (word 32)) *) + 0x54ffe961; (* arm_BNE (word 2096428) *) + 0xa9422448; (* arm_LDP X8 X9 X2 (Immediate_Offset (iword (&32))) *) + 0xa9432c4a; (* arm_LDP X10 X11 X2 (Immediate_Offset (iword (&48))) *) + 0x91008042; (* arm_ADD X2 X2 (rvalue (word 32)) *) + 
0x91008021; (* arm_ADD X1 X1 (rvalue (word 32)) *) + 0xab100296; (* arm_ADDS X22 X20 X16 *) + 0xba0302b7; (* arm_ADCS X23 X21 X3 *) + 0xba1a0318; (* arm_ADCS X24 X24 X26 *) + 0x9a1f0239; (* arm_ADC X25 X17 XZR *) + 0x4e083f11; (* arm_UMOV X17 Q24 0 8 *) + 0xa9405434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&0))) *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xa9415434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0x9a1f03f0; (* arm_ADC X16 XZR XZR *) + 0xab1102d3; (* arm_ADDS X19 X22 X17 *) + 0xba1602f6; (* arm_ADCS X22 X23 X22 *) + 0xba170317; (* arm_ADCS X23 X24 X23 *) + 0xba180338; (* arm_ADCS X24 X25 X24 *) + 0x9a1903f9; (* arm_ADC X25 XZR X25 *) + 0xab1102d4; (* arm_ADDS X20 X22 X17 *) + 0xba1302f5; (* arm_ADCS X21 X23 X19 *) + 0xba160316; (* arm_ADCS X22 X24 X22 *) + 0xba170337; (* arm_ADCS X23 X25 X23 *) + 0xba1803f8; (* arm_ADCS X24 XZR X24 *) + 0x9a1903f9; (* arm_ADC X25 XZR X25 *) + 0xab0c0231; (* arm_ADDS X17 X17 X12 *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xba0f02b5; (* arm_ADCS X21 X21 X15 *) + 0xba1002d6; (* arm_ADCS X22 X22 X16 *) + 0xba1f02f7; (* arm_ADCS X23 X23 XZR *) + 0xba1f0318; (* arm_ADCS X24 X24 XZR *) + 0x9a1f0339; (* arm_ADC X25 X25 XZR *) + 0xeb0700cf; (* arm_SUBS X15 X6 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb0a016d; (* arm_SUBS X13 X11 X10 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02f7; (* arm_ADCS X23 X23 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d0318; (* arm_ADCS X24 X24 X13 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 
0xeb05008f; (* arm_SUBS X15 X4 X5 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08012d; (* arm_SUBS X13 X9 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d0294; (* arm_ADCS X20 X20 X13 *) + 0xba0c02b5; (* arm_ADCS X21 X21 X12 *) + 0xba0c02d6; (* arm_ADCS X22 X22 X12 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb0700af; (* arm_SUBS X15 X5 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb09016d; (* arm_SUBS X13 X11 X9 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02d6; (* arm_ADCS X22 X22 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02f7; (* arm_ADCS X23 X23 X13 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb06008f; (* arm_SUBS X15 X4 X6 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08014d; (* arm_SUBS X13 X10 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02b5; (* arm_ADCS X21 X21 X13 *) + 
0xba0c02d6; (* arm_ADCS X22 X22 X12 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb07008f; (* arm_SUBS X15 X4 X7 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb08016d; (* arm_SUBS X13 X11 X8 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0c02f7; (* arm_ADCS X23 X23 X12 *) + 0xba0c0318; (* arm_ADCS X24 X24 X12 *) + 0x9a0c0339; (* arm_ADC X25 X25 X12 *) + 0xeb0600af; (* arm_SUBS X15 X5 X6 *) + 0xda8f25ef; (* arm_CNEG X15 X15 Condition_CC *) + 0xda9f23ec; (* arm_CSETM X12 Condition_CC *) + 0xeb09014d; (* arm_SUBS X13 X10 X9 *) + 0xda8d25ad; (* arm_CNEG X13 X13 Condition_CC *) + 0x9b0d7dee; (* arm_MUL X14 X15 X13 *) + 0x9bcd7ded; (* arm_UMULH X13 X15 X13 *) + 0xda8c218c; (* arm_CINV X12 X12 Condition_CC *) + 0xb100059f; (* arm_CMN X12 (rvalue (word 1)) *) + 0xca0c01ce; (* arm_EOR X14 X14 X12 *) + 0xba0e02b5; (* arm_ADCS X21 X21 X14 *) + 0xca0c01ad; (* arm_EOR X13 X13 X12 *) + 0xba0d02d6; (* arm_ADCS X22 X22 X13 *) + 0xba0c02ed; (* arm_ADCS X13 X23 X12 *) + 0xba0c030e; (* arm_ADCS X14 X24 X12 *) + 0x9a0c032f; (* arm_ADC X15 X25 X12 *) + 0xaa1603ec; (* arm_MOV X12 X22 *) + 0xa9004c31; (* arm_STP X17 X19 X1 (Immediate_Offset (iword (&0))) *) + 0xa9015434; (* arm_STP X20 X21 X1 (Immediate_Offset (iword (&16))) *) + 0xf101037b; (* arm_SUBS X27 X27 (rvalue (word 64)) *) + 0xa9424c31; (* arm_LDP X17 X19 X1 (Immediate_Offset (iword (&32))) *) + 0xa9435434; (* arm_LDP X20 X21 X1 (Immediate_Offset (iword (&48))) *) + 0xa9417ffa; (* arm_LDP X26 XZR SP (Immediate_Offset (iword (&16))) *) + 
0xab1c039f; (* arm_CMN X28 X28 *) + 0xba0c0231; (* arm_ADCS X17 X17 X12 *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0xba0e0294; (* arm_ADCS X20 X20 X14 *) + 0xba0f02b5; (* arm_ADCS X21 X21 X15 *) + 0xda9f33fc; (* arm_CSETM X28 Condition_CS *) + 0xa9024c31; (* arm_STP X17 X19 X1 (Immediate_Offset (iword (&32))) *) + 0xa9035434; (* arm_STP X20 X21 X1 (Immediate_Offset (iword (&48))) *) + 0xcb000021; (* arm_SUB X1 X1 X0 *) + 0xcb000042; (* arm_SUB X2 X2 X0 *) + 0x91008021; (* arm_ADD X1 X1 (rvalue (word 32)) *) + 0xf100075a; (* arm_SUBS X26 X26 (rvalue (word 1)) *) + 0xa9017ffa; (* arm_STP X26 XZR SP (Immediate_Offset (iword (&16))) *) + 0x54ff9681; (* arm_BNE (word 2093776) *) + 0xcb1c03e0; (* arm_NEG X0 X28 *) + 0x910083ff; (* arm_ADD SP SP (rvalue (word 32)) *) + 0xa8c173fb; (* arm_LDP X27 X28 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c16bf9; (* arm_LDP X25 X26 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c163f7; (* arm_LDP X23 X24 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c15bf5; (* arm_LDP X21 X22 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0 (* arm_RET X30 *) +];; + +let BIGNUM_EMONTREDC_8N_NEON_EXEC = ARM_MK_EXEC_RULE bignum_emontredc_8n_neon_mc;; + +(* ------------------------------------------------------------------------- *) +(* Proof. *) +(* ------------------------------------------------------------------------- *) + +(*** Lemma to justify zeros in the Montgomery steps ***) + +let montgomery_lemma = prove + (`!w n. + (n * w + 1 == 0) (mod (2 EXP 64)) + ==> !h l x. + &2 pow 64 * &h + &l:real = + &(val (word(x * w):int64)) * + &(val(word(bigdigit n 0):int64)) + ==> !h' l'. 
&2 pow 64 * &h' + &(val l'):real = &x + &l + ==> val(l':int64) = 0`, + REPEAT GEN_TAC THEN DISCH_TAC THEN REPEAT GEN_TAC THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN + REWRITE_TAC[VAL_WORD_ZX_GEN; VAL_WORD; GSYM LOWDIGITS_1; lowdigits] THEN + REPEAT STRIP_TAC THEN ONCE_REWRITE_TAC[GSYM VAL_MOD_REFL] THEN + REPEAT(FIRST_X_ASSUM(MP_TAC o AP_TERM `\x. x MOD 2 EXP 64`)) THEN + REWRITE_TAC[MOD_MULT_ADD; DIMINDEX_128; DIMINDEX_64; MULT_CLAUSES] THEN + REWRITE_TAC[MOD_MOD_EXP_MIN] THEN + REWRITE_TAC[ARITH_RULE `MIN 64 64 = 64 /\ MIN 128 64 = 64`] THEN + CONV_TAC MOD_DOWN_CONV THEN REWRITE_TAC[GSYM CONG; GSYM DIVIDES_MOD] THEN + POP_ASSUM MP_TAC THEN SPEC_TAC(`2 EXP 64`,`p:num`) THEN + CONV_TAC NUMBER_RULE);; + +(*** Lemmas for the case splits in the ADK blocks ***) + +(* lemma1: the nested conditional that yields either word 0 or the all-ones word (18446744073709551615 = 2^64 - 1), optionally complemented, equals word_neg(word(bitval(y0 <= y1 <=> x0 < x1))). *) +let lemma1 = prove + (`!(x0:num) x1 (y0:num) y1. + (if y0 <= y1 + then if x1 <= x0 then word 0 else word 18446744073709551615 + else word_not + (if x1 <= x0 then word 0 else word 18446744073709551615)):int64 = + word_neg(word(bitval(y0 <= y1 <=> x0 < x1)))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + CONV_TAC WORD_REDUCE_CONV);; + +(* lemma2: over the reals, the product of the two conditionally negated 64-bit differences (the absolute differences of x0,x1 and of y1,y0) equals (-1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) times (x0 - x1) * (y1 - y0). *) +let lemma2 = prove + (`!(x0:int64) (x1:int64) (y0:int64) (y1:int64). 
+ &(val(if val x1 <= val x0 then word_sub x0 x1 + else word_neg (word_sub x0 x1))) * + &(val(if val y0 <= val y1 then word_sub y1 y0 + else word_neg (word_sub y1 y0))):real = + --(&1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) * + (&(val x0) - &(val x1)) * (&(val y1) - &(val y0))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE; WORD_NEG_SUB] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o MATCH_MP (ARITH_RULE + `~(m:num <= n) ==> n <= m /\ ~(m <= n)`))) THEN + ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN + REAL_ARITH_TAC);; + +(*** Load helpful lemmas and tactics for NEON ***) + +needs "arm/proofs/neon_helper.ml";; + +(*** Define the inner-loop invariant and a few helper functions ***) + +(* Inner-loop invariant, written as a lambda over the iteration index i and the machine state s. The remaining variables (z, m, k, k4, n, w, q, a, cout, wouter, stackpointer) are free in this term. *) +let inner_loop_invariant = + `\i s. read X1 s = word_sub (word_add z (word(32 * i))) (word 32) /\ + read X2 s = word_sub (word_add m (word(32 * i))) (word 32) /\ + bignum_from_memory(m,k) s = n /\ + read X0 s = word (32 * (k4 - 1)) /\ + read SP s = word_sub stackpointer (word 32) /\ + read (memory :> bytes64 (word_sub stackpointer (word 32))) s = word w /\ + read (memory :> + bytes64 (word_add (word_sub stackpointer (word 32)) (word 16))) s = + wouter /\ + read X28 s = word_neg(word cout) /\ + bignum_from_memory (z,4) s = q /\ + read X4 s = word (bigdigit q 0) /\ + read X5 s = word (bigdigit q 1) /\ + read X6 s = word (bigdigit q 2) /\ + read X7 s = word (bigdigit q 3) /\ + bignum_from_memory (word_add z (word (8 * 4 * i)), + (k + 4) - 4 * i) s = + highdigits a (4 * i) /\ + + // induction variable + read X27 s = word (32 * (k4 - i)) /\ + + // two vector regs read during outerloop + read Q20 s = word_join + (word(bigdigit q 1):(64)word) (word(bigdigit q 0):(64)word) /\ + read Q21 s = word_join + (word(bigdigit q 3):(64)word) (word(bigdigit q 2):(64)word) /\ + + // pre-calculated multiplications + read X16 s = + word ((val (word (bigdigit q 0):(64)word) * + val (word (bigdigit n (4 * i)):(64)word)) DIV 2 
EXP 64):(64)word /\ // hi of x4*x8 + read X26 s = word + ((val (word (bigdigit q 2):(64)word) * + val (word (bigdigit n (4 * i + 2)):(64)word)) DIV 2 EXP 64):(64)word /\ // hi of x6 * x10 + read X3 s = word + ((val (word (bigdigit q 1):(64)word) * + val (word (bigdigit n (4 * i + 1)):(64)word)) DIV 2 EXP 64):(64)word /\ // hi of x5 * x9 + read X17 s = word + ((val (word (bigdigit q 3):(64)word) * + val (word (bigdigit n (4 * i + 3)):(64)word)) DIV 2 EXP 64):(64)word /\ // hi of x7 * x11 + read X20 s = + word (0 + val (word (bigdigit q 1):(64)word) + * val (word (bigdigit n (4 * i + 1)):(64)word)):(64)word /\ // lo of x5 * x9 + read X21 s = + word (0 + val (word (bigdigit q 2):(64)word) + * val (word (bigdigit n (4 * i + 2)):(64)word)):(64)word /\ // lo of x6 * x10 + read X24 s = + word (0 + val (word (bigdigit q 3):(64)word) + * val (word (bigdigit n (4 * i + 3)):(64)word)):(64)word /\ // lo of x7 * x11 + read Q24 s = word_join + (word (0 + val (word (bigdigit q 1):(64)word) + * val (word (bigdigit n (4 * i + 1)):(64)word)):(64)word) + (word (0 + val (word (bigdigit q 0):(64)word) + * val (word (bigdigit n (4 * i)):(64)word)):(64)word) /\ + ((n * w + 1 == 0) (mod (2 EXP 64)) + ==> 2 EXP (64 * 4 * i) * + bignum_of_wordlist + [read X12 s; read X13 s; read X14 s; read X15 s] + + bignum_from_memory(z,4 * i) s = + q * lowdigits n (4 * i) + lowdigits a (4 * i) + q)`;; + +(* The body of inner_loop_invariant with one extra conjunct appended, relating ZF to i = (k4-1), re-abstracted over i:num and s:armstate. *) +let inner_loop_invariant_with_flag = mk_abs + (`i:num`, mk_abs + (`s:armstate`, mk_conj + (snd (dest_abs (snd (dest_abs inner_loop_invariant))), + `read ZF s <=> i = (k4-1)`)));; + +(* Given f = \i. x, return x[n/i] *) +let apply_i f n = rhs (concl (BETA_CONV (mk_comb (f, n))));; + +(* Given a term of the shape f P Q C (for instance ensures arm P Q C), return P. If the term does not have that shape, fail with a message that includes the printed term. *) +let get_hoare_precond (concl:term) = + try + let hoare_precond = rand(rator(rator(concl))) in + hoare_precond + with Failure _ -> + failwith ("get_hoare_precond cannot understand " ^ string_of_term concl);; + +(* Given a hoare condition that is + `\s. aligned_bytes_loaded s (word pc) .._mc /\ + read PC s = ... 
/\ + BODY`, + return `\s. BODY`. *) +let strip_mc_and_pc_conds (hoare_cond:term):term = + let s,body = dest_abs hoare_cond in + (* The first conjunct (expected to be the aligned_bytes_loaded clause) is dropped without being inspected. *) + let _aligned_load_mc, body = dest_conj body in + let old_pc_eq, body = dest_conj body in + let old_pc_eq_lhs, _old_pc_eq_rhs = dest_eq old_pc_eq in + (* The left-hand side is compared against a literal term, so the bound state variable of hoare_cond must be named s. *) + if not (old_pc_eq_lhs = `read PC s`) then + failwith ("Must be `read PC s = ...`, but got " ^ string_of_term old_pc_eq) else + mk_abs(s, body);; + +(* Given a hoare condition that is + `\s. aligned_bytes_loaded s (word pc) .._mc /\ + read PC s = ... /\ + BODY`, + return `\s. aligned_bytes_loaded s (word pc) .._mc /\ + read PC s = ... /\ + t /\ BODY`. *) +let mk_hoare_cond_conj (hoare_cond,t:term*term):term = + let s,body = dest_abs hoare_cond in + let aligned_load_mc, body = dest_conj body in + let read_pc, body = dest_conj body in + mk_abs(s, mk_conj(aligned_load_mc, mk_conj(read_pc, mk_conj(t, body))));; + +(* A solver that targets conclusions like this: + `2 EXP 256 * bignum_of_wordlist [sum_s179; sum_s180; sum_s181; sum_s182] + + val sum_s53 + + 2 EXP 64 * val sum_s103 + + 2 EXP 128 * val sum_s141 + + 2 EXP 192 * val sum_s174 = + (val (word (bigdigit q 0)) + + 2 EXP 64 * val (word (bigdigit q 1)) + + 2 EXP 128 * val (word (bigdigit q 2)) + + 2 EXP 192 * val (word (bigdigit q 3))) * + (2 EXP (64 * 3) * bigdigit n 7 + + 2 EXP (64 * 2) * bigdigit n 6 + + 2 EXP (64 * 1) * bigdigit n 5 + + bigdigit n 4) + + 2 EXP (64 * 3) * bigdigit a 7 + + 2 EXP (64 * 2) * bigdigit a 6 + + 2 EXP (64 * 1) * bigdigit a 5 + + bigdigit a 4 + + bignum_of_wordlist [g8; g9; g10; g11]` *) +let PROVE_IT = REWRITE_TAC[bignum_of_wordlist] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN CONV_TAC NUM_REDUCE_CONV THEN + ONCE_REWRITE_TAC[GSYM VAL_WORD_BIGDIGIT] THEN REWRITE_TAC[WORD_VAL] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`512`; `&0:real`] THEN + REPLICATE_TAC 2 (CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC]) THEN + CONJ_TAC THENL [REAL_INTEGER_TAC; ALL_TAC] THEN + + 
ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[lemma1; lemma2] THEN REWRITE_TAC[WORD_XOR_MASK] THEN + + REPEAT(COND_CASES_TAC THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT]) THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES; DIMINDEX_64] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + REWRITE_TAC[VAL_WORD_BIGDIGIT; ADD_CLAUSES; VAL_WORD_BITVAL] THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + (REAL_INTEGER_TAC ORELSE + (PRINT_GOAL_TAC "REAL_INTEGER_TAC could not prove this goal" THEN + FAIL_TAC "REAL_INTEGER failed"));; + +let BIGNUM_EMONTREDC_8N_NEON_CORRECT = time prove + (`!k z m w a n pc stackpointer. + aligned 16 stackpointer /\ + ALLPAIRS nonoverlapping + [(word pc,3468); (m,8 * val k)] + [(z,8 * 2 * val k); (word_sub stackpointer (word 32), 32)] /\ + nonoverlapping (z,8 * 2 * val k) (word_sub stackpointer (word 32), 32) /\ + 8 divides val k + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_emontredc_8n_neon_mc /\ + read PC s = word(pc + 0x14) /\ + read SP s = stackpointer /\ + C_ARGUMENTS [k; z; m; w] s /\ + bignum_from_memory (z,2 * val k) s = a /\ + bignum_from_memory (m,val k) s = n) + (\s. 
read PC s = word(pc + 3444) /\ + ((n * val w + 1 == 0) (mod (2 EXP 64)) + ==> n * bignum_from_memory (z,val k) s + a = + 2 EXP (64 * val k) * + (2 EXP (64 * val k) * val(C_RETURN s) + + bignum_from_memory + (word_add z (word(8 * val k)),val k) s))) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26; X27; X28] ,, + MAYCHANGE [memory :> bytes(z,8 * 2 * val k); + memory :> bytes(word_sub stackpointer (word 32),32)])`, + W64_GEN_TAC `k:num` THEN + MAP_EVERY X_GEN_TAC [`z:int64`; `m:int64`] THEN + W64_GEN_TAC `w:num` THEN + MAP_EVERY X_GEN_TAC [`a:num`; `n:num`; `pc:num`; `stackpointer:int64`] THEN + REWRITE_TAC[ALL; ALLPAIRS; NONOVERLAPPING_CLAUSES] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + BIGNUM_TERMRANGE_TAC `2 * k` `a:num` THEN + BIGNUM_TERMRANGE_TAC `k:num` `n:num` THEN + ENSURES_EXISTING_PRESERVED_TAC `SP` THEN + ABBREV_TAC `k4 = k DIV 4` THEN + + (*** Degenerate k/4 = 0 case ***) + + ASM_CASES_TAC `k4 = 0` THENL + [UNDISCH_THEN `k4 = 0` SUBST_ALL_TAC THEN + + REWRITE_TAC(!simulation_precanon_thms) THEN + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (1--5) THEN + UNDISCH_TAC `read PC s5 = + (if val (word_ushr (word k:(64)word) 2) < 1 then word (pc + 3440) else word (pc + 40))` THEN + ASM_REWRITE_TAC[VAL_WORD_USHR; NUM_REDUCE_CONV `2 EXP 2`; ARITH_RULE `0 < 1`] THEN + DISCH_TAC THEN + ARM_STEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [861] THEN + ENSURES_FINAL_STATE_TAC THEN + UNDISCH_TAC `8 divides k` THEN + ASM_REWRITE_TAC[VAL_WORD_USHR; NUM_REDUCE_CONV `2 EXP 2`; + DIVIDES_DIV_MULT; MULT_CLAUSES; ARITH_RULE `0 < 1`; + DIV_0; ARITH_RULE `k DIV 8 = k DIV 4 DIV 2`; + WORD_RULE `word_add (word_sub x y) y:(64)word = x`] THEN + ASM_CASES_TAC `k = 0` THEN ASM_REWRITE_TAC[] THEN + EXPAND_TAC "a" THEN REWRITE_TAC[ASSUME `k = 0`] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; 
VAL_WORD_0] THEN + ASM_REWRITE_TAC[ADD_CLAUSES; MULT_CLAUSES; BIGNUM_FROM_MEMORY_TRIVIAL]; + ALL_TAC] THEN + + (*** Restate things in terms of k' = 4 * (k DIV 4) for naturalness ***) + + ABBREV_TAC `k' = 4 * k4` THEN + ABBREV_TAC `a' = lowdigits a (2 * k')` THEN + ABBREV_TAC `n' = lowdigits n k'` THEN + + ENSURES_SEQUENCE_TAC `pc + 0x28` + `\s. read X12 s = word(k4 - 1) /\ + read X26 s = word k4 /\ + read X1 s = z /\ + read X2 s = m /\ + read X3 s = word w /\ + read SP s = word_sub stackpointer (word 32) /\ + aligned 16 stackpointer /\ + bignum_from_memory (z,2 * k') s = a' /\ + bignum_from_memory (m,k') s = n'` THEN + CONJ_TAC THENL + [ARM_SIM_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (1--5) THEN + ASM_REWRITE_TAC[VAL_WORD_USHR; NUM_REDUCE_CONV `2 EXP 2`] THEN + ASM_REWRITE_TAC[ARITH_RULE `n < 1 <=> n = 0`] THEN + ASM_REWRITE_TAC[WORD_SUB; ARITH_RULE `1 <= n <=> ~(n = 0)`] THEN + REWRITE_TAC[WORD_RULE `word_sub x z = word_sub y z <=> x = y`] THEN + ASM_REWRITE_TAC[word_ushr; NUM_REDUCE_CONV `2 EXP 2`] THEN + MAP_EVERY EXPAND_TAC ["a'"; "n'"; "a"; "n"] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[LOWDIGITS_BIGNUM_FROM_MEMORY] THEN + MAP_EVERY EXPAND_TAC ["k'"; "k4"] THEN + CONJ_TAC THEN AP_THM_TAC THEN AP_TERM_TAC THEN AP_TERM_TAC THEN ARITH_TAC; + ALL_TAC] THEN + + ENSURES_SEQUENCE_TAC `pc + 0xd74` + `\s. ((n' * w + 1 == 0) (mod (2 EXP 64)) + ==> n' * bignum_from_memory (z,k') s + a' = + 2 EXP (64 * k') * + (2 EXP (64 * k') * val(read X0 s) + + bignum_from_memory (word_add z (word (8 * k')),k') s)) /\ + read SP s = stackpointer` THEN + CONJ_TAC THENL + [ALL_TAC; + + ENSURES_INIT_TAC "s0" THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + UNDISCH_TAC `8 divides k` THEN + DISCH_THEN(MP_TAC o SPEC `4` o MATCH_MP (NUMBER_RULE + `y divides a ==> !x:num. 
x divides y ==> x divides a`)) THEN + ANTS_TAC THENL [CONV_TAC DIVIDES_CONV; ALL_TAC] THEN + ASM_REWRITE_TAC[ONCE_REWRITE_RULE[MULT_SYM] DIVIDES_DIV_MULT] THEN + ASM_CASES_TAC `k':num = k` THEN ASM_REWRITE_TAC[] THEN + UNDISCH_THEN `k':num = k` SUBST_ALL_TAC THEN + MAP_EVERY UNDISCH_TAC + [`lowdigits a (2 * k) = a'`; `lowdigits n k = n'`] THEN + ASM_SIMP_TAC[LOWDIGITS_SELF]] THEN + + SUBGOAL_THEN + `nonoverlapping (z,8 * 2 * k') (word pc,3468) /\ + nonoverlapping (z,8 * 2 * k') (m:int64,8 * k') /\ + nonoverlapping (word_sub stackpointer (word 32):int64, 32) + (m:int64, 8 * k') /\ + nonoverlapping (word_sub stackpointer (word 32):int64, 32) + (word pc, 3468) /\ + nonoverlapping (word_sub stackpointer (word 32):int64, 32) + (z:int64, 8 * 2 * k')` + MP_TAC THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THENL + [MAP_EVERY EXPAND_TAC ["k'"; "k4"] THEN + REPEAT CONJ_TAC THEN NONOVERLAPPING_TAC; + STRIP_TAC] THEN + + MATCH_MP_TAC ENSURES_FRAME_SUBSUMED THEN + + EXISTS_TAC + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26; X27; X28] ,, + MAYCHANGE [memory :> bytes (z,8 * 2 * k'); + memory :> bytes(word_sub stackpointer (word 32),32)] ,, + MAYCHANGE [SP]` THEN + CONJ_TAC THENL + [REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] THEN + REWRITE_TAC[GSYM SEQ_ASSOC] THEN + REPEAT(MATCH_MP_TAC SUBSUMED_SEQ THEN REWRITE_TAC[SUBSUMED_REFL]) THEN + MAP_EVERY EXPAND_TAC ["k'"; "k4"] THEN SUBSUMED_MAYCHANGE_TAC; + ALL_TAC] THEN + + (* Show that 8 <= k *) + RULE_ASSUM_TAC (REWRITE_RULE [DIVIDES_DIV_MULT]) THEN + SUBGOAL_THEN `~(k4 = 1)` ASSUME_TAC THENL [ + DISCH_TAC THEN + SUBST_ALL_TAC (ASSUME `k4 = 1`) THEN + SUBGOAL_THEN `k DIV 8 = (k DIV 4) DIV 2` SUBST_ALL_TAC THENL + [REWRITE_TAC[DIV_DIV; ARITH_RULE `4 * 2 = 8`]; ALL_TAC] THEN + SUBGOAL_THEN `k DIV 4 DIV 2 * 8 = 0` SUBST_ALL_TAC THENL + [REWRITE_TAC[ASSUME `k DIV 4 = 1`; ARITH_RULE `1 DIV 2 = 0`] THEN ARITH_TAC; + ASM_ARITH_TAC]; + ALL_TAC] THEN + SUBGOAL_THEN `8 <= k'` 
ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `a:num`) o concl)) THEN + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `n:num`) o concl)) THEN + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `k:num`) o concl)) THEN + POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o rev) THEN + MAP_EVERY SPEC_TAC + [(`a':num`,`a:num`); (`n':num`,`n:num`); (`k':num`,`k:num`)] THEN + REPEAT STRIP_TAC THEN + BIGNUM_TERMRANGE_TAC `2 * k` `a:num` THEN + BIGNUM_TERMRANGE_TAC `k:num` `n:num` THEN + + (*** Get a basic bound on k and k4 from the nonoverlapping assumptions ***) + + SUBGOAL_THEN `~(k = 0)` ASSUME_TAC THENL + [EXPAND_TAC "k" THEN REWRITE_TAC[MULT_EQ_0; ARITH_EQ] THEN + ASM_REWRITE_TAC[]; + ALL_TAC] THEN + + MP_TAC(ASSUME + `nonoverlapping_modulo (2 EXP 64) + (val(z:int64),8 * 2 * k) (val(m:int64),8 * k)`) THEN + DISCH_THEN(MP_TAC o MATCH_MP (ONCE_REWRITE_RULE[IMP_CONJ] + NONOVERLAPPING_IMP_SMALL_2)) THEN + ANTS_TAC THENL [UNDISCH_TAC `~(k = 0)` THEN ARITH_TAC; DISCH_TAC] THEN + SUBGOAL_THEN `k4 < 2 EXP 58` ASSUME_TAC THENL + [SIMPLE_ARITH_TAC; ALL_TAC] THEN + + (*** Main loop invariant for "outerloop" ***) + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; GSYM SEQ_ASSOC] THEN + ENSURES_WHILE_PUP_TAC `k4:num` `pc + 0x38` `pc + 0xd68` + `\i s. 
(read X2 s = m /\ + bignum_from_memory (m,k) s = n /\ + read X0 s = word(32 * (k4 - 1)) /\ + read X1 s = word_add z (word(8 * 4 * i)) /\ + read SP s = word_sub stackpointer (word 32) /\ + read (memory :> bytes64 (word_add (word_sub stackpointer (word 32)) (word 16))) s = word (k4 - i) /\ // X26 + read (memory :> bytes64 (word_sub stackpointer (word 32))) s = word w /\ // X3 + aligned 16 stackpointer /\ + bignum_from_memory(word_add z (word(8 * (k + 4 * i))), + 2 * k - (k + 4 * i)) s = + highdigits a (k + 4 * i) /\ + ((n * w + 1 == 0) (mod (2 EXP 64)) + ==> 2 EXP (64 * 4 * i) * + (2 EXP (64 * k) * val(word_neg(read X28 s)) + + bignum_from_memory(word_add z (word(8 * 4 * i)),k) s) = + bignum_from_memory(z,4 * i) s * n + lowdigits a (k + 4 * i))) /\ + (read ZF s <=> i = k4)` THEN + ASM_REWRITE_TAC[] THEN REPEAT CONJ_TAC THENL + [ REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN ENSURES_INIT_TAC "s0" THEN + MP_TAC(ISPECL [`z:int64`; `2 * k`; `k:num`; `s0:armstate`] + HIGHDIGITS_BIGNUM_FROM_MEMORY) THEN + MP_TAC(ISPECL [`z:int64`; `2 * k`; `k:num`; `s0:armstate`] + LOWDIGITS_BIGNUM_FROM_MEMORY) THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[ARITH_RULE `MIN (2 * k) k = k /\ 2 * k - k = k`] THEN + REPLICATE_TAC 2 (DISCH_THEN(ASSUME_TAC o SYM)) THEN + ARM_STEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (1--4) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; SUB_0; WORD_NEG_0] THEN + REWRITE_TAC[WORD_ADD_0; MULT_CLAUSES; VAL_WORD_0; ADD_CLAUSES; EXP] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_TRIVIAL] THEN + ASM_REWRITE_TAC[ADD_CLAUSES; MULT_CLAUSES; ARITH_RULE `2 * k - k = k`] THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN CONV_TAC WORD_RULE; + + ALL_TAC; (*** This is the main loop invariant: save for later ***) + + X_GEN_TAC `i:num` THEN STRIP_TAC THEN VAL_INT64_TAC `i:num` THEN + ARM_SIM_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [1]; + + GHOST_INTRO_TAC `ncout:int64` `read X28` THEN + ARM_SIM_TAC 
BIGNUM_EMONTREDC_8N_NEON_EXEC (1--3) THEN CONJ_TAC THENL + [ DISCH_TAC THEN + ASM_SIMP_TAC[LOWDIGITS_SELF; GSYM MULT_2; WORD_SUB_LZERO] THEN + REWRITE_TAC[MULT_SYM]; + CONV_TAC WORD_RULE + ] + ] THEN + + (*** Start on the main outer loop invariant, rebase at z + 32 * i = z' ***) + + X_GEN_TAC `i:num` THEN STRIP_TAC THEN VAL_INT64_TAC `i:num` THEN + REWRITE_TAC[WORD_RULE + `word_add z (word (8 * (k + 4 * i))) = + word_add (word_add z (word(8 * 4 * i))) (word(8 * k))`] THEN + REWRITE_TAC[WORD_RULE + `word_add z (word (8 * 4 * (i + 1))) = + word_add (word_add z (word(8 * 4 * i))) (word(8 * 4))`] THEN + ABBREV_TAC `z':int64 = word_add z (word (8 * 4 * i))` THEN + REWRITE_TAC[WORD_RULE + `word_add (word_add z (word (8 * 4))) (word (8 * k)) = + word_add z (word (8 * (k + 4)))`] THEN + REWRITE_TAC[ARITH_RULE `2 * k - (k + i) = k - i`] THEN + + GHOST_INTRO_TAC `cout:num` `\s. val (word_neg (read X28 s))` THEN + REWRITE_TAC[VAL_WORD_GALOIS; DIMINDEX_64] THEN + REWRITE_TAC[WORD_RULE `word_neg x = y <=> x = word_neg y`] THEN + + SUBGOAL_THEN + `!s. bignum_from_memory(z',k) s = + lowdigits (bignum_from_memory(z',k+4) s) k` + (fun th -> REWRITE_TAC[th]) + THENL + [REWRITE_TAC[LOWDIGITS_BIGNUM_FROM_MEMORY] THEN + REWRITE_TAC[ARITH_RULE `MIN (k + 4) k = k`]; + ALL_TAC] THEN + + SUBGOAL_THEN + `!s. bignum_from_memory (z,4 * (i + 1)) s = + 2 EXP (64 * 4 * i) * bignum_from_memory(z',4) s + + bignum_from_memory(z,4 * i) s` + (fun th -> REWRITE_TAC[th]) + THENL + [REWRITE_TAC[ARITH_RULE `4 * (i + 1) = 4 * i + 4`] THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_SPLIT]; + ALL_TAC] THEN + + SUBGOAL_THEN + `!s. 
bignum_from_memory (word_add z' (word (8 * k)),k - 4 * i) s = + highdigits a (k + 4 * i) <=> + highdigits (bignum_from_memory(z',k+4) s) k = + lowdigits (highdigits a (k + 4 * i)) 4 /\ + bignum_from_memory + (word_add z' (word (8 * (k + 4))),k - 4 * (i + 1)) s = + highdigits a (k + 4 * (i + 1))` + (fun th -> REWRITE_TAC[th]) + THENL + [GEN_TAC THEN + REWRITE_TAC[HIGHDIGITS_BIGNUM_FROM_MEMORY; ADD_SUB2] THEN + SUBGOAL_THEN + `k - 4 * i = 4 + (k - 4 * (i + 1))` + SUBST1_TAC THENL [SIMPLE_ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_SPLIT] THEN + MP_TAC(SPECL [`highdigits a (k + 4 * i)`; `4`] + (CONJUNCT1 HIGH_LOW_DIGITS)) THEN + DISCH_THEN(fun th -> + GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV) [GSYM th]) THEN + SIMP_TAC[LEXICOGRAPHIC_EQ; BIGNUM_FROM_MEMORY_BOUND; LOWDIGITS_BOUND] THEN + REWRITE_TAC[HIGHDIGITS_HIGHDIGITS] THEN + REWRITE_TAC[ARITH_RULE `(k + 4 * i) + 4 = k + 4 * (i + 1)`] THEN + REWRITE_TAC[WORD_RULE + `word_add (word_add z (word (8 * k))) (word (8 * 4)) = + word_add z (word (8 * (k + 4)))`] THEN + MATCH_ACCEPT_TAC CONJ_SYM; + ALL_TAC] THEN + + GHOST_INTRO_TAC `z1:num` `bignum_from_memory(z',k+4)` THEN + BIGNUM_TERMRANGE_TAC `k + 4` `z1:num` THEN + GHOST_INTRO_TAC `q1:num` `bignum_from_memory(z,4 * i)` THEN + BIGNUM_TERMRANGE_TAC `4 * i` `q1:num` THEN + GLOBALIZE_PRECONDITION_TAC THEN + + ENSURES_SEQUENCE_TAC `pc + 0xd68` + `\s. 
read X2 s = m /\ + bignum_from_memory (m,k) s = n /\ + read X0 s = word (32 * (k4 - 1)) /\ + (read ZF s <=> i + 1 = k4) /\ + read X1 s = word_add z' (word (8 * 4)) /\ + read SP s = word_sub stackpointer (word 32) /\ + read (memory :> bytes64 (word_add (word_sub stackpointer (word 32)) (word 16))) s = word (k4 - (i + 1)) /\ // X26 + read (memory :> bytes64 (word_sub stackpointer (word 32))) s = word w /\ // X3 + aligned 16 stackpointer /\ + bignum_from_memory (word_add z' (word (8 * (k + 4))),k - 4 * (i + 1)) + s = + highdigits a (k + 4 * (i + 1)) /\ + bignum_from_memory (z,4 * i) s = q1 /\ + ((n * w + 1 == 0) (mod (2 EXP 64)) + ==> 2 EXP (64 * 4) * + (2 EXP (64 * k) * + val(word_neg(read X28 s)) + + bignum_from_memory(word_add z' (word(8 * 4)),k) s) = + bignum_from_memory(z',4) s * n + 2 EXP (64 * k) * cout + z1)` THEN + CONJ_TAC THENL + [ALL_TAC; + ARM_SIM_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [] THEN + DISCH_THEN(fun th -> + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o C MATCH_MP th))) THEN + REWRITE_TAC[EXP_ADD; ARITH_RULE + `64 * 4 * (i + 1) = 64 * 4 * i + 64 * 4`] THEN + ASM_REWRITE_TAC[GSYM MULT_ASSOC] THEN + REWRITE_TAC[LEFT_ADD_DISTRIB; GSYM ADD_ASSOC; RIGHT_ADD_DISTRIB] THEN + REWRITE_TAC[GSYM MULT_ASSOC; EQ_ADD_LCANCEL] THEN + MP_TAC(SPECL [`z1:num`; `k:num`] (CONJUNCT1 HIGH_LOW_DIGITS)) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(SUBST1_TAC o SYM) THEN + ASM_REWRITE_TAC[ARITH_RULE + `ee * e * c + ee * (e * h + l):num = + (ee * (e * c + l)) + (ee * e) * h`] THEN + REWRITE_TAC[GSYM EXP_ADD; GSYM ADD_ASSOC; EQ_ADD_LCANCEL] THEN + REWRITE_TAC[lowdigits; highdigits; LEFT_ADD_DISTRIB; ADD_ASSOC] THEN + REWRITE_TAC[ARITH_RULE `64 * 4 * i + 64 * k = 64 * k + 64 * 4 * i`] THEN + SPEC_TAC(`64 * k + 64 * 4 * i`,`j:num`) THEN + REWRITE_TAC[EXP_ADD; MOD_MULT_MOD] THEN ARITH_TAC] THEN + + (*** Now discard no-longer-relevant things outside the window ***) + + MATCH_MP_TAC ENSURES_FRAME_SUBSUMED THEN EXISTS_TAC + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; 
X22; X23; X24; X25; X26; X27; X28],, + MAYCHANGE [memory :> bytes(z',8 * (k + 4))] ,, + MAYCHANGE [memory :> bytes(word_sub stackpointer (word 32),32)]` THEN + (REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; GSYM SEQ_ASSOC] + THEN CONJ_TAC) THENL + [EXPAND_TAC "z'" THEN SUBSUMED_MAYCHANGE_TAC; + ALL_TAC] THEN + + SUBGOAL_THEN + `nonoverlapping (z':int64,8 * (k + 4)) (z,8 * 4 * i) /\ + nonoverlapping (z':int64,8 * (k + 4)) (word pc,3468) /\ + nonoverlapping (z':int64,8 * (k + 4)) (m,8 * k) /\ + nonoverlapping (z':int64,8 * (k + 4)) + (word_add z' (word (8 * (k + 4))),8 * (k - 4 * (i + 1))) /\ + nonoverlapping (word_sub stackpointer (word 32),32) (z,8 * 4 * i) /\ + nonoverlapping (word_sub stackpointer (word 32),32) (word pc,3468) /\ + nonoverlapping (word_sub stackpointer (word 32),32) (m,8 * k) /\ + nonoverlapping (word_sub stackpointer (word 32),32) + (word_add z' (word (8 * (k + 4))),8 * (k - 4 * (i + 1))) /\ + nonoverlapping (word_sub stackpointer (word 32),32) (z':int64,8 * (k + 4))` + MP_TAC THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THENL + [EXPAND_TAC "z'" THEN REPEAT CONJ_TAC THEN NONOVERLAPPING_TAC; + STRIP_TAC] THEN + + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + ENSURES_FORGET_COMPONENTS_TAC + [`memory :> bytes (z,8 * 4 * i)`; + `memory :> + bytes (word_add z' (word (8 * (k + 4))),8 * (k - 4 * (i + 1)))`] THEN + + (*** Get the cout < 2 before we forget too much context ***) + + SUBGOAL_THEN `(n * w + 1 == 0) (mod (2 EXP 64)) ==> cout < 2` + ASSUME_TAC THENL + [DISCH_TAC THEN + SUBGOAL_THEN + `2 EXP (64 * 4 * i) * (2 EXP (64 * k) * cout + lowdigits z1 k) < + 2 EXP (64 * 4 * i) * 2 EXP (64 * k) * 2` + MP_TAC THENL + [ASM_SIMP_TAC[] THEN MATCH_MP_TAC (ARITH_RULE + `x < d * e /\ y < e * d ==> x + y < d * e * 2`) THEN + ASM_SIMP_TAC[LT_MULT2] THEN REWRITE_TAC[GSYM EXP_ADD] THEN + REWRITE_TAC[LOWDIGITS_BOUND; GSYM LEFT_ADD_DISTRIB]; + DISCH_THEN(MP_TAC o MATCH_MP (ARITH_RULE + `d * (e * c + l):num < x ==> d * e * c < x`)) THEN + 
REWRITE_TAC[LT_MULT_LCANCEL; EXP_EQ_0; ARITH_EQ]]; + ALL_TAC] THEN + + (*** Now forget more things; back up a few steps and forget i as well ***) + + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `a:num`) o concl)) THEN + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `z:int64`) o concl)) THEN + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `q1:num`) o concl)) THEN + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `r1:num`) o concl)) THEN + + ENSURES_SEQUENCE_TAC `pc + 0xd54` + `\s. read X2 s = word_add m (word(32 * (k4 - 1))) /\ + bignum_from_memory (m,k) s = n /\ + read X0 s = word (32 * (k4 - 1)) /\ + read X26 s = word (k4 - i) /\ + read X1 s = word_add z' (word(32 * (k4 - 1))) /\ + read SP s = word_sub stackpointer (word 32) /\ + read (memory :> bytes64 (word_sub stackpointer (word 32))) s = word w /\ + ((n * w + 1 == 0) (mod (2 EXP 64)) + ==> 2 EXP (64 * 4) * + (2 EXP (64 * k) * val(word_neg (read X28 s)) + + bignum_from_memory(word_add z' (word(8 * 4)),k) s) = + bignum_from_memory(z',4) s * n + + 2 EXP (64 * k) * cout + z1)` THEN + CONJ_TAC THENL + [ALL_TAC; + ARM_SIM_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (1--5) + THEN REPEAT CONJ_TAC THENL + [CONV_TAC WORD_RULE; + VAL_INT64_TAC `k4 - i:num` THEN ASM_REWRITE_TAC[VAL_WORD_1] THEN + UNDISCH_TAC `i:num < k4` THEN ARITH_TAC; + CONV_TAC WORD_RULE; + REWRITE_TAC[ARITH_RULE `k - (j + 1) = k - j - 1`] THEN + GEN_REWRITE_TAC RAND_CONV [WORD_SUB] THEN + ASM_REWRITE_TAC[ARITH_RULE `1 <= k - j <=> j < k`]]] THEN + + ABBREV_TAC `wouter:int64 = word (k4 - i)` THEN + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `i:num`) o concl)) THEN + POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o rev) THEN + MAP_EVERY SPEC_TAC + [(`z1:num`,`a:num`); (`z':int64`,`z:int64`)] THEN + REPEAT STRIP_TAC THEN + + SUBGOAL_THEN `4 <= k` ASSUME_TAC THENL + [SUBST1_TAC(SYM(ASSUME `4 * k4 = k`)) THEN UNDISCH_TAC `~(k4 = 0)` THEN + ARITH_TAC; + ALL_TAC] THEN + + (*** The initial Montgomery 4-block ***) + + ENSURES_SEQUENCE_TAC `pc + 0x304` + `\s. 
read X2 s = m /\ + bignum_from_memory(m,k) s = n /\ + read X0 s = word (32 * (k4 - 1)) /\ + read X1 s = z /\ + read X28 s = word_neg(word cout) /\ + read SP s = word_sub stackpointer (word 32) /\ + read (memory :> bytes64 (word_sub stackpointer (word 32))) s = word w /\ + read (memory :> bytes64 (word_add + (word_sub stackpointer (word 32)) (word 16))) s = + wouter /\ + bignum_from_memory(word_add z (word (8 * 4)),k) s = + highdigits a 4 /\ + read X4 s = word(bigdigit (bignum_from_memory(z,4) s) 0) /\ + read X5 s = word(bigdigit (bignum_from_memory(z,4) s) 1) /\ + read X6 s = word(bigdigit (bignum_from_memory(z,4) s) 2) /\ + read X7 s = word(bigdigit (bignum_from_memory(z,4) s) 3) /\ + read X8 s = word(bigdigit n 4) /\ + read X9 s = word(bigdigit n 5) /\ + read X10 s = word(bigdigit n 6) /\ + read X11 s = word(bigdigit n 7) /\ + read Q20 s = word_join + (word(bigdigit (bignum_from_memory(z,4) s) 1):(64)word) + (word(bigdigit (bignum_from_memory(z,4) s) 0):(64)word) /\ + read Q21 s = word_join + (word(bigdigit (bignum_from_memory(z,4) s) 3):(64)word) + (word(bigdigit (bignum_from_memory(z,4) s) 2):(64)word) /\ + read Q22 s = word_join + (word(bigdigit n 5):(64)word) (word(bigdigit n 4):(64)word) /\ + read Q23 s = word_join + (word(bigdigit n 7):(64)word) (word(bigdigit n 6):(64)word) /\ + read Q24 s = word_join + (word(0 + val (word (bigdigit (bignum_from_memory(z,4) s) 1):(64)word) * + val (word (bigdigit n 5):(64)word)):(64)word) + (word(0 + val (word (bigdigit (bignum_from_memory(z,4) s) 0):(64)word) * + val (word (bigdigit n 4):(64)word)):(64)word) /\ + read Q26 s = word_join + (word(0 + val (word (bigdigit (bignum_from_memory(z,4) s) 3):(64)word) * + val (word (bigdigit n 7):(64)word)):(64)word) + (word(0 + val (word (bigdigit (bignum_from_memory(z,4) s) 2):(64)word) * + val (word (bigdigit n 6):(64)word)):(64)word) /\ + read Q25 s = word_join + (word((val (word (bigdigit (bignum_from_memory(z,4) s) 1):(64)word) * + val (word (bigdigit n 5):(64)word)) DIV 2 EXP 
64):(64)word) + (word((val (word (bigdigit (bignum_from_memory(z,4) s) 0):(64)word) * + val (word (bigdigit n 4):(64)word)) DIV 2 EXP 64):(64)word) /\ + read Q27 s = word_join + (word((val (word (bigdigit (bignum_from_memory(z,4) s) 3):(64)word) * + val (word (bigdigit n 7):(64)word)) DIV 2 EXP 64):(64)word) + (word((val (word (bigdigit (bignum_from_memory(z,4) s) 2):(64)word) * + val (word (bigdigit n 6):(64)word)) DIV 2 EXP 64):(64)word) /\ + ((n * w + 1 == 0) (mod (2 EXP 64)) + ==> 2 EXP (64 * 4) * + bignum_of_wordlist + [read X12 s; read X13 s; read X14 s; read X15 s] = + bignum_from_memory(z,4) s * lowdigits n 4 + lowdigits a 4)` THEN + CONJ_TAC THENL + [ REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + ENSURES_INIT_TAC "s0" THEN + SUBGOAL_THEN + `highdigits (bignum_from_memory(z,k+4) s0) 4 = highdigits a 4` + MP_TAC THENL + [ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES]; ALL_TAC] THEN + REWRITE_TAC[HIGHDIGITS_BIGNUM_FROM_MEMORY; ADD_SUB] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[NUM_REDUCE_CONV `8 * 4`] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + DISCH_TAC THEN + SUBGOAL_THEN + `(!i. i < 4 + ==> bigdigit (bignum_from_memory(z,k+4) s0) i = bigdigit a i) /\ + (!i. i < 8 + ==> bigdigit (bignum_from_memory(m,k) s0) i = bigdigit n i)` + MP_TAC THENL [ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES]; ALL_TAC] THEN + GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV) + [BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + SUBGOAL_THEN `!i. 
i < 8 \/ i < 4 ==> i < k /\ i < k + 4` MP_TAC THENL + [UNDISCH_TAC `8 <= k` THEN ARITH_TAC; SIMP_TAC[]] THEN + DISCH_THEN(K ALL_TAC) THEN + GEN_REWRITE_TAC (LAND_CONV o TOP_DEPTH_CONV) + [VAL_WORD_GALOIS; DIMINDEX_64; BIGDIGIT_BOUND] THEN + REWRITE_TAC[] THEN + CONV_TAC(LAND_CONV(BINOP_CONV EXPAND_CASES_CONV)) THEN + CONV_TAC(LAND_CONV(ONCE_DEPTH_CONV NUM_MULT_CONV)) THEN + GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV) [WORD_ADD_0] THEN + STRIP_TAC THEN + + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add m (word 16))) s0` + `word (bigdigit n 3):(64)word` `word (bigdigit n 2):(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add m (word 32))) s0` + `word (bigdigit n 5):(64)word` `word (bigdigit n 4):(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add m (word 48))) s0` + `word (bigdigit n 7):(64)word` `word (bigdigit n 6):(64)word` THEN + + ARM_REWRITE_ASSUM_AND_XACCSTEPS_TAC2 BIGNUM_EMONTREDC_8N_NEON_EXEC + [30;31;36;38;67;68;73;75] [WORD_MUL64_LO;WORD_MUL64_HI] + (1--86) (1--86) [] THEN + + (* ldr of stp x4, x5, [x1] *) + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 z) s86` + `word (0 + val (sum_s40:(64)word) * w):(64)word` + `word (0 + val (word (bigdigit a 0):(64)word) * w):(64)word` THEN + (* ldr of ldp ... 
[x2, #32] *) + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add m (word 32):64 word)) s86` + `word (bigdigit n 5):(64)word` `word (bigdigit n 4):(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add m (word 48):64 word)) s86` + `word (bigdigit n 7):(64)word` `word (bigdigit n 6):(64)word` THEN + + ARM_REWRITE_ASSUM_AND_XACCSTEPS_TAC2 BIGNUM_EMONTREDC_8N_NEON_EXEC + [110;111;115;116] [WORD_MUL64_LO;WORD_MUL64_HI] + (87--123) (87--123) [] THEN + + (* ldr of stp x6, x7, [x1, #16] *) + BYTES128_EQ_JOIN64_TAC + `read (memory :> bytes128 (word_add z (word 16):(64)word)) s123` + `word (0 + val (sum_s118:(64)word) * w):(64)word` + `word (0 + val (sum_s77:(64)word) * w):(64)word` THEN + + ARM_REWRITE_ASSUM_AND_XACCSTEPS_TAC2 BIGNUM_EMONTREDC_8N_NEON_EXEC + [] [WORD_MUL64_LO;WORD_MUL64_HI] + (124--179) (124--179) [] THEN + + RULE_ASSUM_TAC(REWRITE_RULE[WORD_MUL64_LO;WORD_MUL64_HI]) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_LT_CONV) THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ASM_REWRITE_TAC[WORD_VAL; WORD_ADD_0] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES]) THEN + DISCH_TAC THEN CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + ASM_REWRITE_TAC[] THEN + REWRITE_TAC[RAND_CONV(TOP_DEPTH_CONV num_CONV) `lowdigits x 4`] THEN + REWRITE_TAC[ADD1; LOWDIGITS_CLAUSES] THEN + CONV_TAC(DEPTH_CONV NUM_ADD_CONV) THEN REWRITE_TAC[bignum_of_wordlist] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN CONV_TAC NUM_REDUCE_CONV THEN + FIRST_ASSUM(MP_TAC o MATCH_MP montgomery_lemma) THEN + DISCH_THEN(fun ith -> + EVERY_ASSUM(fun th -> + try let th' = MATCH_MP ith th in + EVERY_ASSUM(fun th'' -> + try MP_TAC(MATCH_MP th' th'') + with Failure _ -> ALL_TAC) + with Failure _ -> ALL_TAC)) THEN + REWRITE_TAC[IMP_IMP; GSYM CONJ_ASSOC] THEN + DISCH_THEN(fun th -> ASSUME_TAC th THEN MP_TAC th) THEN + 
DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN + (MP_TAC o MATCH_MP (MESON[REAL_ADD_LID] + `n = 0 ==> !x:real. &n + x = x`))) THEN + REPEAT(DISCH_THEN(fun th -> RULE_ASSUM_TAC(REWRITE_RULE[th]))) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[VAL_WORD_BIGDIGIT] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN ASM_REWRITE_TAC[] THEN + REAL_ARITH_TAC; + + ALL_TAC] THEN + + (*** Shared tail to handle the final carry chaining in k4 = 1 too ***) + + GHOST_INTRO_TAC `q:num` `bignum_from_memory(z,4)` THEN + BIGNUM_TERMRANGE_TAC `4` `q:num` THEN + + (*** Set up a version with the whole z buffer ***) + + ENSURES_SEQUENCE_TAC `pc + 0xd28` + `\s. read X1 s = word_add z (word (32 * (k4 - 1))) /\ + read X2 s = word_add m (word (32 * (k4 - 1))) /\ + bignum_from_memory(m,k) s = n /\ + read X0 s = word (32 * (k4 - 1)) /\ + read SP s = word_sub stackpointer (word 32) /\ + read (memory :> bytes64 (word_sub stackpointer (word 32))) s = word w /\ + read (memory :> + bytes64 (word_add (word_sub stackpointer (word 32)) (word 16))) s = + wouter /\ + read X28 s = word_neg(word cout) /\ + bignum_from_memory (word_add z (word (8 * k)),4) s = + highdigits a k /\ + bignum_from_memory (z,4) s = q /\ + ((n * w + 1 == 0) (mod (2 EXP 64)) + ==> 2 EXP (64 * k) * + bignum_of_wordlist + [read X12 s; read X13 s; read X14 s; read X15 s] + + bignum_from_memory(z,k) s = + q * n + lowdigits a k + q)` THEN + CONJ_TAC THENL + [ALL_TAC; + + GHOST_INTRO_TAC `g8:int64` `read X12` THEN + GHOST_INTRO_TAC `g9:int64` `read X13` THEN + GHOST_INTRO_TAC `g10:int64` `read X14` THEN + GHOST_INTRO_TAC `g11:int64` `read X15` THEN + + (*** Rebase once again to avoid indexing messiness a bit ***) + + ABBREV_TAC `z':int64 = word_add z (word (8 * k))` THEN + MATCH_MP_TAC ENSURES_FRAME_SUBSUMED THEN + EXISTS_TAC + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE + [X19; X20; X21; X22; X23; X24; X25; X26; X27; X28] ,, + MAYCHANGE [memory :> bytes (z',8 * 4)]` THEN + REWRITE_TAC 
[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + GSYM SEQ_ASSOC] THEN CONJ_TAC + THENL + [REPEAT(MATCH_MP_TAC SUBSUMED_SEQ THEN REWRITE_TAC[SUBSUMED_REFL]) THEN + MAP_EVERY EXPAND_TAC ["z'"] THEN SUBSUMED_MAYCHANGE_TAC; + ALL_TAC] THEN + SUBGOAL_THEN + `nonoverlapping (z':int64,8 * 4) (word pc,3468) /\ + nonoverlapping (z':int64,8 * 4) (m,8 * k) /\ + nonoverlapping (z':int64,8 * 4) (z,8 * 4) /\ + nonoverlapping (z':int64,8 * 4) (z,8 * k) /\ + nonoverlapping (z':int64,8 * 4) ((word_sub stackpointer (word 32)),8 * 4)` + MP_TAC THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THENL + [MAP_EVERY EXPAND_TAC ["z'"] THEN + REPEAT CONJ_TAC THEN NONOVERLAPPING_TAC; + STRIP_TAC] THEN + + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN ENSURES_INIT_TAC "s0" THEN + SUBGOAL_THEN + `!j. j < 4 + ==> bigdigit (bignum_from_memory(z',4) s0) j = + bigdigit a (k + j)` + MP_TAC THENL + [ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES; BIGDIGIT_HIGHDIGITS]; + SIMP_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY]] THEN + GEN_REWRITE_TAC (LAND_CONV o TOP_DEPTH_CONV) + [VAL_WORD_GALOIS; DIMINDEX_64; BIGDIGIT_BOUND; GSYM WORD_ADD_ASSOC; + GSYM WORD_ADD] THEN + REWRITE_TAC[] THEN CONV_TAC(LAND_CONV EXPAND_CASES_CONV) THEN + CONV_TAC(LAND_CONV(ONCE_DEPTH_CONV NUM_MULT_CONV)) THEN + DISCH_THEN(STRIP_ASSUME_TAC o REWRITE_RULE[ADD_CLAUSES; WORD_ADD_0]) THEN + SUBGOAL_THEN + `word_add z (word (32 * (k4 - 1) + 32)):int64 = z' /\ + word_add z (word (32 * (k4 - 1) + 40)):int64 = word_add z' (word 8) /\ + word_add z (word (32 * (k4 - 1) + 48)):int64 = word_add z' (word 16) /\ + word_add z (word (32 * (k4 - 1) + 56)):int64 = word_add z' (word 24) /\ + word_add (word_add z (word (32 * (k4 - 1)))) (word 32):int64 = + z' /\ + word_add (word_add z (word (32 * (k4 - 1)))) (word 48):int64 = + word_add z' (word 16)` + STRIP_ASSUME_TAC THENL + [REWRITE_TAC[GSYM WORD_ADD; GSYM WORD_ADD_ASSOC] THEN + SUBST1_TAC(SYM(ASSUME `word_add z (word (8 * k)):int64 = z'`)) THEN + SUBGOAL_THEN `8 * k = 32 * (k4 - 1) + 32` SUBST1_TAC THENL + [MAP_EVERY 
UNDISCH_TAC [`4 * k4 = k`; `~(k4 = 0)`] THEN ARITH_TAC; + CONV_TAC WORD_RULE]; + ALL_TAC] THEN + + ARM_ACCSTEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (5--8) (1--11) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + + DISCH_THEN(fun th -> + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o C MATCH_MP th)) THEN + ASSUME_TAC th) THEN + ABBREV_TAC `bout <=> ~(word cout:int64 = word 0)` THEN + SUBGOAL_THEN `cout = bitval bout` SUBST_ALL_TAC THENL + [EXPAND_TAC "bout" THEN UNDISCH_TAC `cout < 2` THEN + SPEC_TAC(`cout:num`,`c:num`) THEN + CONV_TAC EXPAND_CASES_CONV THEN + CONV_TAC WORD_REDUCE_CONV THEN REWRITE_TAC[BITVAL_CLAUSES]; + ALL_TAC] THEN + SUBGOAL_THEN + `bitval + (2 EXP 64 <= + val (word_neg(word (bitval bout):int64)) + + val (word_neg(word (bitval bout):int64))) = + bitval bout` + SUBST_ALL_TAC THENL + [POP_ASSUM_LIST(K ALL_TAC) THEN AP_TERM_TAC THEN + BOOL_CASES_TAC `bout:bool` THEN + REWRITE_TAC[BITVAL_CLAUSES] THEN CONV_TAC WORD_REDUCE_CONV THEN + CONV_TAC NUM_REDUCE_CONV; + REWRITE_TAC[WORD_UNMASK_64; WORD_NEG_NEG; VAL_WORD_BITVAL]] THEN + MP_TAC(SPECL [`a:num`; `k:num`] (CONJUNCT1 HIGH_LOW_DIGITS)) THEN + DISCH_THEN(SUBST1_TAC o SYM) THEN FIRST_X_ASSUM(MATCH_MP_TAC o MATCH_MP + (ARITH_RULE + `z = q * n + a + q + ==> x + q = z + b + h + ==> x = q * n + b + h + a`)) THEN + SUBST1_TAC(SYM(ASSUME `read (memory :> bytes (z,8 * 4)) s11 = q`)) THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[LEFT_ADD_DISTRIB; GSYM ADD_ASSOC] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_SPLIT] THEN + ONCE_REWRITE_TAC[MESON[ADD_SYM] + `bignum_from_memory (z,4 + k) = bignum_from_memory (z,k + 4)`] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_SPLIT] THEN + GEN_REWRITE_TAC RAND_CONV [ARITH_RULE `a + b + c:num = (a + c) + b`] THEN + REWRITE_TAC[EQ_ADD_RCANCEL; ADD_ASSOC] THEN + ONCE_REWRITE_TAC[ARITH_RULE `a * b * c:num = b * a * c`] THEN + REWRITE_TAC[GSYM LEFT_ADD_DISTRIB; VAL_WORD_BITVAL] THEN + AP_TERM_TAC THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + 
ASM_REWRITE_TAC[] THEN + ASM_REWRITE_TAC[WORD_ADD; WORD_ADD_ASSOC] THEN + REPLICATE_TAC 4 + (GEN_REWRITE_TAC (RAND_CONV o ONCE_DEPTH_CONV) [HIGHDIGITS_STEP]) THEN + REWRITE_TAC[GSYM ADD_ASSOC] THEN CONV_TAC(DEPTH_CONV NUM_ADD_CONV) THEN + ASM_SIMP_TAC[HIGHDIGITS_ZERO] THEN + REWRITE_TAC[bignum_of_wordlist] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN CONV_TAC NUM_REDUCE_CONV THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + REWRITE_TAC[VAL_WORD_BIGDIGIT] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN ASM_REWRITE_TAC[] THEN + REAL_ARITH_TAC + ] THEN + + (*** The semi-degenerate case where we skip the inner loop ***) + + ASM_CASES_TAC `k4 = 1` THENL + [UNDISCH_THEN `k4 = 1` SUBST_ALL_TAC THEN + FIRST_X_ASSUM(SUBST_ALL_TAC o MATCH_MP (ARITH_RULE + `4 * 1 = k ==> k = 4`)) THEN + ARM_SIM_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [1] THEN + ASM_SIMP_TAC[LOWDIGITS_SELF] THEN REWRITE_TAC[GSYM ADD_ASSOC] THEN + CONV_TAC NUM_REDUCE_CONV THEN CONV_TAC WORD_RULE; + ALL_TAC] THEN + + (*** + if (k4 = 2) { + // straight-line code doing 256x256 mult for i = 1 + // (X27 = 32 * (2 - 1) = 32) + } else { + ... // straight-line code for i = 1 + for (i = 2 to k4 - 1) { .. } + ... // straight-line code for i = k4 + } + ***) + + ASM_CASES_TAC `k4 = 2` THENL [ + UNDISCH_THEN `k4 = 2` SUBST_ALL_TAC THEN + SUBGOAL_THEN `k = 8` SUBST_ALL_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[ARITH_RULE `32 * (2 - 1) = 32`] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + RULE_ASSUM_TAC (REWRITE_RULE [ARITH_RULE `8 * (8 + 4) = 96`]) THEN + (* Introduce variables storing the initial values of X12~X15 *) + GHOST_INTRO_TAC `g8:int64` `read X12` THEN + GHOST_INTRO_TAC `g9:int64` `read X13` THEN + GHOST_INTRO_TAC `g10:int64` `read X14` THEN + GHOST_INTRO_TAC `g11:int64` `read X15` THEN + ENSURES_INIT_TAC "s0" THEN + (* Prove [z+64..z+96] = a / 2^(64*8) from [z+32..z+96] = a / 2^(64*4). 
*) + SUBGOAL_THEN + `bignum_from_memory (word_add z (word (8 * 8)),4) s0 = highdigits a 8` + MP_TAC THENL [ + REWRITE_TAC[WORD_RULE + `(word_add z (word (8 * 8))) = + (word_add (word_add z (word (8*4))) (word (8*4)))`] THEN + REWRITE_TAC[ARITH_RULE`(_,4)=(_,8-4)` ; GSYM BIGNUM_FROM_MEMORY_DIV] THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[highdigits; DIV_DIV; GSYM EXP_ADD] THEN + AP_TERM_TAC THEN AP_TERM_TAC THEN ARITH_TAC; + + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN DISCH_TAC] THEN + (* Prove that [z+32..z+64] and [z+64..z+96] are nonoverlapping *) + SUBGOAL_THEN + `nonoverlapping (word_add z (word 32):(64)word,32) + (word_add z (word 64):(64)word,32)` + ASSUME_TAC THENL + [REWRITE_TAC[NONOVERLAPPING_CLAUSES] THEN NONOVERLAPPING_TAC; ALL_TAC] THEN + (* Simplify 8*const to make nonoverlapping checks work *) + SUBST_ALL_TAC (ARITH_RULE `8 * 8 = 64`) THEN + SUBST_ALL_TAC (ARITH_RULE `8 * 4 = 32`) THEN + + (* Introduce byte64 version of read [z+32..z+96]. *) + SUBGOAL_THEN + `!j. 
j < 8 + ==> bigdigit (bignum_from_memory((word_add z (word 32)),8) s0) j = + bigdigit a (4 + j)` MP_TAC THENL + [ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES;ARITH_RULE`8*8=64`] THEN + REWRITE_TAC[BIGDIGIT_HIGHDIGITS] THEN + FAIL_TAC "unreachable"; + REWRITE_TAC[HIGHDIGITS_BIGNUM_FROM_MEMORY; BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + SIMP_TAC[VAL_WORD_GALOIS; DIMINDEX_64; BIGDIGIT_BOUND] THEN + CONV_TAC (ONCE_DEPTH_CONV EXPAND_CASES_CONV) THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + REWRITE_TAC [WORD_ADD_ASSOC_CONSTS] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_ADD_CONV)] THEN + STRIP_TAC THEN + + ARM_STEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (1--4) THEN + (* jump to maddloop_x0one *) + ARM_XACCSTEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [`X1`; `X2`; `SP`] + ((7--10) @ [12;14;16;18;20;21] @ (23--44) @ + [50;55;57;58;64;69] @ (71--76) @ + [82;87;89;90;91] @ + [97;102;104;105;106;107;108] @ + [114;119;121;122;123;124] @ + [130;135;137;138;139;140]) + (5--145) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + + (* Discharge (n * w + 1 == 0) (mod (2 EXP 64)) and simplify an + existing assumption using this *) + DISCH_THEN(fun th -> + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o C MATCH_MP th)) THEN + ASSUME_TAC th) THEN + (* Split `read (memory :> bytes (z,64)) s145` into high 32 and low 32 bytes. + Low 32 bytes are simply q. *) + SUBST1_TAC (ARITH_RULE `64 = 8 * 8`) THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[ARITH_RULE `(_,8)=(_,4+4)`] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_SPLIT] THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES; ARITH_RULE `8*4=32`] THEN + + (* Split high 32 bytes into 4 8-byte reads. *) + REWRITE_TAC[ARITH_RULE `(_,32)=(_,8*4)`] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + ASM_REWRITE_TAC[] THEN + + (* Simplify .. + q = .. + q *) + REWRITE_TAC[ARITH_RULE `p+q+x=r+s+x <=> p+q=r+s`] THEN + + (* Split lowdigits _ 8 into lowdigits _ 4 + ... 
*) + ONCE_REWRITE_TAC[ + MP (SPECL [`n:num`; `8:num`] (GSYM LOWDIGITS_SELF)) + (ASSUME `n < 2 EXP (64 * 8)`)] THEN + REWRITE_TAC[ARITH_RULE `lowdigits n 8 = lowdigits n ((((4+1)+1)+1)+1)`; + LOWDIGITS_CLAUSES] THEN + CONV_TAC(DEPTH_CONV NUM_ADD_CONV) THEN + ONCE_REWRITE_TAC[ARITH_RULE + `q*(a0+a1+a2+a3+a4)+b0+b1+b2+b3+b4 = + q*(a0+a1+a2+a3)+b0+b1+b2+b3+(q*a4+b4)`] THEN + REWRITE_TAC[GSYM (ASSUME + `2 EXP (64 * 4) * bignum_of_wordlist [g8; g9; g10; g11] = + q * lowdigits n 4 + lowdigits a 4`)] THEN + DISCARD_MATCHING_ASSUMPTIONS [ + `2 EXP (64 * 4) * bignum_of_wordlist [g8; g9; g10; g11] = + q * lowdigits n 4 + lowdigits a 4`] THEN + + (* Expand q *) + SUBGOAL_THEN + `q = val(word(bigdigit q 0):(64)word) + + 2 EXP 64 * val(word(bigdigit q 1):(64)word) + + 2 EXP 128 * val(word(bigdigit q 2):(64)word) + + 2 EXP 192 * val(word(bigdigit q 3):(64)word)` + (fun thm -> ONCE_REWRITE_TAC [thm]) THENL [ + EXPAND_TAC "q" THEN + REWRITE_TAC[ARITH_RULE `32=8*4`] THEN + GEN_REWRITE_TAC (RAND_CONV o ONCE_DEPTH_CONV) + [GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY; ARITH_RULE `0<4/\1<4/\2<4/\3<4`; + WORD_VAL] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN + CONV_TAC (ONCE_DEPTH_CONV NUM_REDUCE_CONV) THEN REWRITE_TAC[WORD_ADD_0] THEN + FAIL_TAC "unreachable"; + + ALL_TAC] THEN + + (* Divide by 2 EXP 256 *) + REWRITE_TAC[GSYM LEFT_ADD_DISTRIB] THEN + REWRITE_TAC[ARITH_RULE + `2 EXP(64*7)*a0+2 EXP(64*6)*a1+2 EXP(64*5)*a2+2 EXP(64*4)*a3 = + 2 EXP(64*4)*(2 EXP(64*3)*a0+2 EXP(64*2)*a1+2 EXP(64*1)*a2+a3)`] THEN + REWRITE_TAC[ARITH_RULE `2 EXP 512 * a = 2 EXP (64*4) * 2 EXP 256 * a`] THEN + REWRITE_TAC [ARITH_RULE `a * 2 EXP (64*4) * b = 2 EXP (64*4) * a * b`] THEN + REWRITE_TAC[GSYM LEFT_ADD_DISTRIB; EQ_MULT_LCANCEL; + ARITH_RULE `~(2 EXP (64*4) = 0)`] THEN + + (* Prove it! 
*) + PROVE_IT; + + ALL_TAC] THEN + + (* Jump to maddloop_neon_firstitr *) + (fun (asl,concl) -> ENSURES_SEQUENCE_TAC `pc + 0x548` + (strip_mc_and_pc_conds (mk_hoare_cond_conj + (get_hoare_precond concl, `read X27 s = word (32 * (k4 - 1))`))) (asl,concl)) + THEN CONJ_TAC THENL [ + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (1--1) THEN + SUBGOAL_THEN `val (word (32 * (k4 - 1)):(64)word) = 0 <=> F` + SUBST_ALL_TAC THENL + [REWRITE_TAC[VAL_WORD; DIMINDEX_64] THEN IMP_REWRITE_TAC[MOD_LT] THEN + DISCARD_MATCHING_ASSUMPTIONS + [`read a b = c`; `nonoverlapping_modulo x y z`; + `bignum_from_memory a b = c`] THEN + ASM_ARITH_TAC; + ALL_TAC] THEN + RULE_ASSUM_TAC (REWRITE_RULE [COND_CLAUSES]) THEN + ARM_STEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (2--4) THEN + SUBGOAL_THEN `val (word_sub (word (32 * (k4 - 1))) (word 32):(64)word) = 0 + <=> F` + SUBST_ALL_TAC THENL [ + REWRITE_TAC[VAL_WORD_SUB] THEN + REWRITE_TAC[DIMINDEX_64; VAL_WORD] THEN + REWRITE_TAC[ARITH_RULE `32 MOD 2 EXP 64 = 32`] THEN + DISCARD_MATCHING_ASSUMPTIONS + [`read a b = c`; `nonoverlapping_modulo x y z`; + `bignum_from_memory a b = c`] THEN + SUBGOAL_THEN `(32 * (k4 - 1)) MOD 2 EXP 64 = 32 * (k4 - 1)` + SUBST_ALL_TAC THENL [ + IMP_REWRITE_TAC[MOD_LT] THEN + ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN + `32 * (k4 - 1) + 2 EXP 64 - 32 = (32 * (k4 - 1) - 32) + 2 EXP 64` + SUBST_ALL_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[CONJUNCT1 (SPECL + [`32 * (k4 - 1) - 32:num`; `2 EXP 64:num`; `2 EXP 64`] + (GSYM ADD_MOD_MOD_REFL)); + MOD_REFL; ADD_CLAUSES] THEN + IMP_REWRITE_TAC[MOD_LT] THEN + ASM_ARITH_TAC; + + ALL_TAC] THEN + RULE_ASSUM_TAC (REWRITE_RULE [NOT_CLAUSES]) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[]; + + ALL_TAC] THEN + + + (* maddloop_neon_firstitr *) + ENSURES_SEQUENCE_TAC `pc + 0x82c` (apply_i inner_loop_invariant `2:num`) + THEN CONJ_TAC THENL [ + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + GHOST_INTRO_TAC 
`g8:int64` `read X12` THEN + GHOST_INTRO_TAC `g9:int64` `read X13` THEN + GHOST_INTRO_TAC `g10:int64` `read X14` THEN + GHOST_INTRO_TAC `g11:int64` `read X15` THEN + ENSURES_INIT_TAC "s0" THEN + + (* read bytes64 & bytes128 of ldr q [m + 64 ~ m + 96) *) + (* This is for (13--14) *) + SUBGOAL_THEN `!j. j < 4 + ==> bigdigit (bignum_from_memory(m,k) s0) (8 + j) = + bigdigit n (8 + j)` MP_TAC THENL[ + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + FAIL_TAC "unreachable"; + + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + CONV_TAC (ONCE_DEPTH_CONV EXPAND_CASES_CONV) THEN + SUBGOAL_THEN `8 + 0 < k <=> T` SUBST_ALL_TAC THENL + [DISCARD_MATCHING_ASSUMPTIONS [`read`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `8 + 1 < k <=> T` SUBST_ALL_TAC THENL + [DISCARD_MATCHING_ASSUMPTIONS [`read`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `8 + 2 < k <=> T` SUBST_ALL_TAC THENL + [DISCARD_MATCHING_ASSUMPTIONS [`read`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `8 + 3 < k <=> T` SUBST_ALL_TAC THENL + [DISCARD_MATCHING_ASSUMPTIONS [`read`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[COND_CLAUSES] THEN + GEN_REWRITE_TAC (LAND_CONV o TOP_DEPTH_CONV) + [VAL_WORD_GALOIS; DIMINDEX_64; BIGDIGIT_BOUND; AND_CLAUSES] THEN + CONV_TAC (LAND_CONV (ONCE_DEPTH_CONV NUM_REDUCE_CONV)) THEN + STRIP_TAC] THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add m (word 64))) s0` + `word (bigdigit n 9):(64)word` + `word (bigdigit n 8):(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add m (word 80))) s0` + `word (bigdigit n 11):(64)word` + `word (bigdigit n 10):(64)word` THEN + + (* ldp [z + 32 ~ z + 64) *) + SUBGOAL_THEN `!j. 
j < 4 + ==> bigdigit (bignum_from_memory(word_add z (word 32),k) s0) j = + bigdigit a (4 + j)` MP_TAC THENL [ + RULE_ASSUM_TAC (REWRITE_RULE [ARITH_RULE `8*4 = 32`]) THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES; BIGDIGIT_HIGHDIGITS] THEN + FAIL_TAC "unreachable"; + + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + CONV_TAC (ONCE_DEPTH_CONV EXPAND_CASES_CONV) THEN + SUBGOAL_THEN `0 < k <=> T` SUBST_ALL_TAC THENL + [DISCARD_MATCHING_ASSUMPTIONS [`read`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `1 < k <=> T` SUBST_ALL_TAC THENL + [DISCARD_MATCHING_ASSUMPTIONS [`read`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `2 < k <=> T` SUBST_ALL_TAC THENL + [DISCARD_MATCHING_ASSUMPTIONS [`read`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `3 < k <=> T` SUBST_ALL_TAC THENL + [DISCARD_MATCHING_ASSUMPTIONS [`read`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[COND_CLAUSES] THEN + GEN_REWRITE_TAC (LAND_CONV o TOP_DEPTH_CONV) + [VAL_WORD_GALOIS; DIMINDEX_64; BIGDIGIT_BOUND; AND_CLAUSES; + WORD_ADD_ASSOC_CONSTS] THEN + CONV_TAC (LAND_CONV (ONCE_DEPTH_CONV NUM_REDUCE_CONV)) THEN + STRIP_TAC] THEN + + (* from assumption + bignum_from_memory (word_add z (word (8 * 4)),k) s = highdigits a 4, + make + bignum_from_memory (word_add z (word (8 * 4 * 2)),(k + 4) - 4 * 2) s = + highdigits a (4 * 2) + *) + SUBGOAL_THEN + `bignum_from_memory (word_add z (word 64),k - 4) s0 = highdigits a 8` + MP_TAC THENL [ + REWRITE_TAC[WORD_RULE + `(word_add z (word 64)) = + (word_add (word_add z (word (8*4))) (word (8*4)))`] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_DIV] THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[highdigits; DIV_DIV; GSYM EXP_ADD] THEN + AP_TERM_TAC THEN AP_TERM_TAC THEN ARITH_TAC; + + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN DISCH_TAC] THEN + + (* For nonoverlapping reasoning *) + SUBST_ALL_TAC (ARITH_RULE `8 * 4 = 32`) THEN + (* Prove that [z+32..z+64] and [z+64..] 
are nonoverlapping *) + SUBGOAL_THEN + `nonoverlapping (word_add z (word 32):(64)word,32) + (word_add z (word 64):(64)word,8 * (k-4))` + ASSUME_TAC THENL + [REWRITE_TAC[NONOVERLAPPING_CLAUSES] THEN NONOVERLAPPING_TAC; ALL_TAC] THEN + + ARM_REWRITE_ASSUM_AND_XACCSTEPS_TAC2 BIGNUM_EMONTREDC_8N_NEON_EXEC + ((1--8) @ (112--115) @ [176;177;183;184]) + [WORD_MUL64_LO;WORD_MUL64_HI] + ([2;4;6;8] @ (9--12) @ [22;23] @ (29--32) @ (37--40) @ (45--48) @ (51--54) @ (59--62) + @ [67;68] @ + [78;89;91;92] @ + [98;103;105;106;107;108;109;110] @ + [121;126;128;129;130] @ + [136;141;143;144;145;146;147] @ + [153;158;160;161;162;163] @ + [169;174;179;180;181;182]) (1--185) [`X1`; `X2`] THEN + + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[ + ARITH_RULE `4 * 2 = 8`; + ARITH_RULE `4 * 2 + 1 = 9`; + ARITH_RULE `4 * 2 + 2 = 10`; + ARITH_RULE `4 * 2 + 3 = 11`] THEN + SUBGOAL_THEN `word_sub (word (32 * (k4 - 1))) (word 32):(64)word = word (32 * (k4 - 2))` + SUBST_ALL_TAC THENL [ + REWRITE_TAC[ARITH_RULE `32 * (k4-2) = 32*(k4-1)-32`; WORD_SUB] THEN + IMP_REWRITE_TAC[TAUT `(c <=> T) ==> (if c then t1 else t2) = t1`] THEN + DISCARD_NONMATCHING_ASSUMPTIONS + [`8 <= k`; `4 * k4 = k`] THEN ASM_ARITH_TAC; + + ALL_TAC + ] THEN + REWRITE_TAC[WORD_BITMANIP_SIMP_LEMMAS; WORD_RULE + `word_sub (word_add p (word (32*2):(64)word)) (word 32) = + word_add p (word 32)`] THEN + ASM_REWRITE_TAC[ARITH_RULE `4*2=8`; ARITH_RULE `8*8=64`; + ARITH_RULE `8 * ((k + 4) - 8) = 8 * (k-4)`] THEN + + (* Now the conclusion is (n * w + 1 == 0) (mod (2 EXP 64)) ==> ... . *) + (* Discharge (n * w + 1 == 0) (mod (2 EXP 64)) and simplify an + existing assumption using this *) + DISCH_THEN(fun th -> + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o C MATCH_MP th)) THEN + ASSUME_TAC th) THEN + (* Split `read (memory :> bytes (z,64)) s185` into high 32 and low 32 bytes. + Low 32 bytes are simply q. 
*) + SUBGOAL_THEN `read (memory :> bytes (z,64)) s185 = + 2 EXP (64 * 4) * bignum_from_memory (word_add z (word (8 * 4)),4) s185 + + q` (fun thm -> ONCE_REWRITE_TAC [thm]) THENL [ + SUBST1_TAC (ARITH_RULE `64 = 8 * 8`) THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[ARITH_RULE `(_,8)=(_,4+4)`] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_SPLIT] THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES; ARITH_RULE `8*8=64`; ARITH_RULE `8*4=32`] + THEN FAIL_TAC "unreachable"; + + ALL_TAC] THEN + + (* Split high 32 bytes into 4 8-byte reads. *) + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + ASM_REWRITE_TAC (map ARITH_RULE [`8*4=32`;`8*4+8=40`;`8*4+16=48`;`8*4+24=56`]) THEN + + (* Simplify .. + q = .. + q *) + REWRITE_TAC[ARITH_RULE `p+q+x=r+s+x <=> p+q=r+s`] THEN + + (* Split lowdigits _ 8 into lowdigits _ 4 + ... *) + REWRITE_TAC[ARITH_RULE `lowdigits n 8 = lowdigits n ((((4+1)+1)+1)+1)`; + LOWDIGITS_CLAUSES] THEN + CONV_TAC(DEPTH_CONV NUM_ADD_CONV) THEN + ONCE_REWRITE_TAC[ARITH_RULE + `q*(a0+a1+a2+a3+a4)+b0+b1+b2+b3+b4 = + q*(a0+a1+a2+a3)+b0+b1+b2+b3+(q*a4+b4)`] THEN + REWRITE_TAC[GSYM (ASSUME + `2 EXP (64 * 4) * bignum_of_wordlist [g8; g9; g10; g11] = + q * lowdigits n 4 + lowdigits a 4`)] THEN + DISCARD_MATCHING_ASSUMPTIONS [ + `2 EXP (64 * 4) * bignum_of_wordlist [g8; g9; g10; g11] = + q * lowdigits n 4 + lowdigits a 4`] THEN + + (* Expand q *) + SUBGOAL_THEN + `q = val(word(bigdigit q 0):(64)word) + + 2 EXP 64 * val(word(bigdigit q 1):(64)word) + + 2 EXP 128 * val(word(bigdigit q 2):(64)word) + + 2 EXP 192 * val(word(bigdigit q 3):(64)word)` + (fun thm -> ONCE_REWRITE_TAC [thm]) THENL [ + EXPAND_TAC "q" THEN + REWRITE_TAC[ARITH_RULE `32=8*4`] THEN + GEN_REWRITE_TAC (RAND_CONV o ONCE_DEPTH_CONV) + [GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY; ARITH_RULE `0<4/\1<4/\2<4/\3<4`; + WORD_VAL] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN + CONV_TAC (ONCE_DEPTH_CONV NUM_REDUCE_CONV) THEN REWRITE_TAC[WORD_ADD_0] THEN + 
FAIL_TAC "unreachable"; + + ALL_TAC] THEN + + (* Divide by 2 EXP 256 *) + REWRITE_TAC[GSYM LEFT_ADD_DISTRIB] THEN + REWRITE_TAC[ARITH_RULE + `2 EXP(64*7)*a0+2 EXP(64*6)*a1+2 EXP(64*5)*a2+2 EXP(64*4)*a3 = + 2 EXP(64*4)*(2 EXP(64*3)*a0+2 EXP(64*2)*a1+2 EXP(64*1)*a2+a3)`] THEN + REWRITE_TAC[ARITH_RULE `2 EXP (64*8) * a = 2 EXP (64*4) * 2 EXP 256 * a`] THEN + REWRITE_TAC [ARITH_RULE `a * 2 EXP (64*4) * b = 2 EXP (64*4) * a * b`] THEN + REWRITE_TAC[GSYM LEFT_ADD_DISTRIB; EQ_MULT_LCANCEL; + ARITH_RULE `~(2 EXP (64*4) = 0)`] THEN + + (* Prove it! *) + PROVE_IT; + + ALL_TAC] THEN + + (* Simulate maddloop_neon_last ~ end first. *) + ENSURES_SEQUENCE_TAC `pc + 0xb0c` (apply_i inner_loop_invariant `k4-1:num`) + THEN CONJ_TAC THENL [ + (* 0x82c ~ 0xb0c*) + ALL_TAC; + + (* 0xb0c ~ 0xd28 *) + (* Use z' and m' because nonoverlapping tactic sometimes doesn't solve (z+e,z+e') *) + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[WORD_RULE `word_add z (word (8 * 4 * (k-1))) = word_add z (word(32 * (k-1)))`] THEN + ABBREV_TAC `z':int64 = word_add z (word (32 * (k4-1)))` THEN + ABBREV_TAC `m':int64 = word_add m (word (32 * (k4-1)))` THEN + + SUBGOAL_THEN `4 * (k4-1) < k` ASSUME_TAC THENL + [MAP_EVERY UNDISCH_TAC [`~(k=0)`; `4 * k4 = k`] THEN ARITH_TAC; ALL_TAC] THEN + GHOST_INTRO_TAC `g8:int64` `read X12` THEN + GHOST_INTRO_TAC `g9:int64` `read X13` THEN + GHOST_INTRO_TAC `g10:int64` `read X14` THEN + GHOST_INTRO_TAC `g11:int64` `read X15` THEN + + (* Shrink the window of maychange z *) + MATCH_MP_TAC ENSURES_FRAME_SUBSUMED THEN EXISTS_TAC + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26; X27; X28],, + MAYCHANGE [memory :> bytes(z',32)] ,, + MAYCHANGE [memory :> bytes(word_sub stackpointer (word 32),32)]` THEN + (REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; GSYM SEQ_ASSOC] + THEN CONJ_TAC) THENL + [EXPAND_TAC "z'" THEN SUBSUMED_MAYCHANGE_TAC; ALL_TAC] THEN + + (* nonoverlapping between (z',32) and many *) + 
SUBGOAL_THEN + `ALL (nonoverlapping (z':int64,32)) + [(z,32); (z,8 * 4 * (k4-1)); (m,8 * k); (word pc,3468); + (m',32); (word_add z' (word 32),32); + (word_sub stackpointer (word 32),32)]` + MP_TAC THEN REWRITE_TAC[ALL; NONOVERLAPPING_CLAUSES] THENL + [MAP_EVERY EXPAND_TAC ["z'";"m'"] THEN + REWRITE_TAC [WORD_RULE `word_add (word_sub x y) y = x`] THEN + REPEAT CONJ_TAC THEN NONOVERLAPPING_TAC; + STRIP_TAC] THEN + (* Some simplifications *) + SUBGOAL_THEN `(k + 4) - 4 * (k4 - 1) = 8` (fun thm-> REWRITE_TAC[thm]) THENL [ + MAP_EVERY UNDISCH_TAC [`0 REWRITE_TAC[thm]) THENL [ + MAP_EVERY EXPAND_TAC ["k"; "z'"] THEN + REWRITE_TAC[WORD_ADD_ASSOC_CONSTS] THEN + AP_TERM_TAC THEN AP_TERM_TAC THEN + UNDISCH_TAC `~(k4=0)` THEN ARITH_TAC; + + ALL_TAC] THEN + + (* Start symbolic execution *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES;ARITH_RULE`8*8=64`] THEN ENSURES_INIT_TAC "s0" THEN + + SUBGOAL_THEN + `bignum_from_memory + (word_add (word_sub m' (word 32)) (word 32):(64)word, + k-4*(k4-1)) s0 = + highdigits n (4*(k4-1))` MP_TAC THENL [ + REWRITE_TAC[WORD_RULE `word_add (word_sub x y) y = x`] THEN + MAP_EVERY EXPAND_TAC ["n";"m'"] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; HIGHDIGITS_BIGNUM_FROM_MEMORY; + ARITH_RULE `8 * 4 * (k4-1) = 32 * (k4-1)`] THEN + FAIL_TAC "unreachable"; + + REWRITE_TAC [BIGNUM_FROM_MEMORY_BYTES] THEN STRIP_TAC] THEN + + (* ldp [x2+32 ~ x2+63] ([m' ~ m'+31]) *) + SUBGOAL_THEN + `!j. 
j < 4 ==> + bigdigit (bignum_from_memory + (word_add (word_sub m' (word 32)) (word 32):(64)word, + k-4*(k4-1)) s0) j = + bigdigit n (4*(k4-1)+j)` + MP_TAC THENL [ + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES; BIGDIGIT_HIGHDIGITS] THEN + EXPAND_TAC "m'" THEN + REWRITE_TAC[WORD_RULE `word_add (word_sub x y) y = x`] THEN + FAIL_TAC "unreachable"; + + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + CONV_TAC (ONCE_DEPTH_CONV EXPAND_CASES_CONV) THEN + (let MYTAC = MAP_EVERY UNDISCH_TAC [`4 * k4 = k`; `~(k=0)`] THEN ARITH_TAC in + SUBGOAL_THEN `0 < k - 4 * (k4-1) <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `1 < k - 4 * (k4-1) <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `2 < k - 4 * (k4-1) <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `3 < k - 4 * (k4-1) <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC]) THEN + REWRITE_TAC[COND_CLAUSES; + WORD_RULE `word_add (word_add x (word y)) (word z) = word_add x (word (y+z))`; + ARITH_RULE `x+0=x`] THEN + CONV_TAC (LAND_CONV (ONCE_DEPTH_CONV NUM_REDUCE_CONV)) THEN + REWRITE_TAC[WORD_ADD_0;VAL_WORD_GALOIS;DIMINDEX_64; BIGDIGIT_BOUND] THEN STRIP_TAC] THEN + + (* ldp [x1 ~ x1 + 31] ([z' ~ z' + 31]). + Do not use (word_add (word_sub z' 32) 32) because after line 6 we will simplify + z'-32+32 into z' (as well as m'-32+32 to m'). *) + SUBGOAL_THEN + `!j. 
j < 4 ==> + bigdigit (bignum_from_memory (z':(64)word, 8) s0) j = + bigdigit a (4*(k4-1)+j)` + MP_TAC THENL [ + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES; BIGDIGIT_HIGHDIGITS;ARITH_RULE`8*8=64`] THEN + EXPAND_TAC "z'" THEN + FAIL_TAC "unreachable"; + + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + CONV_TAC (ONCE_DEPTH_CONV EXPAND_CASES_CONV) THEN + (let MYTAC = ARITH_TAC in + SUBGOAL_THEN `0 < 8 <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `1 < 8 <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `2 < 8 <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `3 < 8 <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC]) THEN + REWRITE_TAC[COND_CLAUSES; + WORD_RULE `word_add (word_add x (word y)) (word z) = word_add x (word (y+z))`] THEN + CONV_TAC (LAND_CONV (ONCE_DEPTH_CONV NUM_REDUCE_CONV)) THEN + REWRITE_TAC[WORD_ADD_0;ARITH_RULE `x+0=x`;VAL_WORD_GALOIS;DIMINDEX_64; BIGDIGIT_BOUND] THEN STRIP_TAC] THEN + + (* From assumption + `bignum_from_memory (z',8) s0 = highdigits a (4 * (k4-1)), + get the highdigits of the uppermost 4 bytes. + *) + SUBGOAL_THEN + `bignum_from_memory (word_add z' (word 32),4) s0 = + highdigits a k` + MP_TAC THENL [ + ONCE_REWRITE_TAC[ARITH_RULE `(_,4) = (_,8 - 4)`] THEN + ONCE_REWRITE_TAC[WORD_RULE `word_add z' (word 32):64 word = word_add z' (word (8*4))`] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_DIV] THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES;ARITH_RULE`8*8=64`] THEN + REWRITE_TAC[highdigits; DIV_DIV; GSYM EXP_ADD] THEN + AP_TERM_TAC THEN AP_TERM_TAC THEN UNDISCH_TAC `~(k = 0)` THEN + EXPAND_TAC "k" THEN ARITH_TAC; + + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES;ARITH_RULE `8*4=32`] THEN DISCH_TAC] THEN + + (* go! *) + ACCUMULATE_ARITH_TAC "s0" THEN CLARIFY_TAC THEN + ARM_XACCSTEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [`SP`;`X2`;`X1`] (1--4) (1--4) THEN + RULE_ASSUM_TAC (REWRITE_RULE [ (* looks so dumb, but it works... 
*) + WORD_RULE `word_add (word_sub x (word 32)) (word 32) = x`; + WORD_RULE `word_add (word_sub x (word 32)) (word 40) = word_add x (word 8)`; + WORD_RULE `word_add (word_sub x (word 32)) (word 48) = word_add x (word 16)`; + WORD_RULE `word_add (word_sub x (word 32)) (word 56) = word_add x (word 24)` + ]) THEN + + ARM_REWRITE_ASSUM_AND_XACCSTEPS_TAC2 BIGNUM_EMONTREDC_8N_NEON_EXEC + [9] + [WORD_MUL64_LO;WORD_MUL64_HI] + [5;6;7;8;9;11;12;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33;34;35; + 41;46;48;49;55;60;62;63;64;65;66;67;73;78;80;81;82;88;93;95;96;97;98;99; + 105;110;112;113;114;115;121;126;128;129;130;131] + (5--135) [`X2`;`X1`;`X27`] THEN + + (* ENSURES_FINAL_STATE_TAC and ASM_REWRITE_TAC *) + ENSURES_FINAL_STATE_TAC THEN + RULE_ASSUM_TAC(REWRITE_RULE[ARITH_RULE `8*4=32`]) THEN (* 8*4=32 for .. = q *) + ASM_REWRITE_TAC[] THEN + + (* Discharge (n * w + 1 == 0) (mod (2 EXP 64)) and simplify an + existing assumption using this *) + DISCH_THEN(fun th -> + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o C MATCH_MP th)) THEN + ASSUME_TAC th) THEN + + SUBGOAL_THEN `n = lowdigits n k` (fun thm -> ONCE_REWRITE_TAC[thm]) THENL [ + MATCH_MP_TAC EQ_SYM THEN + REWRITE_TAC[LOWDIGITS_EQ_SELF] THEN + EXPAND_TAC "n" THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; BIGNUM_FROM_MEMORY_BOUND] THEN + FAIL_TAC "unreachable"; + + ALL_TAC] THEN + + (* split lowdigits a(n) k *) + SUBGOAL_THEN `!x. lowdigits x k = lowdigits x (((((4 * (k4-1) + 1) + 1) + 1) + 1))` + (fun thm->REWRITE_TAC [thm]) THENL [ + EXPAND_TAC "k" THEN STRIP_TAC THEN AP_TERM_TAC THEN + UNDISCH_TAC `~(k4 = 0)` THEN ARITH_TAC; ALL_TAC ] THEN + REWRITE_TAC[LOWDIGITS_CLAUSES] THEN + ONCE_REWRITE_TAC[ARITH_RULE + `q*(a0+a1+a2+a3+a4)+(b0+b1+b2+b3+b4)+q = + q*(a0+a1+a2+a3)+b0+b1+b2+b3+(q*a4+b4+q)`] THEN + (* .. and replace q * lowdigits n .. + lowdigits a .. 
+ q *) + REWRITE_TAC[GSYM (ASSUME + `2 EXP (64 * 4 * (k4 - 1)) * bignum_of_wordlist [g8; g9; g10; g11] + + read (memory :> bytes (z,8 * 4 * (k4 - 1))) s135 = + q * lowdigits n (4 * (k4 - 1)) + lowdigits a (4 * (k4 - 1)) + q`)] THEN + + (* split read (memory :> bytes (z,8 * k)) s135 into its high 32 bytes and low part, + and cancel out the low parts in lhs = rhs *) + REWRITE_TAC [GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + SUBGOAL_THEN `k = 4 * (k4 - 1) + 4` + (fun thm -> GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV) [thm]) THENL [ + EXPAND_TAC "k" THEN UNDISCH_TAC `~(k4=0)` THEN ARITH_TAC; ALL_TAC + ] THEN + GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV) [BIGNUM_FROM_MEMORY_SPLIT] THEN + REWRITE_TAC[ARITH_RULE `a0+a1+c=b0+b1+b2+b3+b4+b5+c<=>a0+a1=b0+b1+b2+b3+b4+b5`] THEN + + (* Divide by 2 EXP (64 * 4 * (k-1)) *) + REWRITE_TAC[GSYM ADD_ASSOC] THEN + REWRITE_TAC [ARITH_RULE `64*4*k=256*k`; ARITH_RULE `64*(4*k+k')=256*k+64*k'`] THEN + CONV_TAC (ONCE_DEPTH_CONV NUM_REDUCE_CONV) THEN + REWRITE_TAC[EXP_ADD] THEN + REWRITE_TAC[GSYM MULT_ASSOC] THEN + REWRITE_TAC[GSYM LEFT_ADD_DISTRIB; + ARITH_RULE `a * 2 EXP (256 * (k4-1)) * b = 2 EXP (256 * (k4-1)) * a * b`] THEN + REWRITE_TAC[EQ_MULT_LCANCEL; EXP_2_NE_0] THEN + + (* Expand bignum_from_memory (z'4) *) + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + REWRITE_TAC[GSYM WORD_ADD_ASSOC_CONSTS; ARITH_RULE `8*4*(k4-1)=32*(k4-1)`] THEN + ASM_REWRITE_TAC[] THEN + + (* Expand q *) + SUBGOAL_THEN + `q = val(word(bigdigit q 0):(64)word) + + 2 EXP 64 * val(word(bigdigit q 1):(64)word) + + 2 EXP 128 * val(word(bigdigit q 2):(64)word) + + 2 EXP 192 * val(word(bigdigit q 3):(64)word)` + (fun thm -> ONCE_REWRITE_TAC [thm]) THENL [ + EXPAND_TAC "q" THEN + REWRITE_TAC[ARITH_RULE `32=8*4`] THEN + GEN_REWRITE_TAC (RAND_CONV o ONCE_DEPTH_CONV) + [GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY; ARITH_RULE `0<4/\1<4/\2<4/\3<4`; + WORD_VAL] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN + CONV_TAC 
(ONCE_DEPTH_CONV NUM_REDUCE_CONV) THEN REWRITE_TAC[WORD_ADD_0] THEN + FAIL_TAC "unreachable"; + + ALL_TAC] THEN + + (* Cleanup and prove it *) + SUBGOAL_THEN `val (word (0 + bitval carry_s25):64 word) = bitval carry_s25` + (fun thm -> RULE_ASSUM_TAC (REWRITE_RULE[thm])) THENL + [REWRITE_TAC[ADD_CLAUSES; VAL_WORD_BITVAL]; ALL_TAC] THEN + PROVE_IT + ] THEN + + ASM_CASES_TAC `k4 = 3` THENL [ + SUBST_ALL_TAC (ASSUME `k4 = 3`) THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (1--2) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[ARITH_RULE`3-1=2`]; + + ALL_TAC] THEN + + (* maddloop_neon *) + ENSURES_WHILE_PAUP_TAC `2` `k4-1:num` `pc + 0x834` `pc + 0xb08` + inner_loop_invariant_with_flag THEN ASM_REWRITE_TAC[] THEN REPEAT CONJ_TAC THENL [ + (* 1. 2 < k-1 *) + ASM_ARITH_TAC; + + (* 2. 0x82c -> loop begin *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC (1--2) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + IMP_REWRITE_TAC[TAUT `(cond <=> F) ==> (if cond then a else b) = b`] THEN + SUBGOAL_THEN `word_sub (word (32 * (k4 - 2))) (word 32):(64)word = word (32 * (k4 - 3))` + SUBST_ALL_TAC THENL [ + REWRITE_TAC[ARITH_RULE `32 * (k4-3) = 32*(k4-2)-32`; WORD_SUB] THEN + IMP_REWRITE_TAC[TAUT `(c <=> T) ==> (if c then t1 else t2) = t1`] THEN + DISCARD_NONMATCHING_ASSUMPTIONS + [`~(k4 = 0)`;`~(k4 = 1)`;`~(k4 = 2)`;`~(k4 = 3)`] THEN ASM_ARITH_TAC; + + ALL_TAC + ] THEN + VAL_INT64_TAC `32 * (k4 - 3)` THEN + ASM_REWRITE_TAC[] THEN + DISCARD_NONMATCHING_ASSUMPTIONS + [`~(k4 = 0)`;`~(k4 = 1)`;`~(k4 = 2)`;`~(k4 = 3)`] THEN ASM_ARITH_TAC; + + ALL_TAC; (* 3. The main loop invariant preservation *) + + (* 4. cond br (0xb08) -> loop begin (0x834) *) + REPEAT STRIP_TAC THEN ARM_SIM_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [1]; + + (* 5. 
cond br (0xb08) -> 0xb0c *) + ARM_SIM_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [1]] THEN + + + (* The inner loop part. *) + REPEAT STRIP_TAC THEN + REWRITE_TAC[ARITH_RULE `(k + 4) - 4 * (i + 1) = k - 4 * i`] THEN + REWRITE_TAC[WORD_RULE + `word_sub (word_add m (word (32 * (i + 1)))) (word 32) = word_add m (word (32 * i))`] THEN + + (* Use z' and m' because nonoverlapping tactic sometimes doesn't solve (z+e,z+e') *) + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[ARITH_RULE `4 * (i + 1) = 4 * i + 4`] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_SPLIT] THEN + REWRITE_TAC[ARITH_RULE `4 * i + 4 = 4 * (i + 1)`] THEN + ASM_REWRITE_TAC[ + WORD_RULE `word_add z (word (8 * 4 * i)) = word_add z (word(32 * i))`; + WORD_RULE `word_add z (word (32 * (i + 1))) = word_add (word_add z (word(32*i))) (word 32)`] THEN + ABBREV_TAC `z':int64 = word_add z (word (32 * i))` THEN + ABBREV_TAC `m':int64 = word_add m (word (32 * i))` THEN + + SUBGOAL_THEN `4 * i < k` ASSUME_TAC THENL + [MAP_EVERY UNDISCH_TAC [`i:num < k4 - 1`; `4 * k4 = k`] THEN ARITH_TAC; + ALL_TAC] THEN + GHOST_INTRO_TAC `g8:int64` `read X12` THEN + GHOST_INTRO_TAC `g9:int64` `read X13` THEN + GHOST_INTRO_TAC `g10:int64` `read X14` THEN + GHOST_INTRO_TAC `g11:int64` `read X15` THEN + + (* Shrink the window of maychange z *) + MATCH_MP_TAC ENSURES_FRAME_SUBSUMED THEN EXISTS_TAC + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26; X27; X28],, + MAYCHANGE [memory :> bytes(z',32)] ,, + MAYCHANGE [memory :> bytes(word_sub stackpointer (word 32),32)]` THEN + (REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; GSYM SEQ_ASSOC] + THEN CONJ_TAC) THENL + [EXPAND_TAC "z'" THEN EXPAND_TAC "m'" THEN SUBSUMED_MAYCHANGE_TAC; ALL_TAC] THEN + + (* nonoverlapping between (z',32) and many *) + SUBGOAL_THEN + `ALL (nonoverlapping (z':int64,32)) + [(z,32); (z,8 * 4 * i); (m,8 * k); (word pc,3468); + (m',32); (word_add z' (word 32),8 * (k - 4 * i)); + (word_sub stackpointer (word 32),32)]` 
+ MP_TAC THEN REWRITE_TAC[ALL; NONOVERLAPPING_CLAUSES] THENL + [MAP_EVERY EXPAND_TAC ["z'";"m'"] THEN + REWRITE_TAC [WORD_RULE `word_add (word_sub x y) y = x`] THEN + REPEAT CONJ_TAC THEN NONOVERLAPPING_TAC; + STRIP_TAC] THEN + + (* Start symbolic execution *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN ENSURES_INIT_TAC "s0" THEN + + SUBGOAL_THEN + `bignum_from_memory + (word_add (word_sub m' (word 32)) (word 32):(64)word, + k-4*i) s0 = + highdigits n (4*i)` MP_TAC THENL [ + REWRITE_TAC[WORD_RULE `word_add (word_sub x y) y = x`] THEN + MAP_EVERY EXPAND_TAC ["n";"m'"] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; HIGHDIGITS_BIGNUM_FROM_MEMORY; + ARITH_RULE `8 * 4 * i = 32 * i`] THEN + FAIL_TAC "unreachable"; + + REWRITE_TAC [BIGNUM_FROM_MEMORY_BYTES] THEN STRIP_TAC] THEN + + (* ldp [x2+32 ~ x2+95] ([m' ~ m'+63]) *) + SUBGOAL_THEN + `!j. j < 8 ==> + bigdigit (bignum_from_memory + (word_add (word_sub m' (word 32)) (word 32):(64)word, + k-4*i) s0) j = + bigdigit n (4*i+j)` + MP_TAC THENL [ + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES; BIGDIGIT_HIGHDIGITS] THEN + EXPAND_TAC "m'" THEN + REWRITE_TAC[WORD_RULE `word_add (word_sub x y) y = x`] THEN + FAIL_TAC "unreachable"; + + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + CONV_TAC (ONCE_DEPTH_CONV EXPAND_CASES_CONV) THEN + (let MYTAC = MAP_EVERY UNDISCH_TAC [`4 * k4 = k`; `i < k4 - 1`] THEN ARITH_TAC in + SUBGOAL_THEN `0 < k - 4 * i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `1 < k - 4 * i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `2 < k - 4 * i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `3 < k - 4 * i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `4 < k - 4 * i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `5 < k - 4 * i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `6 < k - 4 * i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `7 < k - 4 * i <=> T` SUBST_ALL_TAC THENL [MYTAC; 
ALL_TAC]) THEN + REWRITE_TAC[COND_CLAUSES; + WORD_RULE `word_add (word_add x (word y)) (word z) = word_add x (word (y+z))`; + ARITH_RULE `x+0=x`] THEN + CONV_TAC (LAND_CONV (ONCE_DEPTH_CONV NUM_REDUCE_CONV)) THEN + REWRITE_TAC[WORD_ADD_0;VAL_WORD_GALOIS;DIMINDEX_64; BIGDIGIT_BOUND] THEN STRIP_TAC] THEN + + (* ldr [x2+64~x2+95] *) + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 + (word_add (word_sub m' (word 32)) (word 64))) s0` + `word (bigdigit n (4*i+5)):(64)word` + `word (bigdigit n (4*i+4)):(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 + (word_add (word_sub m' (word 32)) (word 80))) s0` + `word (bigdigit n (4*i+7)):(64)word` + `word (bigdigit n (4*i+6)):(64)word` THEN + + (* ldp [x1 ~ x1 + 31] ([z' ~ z' + 31]). + Do not use (word_add (word_sub z' 32) 32) because after line 6 we will simplify + z'-32+32 into z' (as well as m'-32+32 to m'). *) + SUBGOAL_THEN + `!j. j < 4 ==> + bigdigit (bignum_from_memory (z':(64)word, (k + 4) - 4 * i) s0) j = + bigdigit a (4*i+j)` + MP_TAC THENL [ + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES; BIGDIGIT_HIGHDIGITS] THEN + EXPAND_TAC "z'" THEN + FAIL_TAC "unreachable"; + + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY] THEN + CONV_TAC (ONCE_DEPTH_CONV EXPAND_CASES_CONV) THEN + (let MYTAC = MAP_EVERY UNDISCH_TAC [`4 * k4 = k`; `i < k4 - 1`] THEN ARITH_TAC in + SUBGOAL_THEN `0 < (k+4)-4*i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `1 < (k+4)-4*i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `2 < (k+4)-4*i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC] THEN + SUBGOAL_THEN `3 < (k+4)-4*i <=> T` SUBST_ALL_TAC THENL [MYTAC; ALL_TAC]) THEN + REWRITE_TAC[COND_CLAUSES; + WORD_RULE `word_add (word_add x (word y)) (word z) = word_add x (word (y+z))`] THEN + CONV_TAC (LAND_CONV (ONCE_DEPTH_CONV NUM_REDUCE_CONV)) THEN + REWRITE_TAC[WORD_ADD_0;ARITH_RULE `x+0=x`;VAL_WORD_GALOIS;DIMINDEX_64; BIGDIGIT_BOUND] THEN STRIP_TAC] THEN + + (* From assupmtion + `bignum_from_memory (z',((k + 4) - 4 * i)) 
s0 = highdigits a (4 * i), + make + `bignum_from_memory (z'+32,k - 4*i) s0 = highdigits a (4 * (i + 1)) + *) + SUBGOAL_THEN + `bignum_from_memory (word_add z' (word 32),k - 4*i) s0 = + highdigits a (4 * (i + 1))` + MP_TAC THENL [ + EXPAND_TAC "z'" THEN + ONCE_REWRITE_TAC[ + ARITH_RULE `k - 4*i = ((k+4) - 4*i) - 4`; + WORD_RULE `word_add (word_add z (word (32 * i))) (word 32) = + word_add (word_add z (word (32 * i))) (word (8*4))`] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_DIV] THEN + ASM_REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[highdigits; DIV_DIV; GSYM EXP_ADD] THEN + REWRITE_TAC[ARITH_RULE `64*4*i+64*4=64*4*(i+1)`] THEN FAIL_TAC "unreachable"; + + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN DISCH_TAC] THEN + + (* Cleanup + Forget the definition of z' and m' *) + RULE_ASSUM_TAC (REWRITE_RULE [ARITH_RULE `8*4=32`]) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word_add z (word (32 * i)) = z'`; `word_add m (word (32 * i)) = m'`] THEN + + (* go! *) + ACCUMULATE_ARITH_TAC "s0" THEN CLARIFY_TAC THEN + ARM_XACCSTEPS_TAC BIGNUM_EMONTREDC_8N_NEON_EXEC [`SP`;`X2`;`X1`] (1--4) (1--6) THEN + RULE_ASSUM_TAC (REWRITE_RULE [ (* looks so dumb, but it works... 
*) + WORD_RULE `word_add (word_sub x (word 32)) (word 32) = x`; + WORD_RULE `word_add (word_sub x (word 32)) (word 40) = word_add x (word 8)`; + WORD_RULE `word_add (word_sub x (word 32)) (word 48) = word_add x (word 16)`; + WORD_RULE `word_add (word_sub x (word 32)) (word 56) = word_add x (word 24)`; + WORD_RULE `word_add (word_sub x (word 32)) (word 64) = word_add x (word 32)`; + WORD_RULE `word_add (word_sub x (word 32)) (word 72) = word_add x (word 40)`; + WORD_RULE `word_add (word_sub x (word 32)) (word 80) = word_add x (word 48)`; + WORD_RULE `word_add (word_sub x (word 32)) (word 88) = word_add x (word 56)` + ]) THEN + + ARM_REWRITE_ASSUM_AND_XACCSTEPS_TAC2 BIGNUM_EMONTREDC_8N_NEON_EXEC + [11;12] [WORD_MUL64_LO;WORD_MUL64_HI] + [7;8;9;10;11;17;18;24;25;26;27;32;33;34;35;40;41;42;43;46;47;48;49; + 54;55;56;57;62;63;73;84;86;87;93;98;100;101;102;103;104;105] + (7--106) [`X2`;`X1`] THEN + + ARM_REWRITE_ASSUM_AND_XACCSTEPS_TAC2 BIGNUM_EMONTREDC_8N_NEON_EXEC + [107;108;109;110;171;172;178;179] [WORD_MUL64_LO;WORD_MUL64_HI] + [116;121;123;124;125;131;136;138;139;140;141;142;148;153;155;156;157;158; + 164;169;174;175;176;177] + (107--181) [`X2`;`X1`;`X16`;`X26`;`X3`;`X17`] THEN + + (* ENSURES_FINAL_STATE_TAC and ASM_REWRITE_TAC *) + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[ARITH_RULE `8*4=32`] THEN + (* pre-calculated multiplications *) + REWRITE_TAC ((map ARITH_RULE [ + `4 * (i + 1) + 1 = 4 * i + 5`; + `4 * (i + 1) + 2 = 4 * i + 6`; + `4 * (i + 1) + 3 = 4 * i + 7`; + `4 * (i + 1) = 4 * i + 4`]) @ [WORD_BITMANIP_SIMP_LEMMAS]) THEN + + (* X27 (induction var) update *) + SUBGOAL_THEN + `word_sub (word (32 * (k4 - i))) (word 32):(64)word = word (32 * (k4 - (i + 1)))` + SUBST_ALL_TAC THENL [ + REWRITE_TAC[ARITH_RULE `32 * (k4 - (i + 1)) = 32 * (k4 - i) - 32`] THEN + REWRITE_TAC[WORD_SUB] THEN + IMP_REWRITE_TAC [TAUT `(c <=> T) ==> (if c then t1 else t2) = t1`] THEN + UNDISCH_TAC `i < k4 - 1` THEN ARITH_TAC; + + SIMP_TAC[]] THEN + (* Flag update *) + SUBGOAL_THEN 
+ `val (word_sub (word (32 * (k4 - (i + 1)))) (word 32):(64)word) = + (32 * (k4 - (i + 1))) - 32` SUBST_ALL_TAC THENL [ + REWRITE_TAC[VAL_WORD_SUB;VAL_WORD;DIMINDEX_64;ARITH_RULE`32 MOD 2 EXP 64 = 32`] THEN + SUBGOAL_THEN + `(32 * (k4 - (i + 1))) MOD 2 EXP 64 = 32 * (k4 - (i + 1))` SUBST_ALL_TAC + THENL [ + IMP_REWRITE_TAC[MOD_LT] THEN UNDISCH_TAC `k4 < 2 EXP 58` THEN ARITH_TAC; + ALL_TAC ] THEN + SUBGOAL_THEN + `32 * (k4 - (i + 1)) + 2 EXP 64 - 32 = 32 * (k4 - (i + 1)) - 32 + 2 EXP 64` + SUBST_ALL_TAC THENL [ + UNDISCH_TAC `i < k4 - 1` THEN ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[CONJUNCT1 (SPECL + [`32 * (k4 - (i+1)) - 32:num`; `2 EXP 64:num`; `2 EXP 64`] + (GSYM ADD_MOD_MOD_REFL)); + MOD_REFL; ADD_CLAUSES] THEN + IMP_REWRITE_TAC[MOD_LT] THEN + UNDISCH_TAC `k4 < 2 EXP 58` THEN ARITH_TAC; + + ALL_TAC] THEN + + SUBGOAL_THEN + `32 * (k4 - (i + 1)) - 32 = 0 <=> i + 1 = k4 - 1` (fun thm -> SIMP_TAC [thm]) THENL + [ UNDISCH_TAC `i < k4 - 1` THEN ARITH_TAC; ALL_TAC ] THEN + + (* Discharge (n * w + 1 == 0) (mod (2 EXP 64)) and simplify an + existing assumption using this *) + DISCH_THEN(fun th -> + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o C MATCH_MP th)) THEN + ASSUME_TAC th) THEN + + (* Expand lowdigits n(or a) (4*i+4) *) + GEN_REWRITE_TAC (RAND_CONV o ONCE_DEPTH_CONV) + [ARITH_RULE `4 * i + 4 = 4 * i + 1 + 1 + 1 + 1`] THEN + REWRITE_TAC[ADD_ASSOC] THEN REWRITE_TAC[LOWDIGITS_CLAUSES] THEN + ONCE_REWRITE_TAC[ARITH_RULE + `(q*(a0+a1+a2+a3+a4)+b0+b1+b2+b3+b4)+q = + q*(a0+a1+a2+a3)+b0+b1+b2+b3+(q*a4+b4+q)`] THEN + REWRITE_TAC[GSYM (ASSUME + `2 EXP (64 * 4 * i) * bignum_of_wordlist [g8; g9; g10; g11] + + read (memory :> bytes (z,8 * 4 * i)) s181 = + q * lowdigits n (4 * i) + lowdigits a (4 * i) + q`)] THEN + (* Cancel out ... 
+ read (memory :> bytes (z,8 * 4 * i)) s181 *) + REWRITE_TAC[ARITH_RULE + `a+x = b1+b2+b3+b4+b5+b6+x <=> a = b1+b2+b3+b4+b5+b6`] THEN + + (* Expand read (memory :> bytes (z',8 * 4)) s181 into 4 64-bit words *) + REWRITE_TAC[ARITH_RULE `(_,32)=(_,8*4)`] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + ASM_REWRITE_TAC[] THEN + + (* Divide by 2 EXP 256 *) + REWRITE_TAC(map ARITH_RULE [ + `((4 * i + 1) + 1) + 1 = 4*i+3`; + `(4 * i + 1) + 1 = 4*i+2` + ]) THEN + SUBGOAL_THEN `!k. 2 EXP (64 * (4 * i + k)) = 2 EXP (64 * 4 * i) * 2 EXP (64 * k)` (fun thm -> REWRITE_TAC[thm]) + THENL [REWRITE_TAC[ARITH_RULE `64 * (a + b) = 64 * a + 64 * b`; EXP_ADD] THEN FAIL_TAC "unreachable"; ALL_TAC] THEN + REWRITE_TAC[GSYM MULT_ASSOC; GSYM LEFT_ADD_DISTRIB] THEN + REWRITE_TAC[ARITH_RULE `q*2 EXP (64*4*i)*a + p*b = 2 EXP (64*4*i)*q*a + p*b`; + GSYM LEFT_ADD_DISTRIB] THEN + IMP_REWRITE_TAC[EQ_MULT_LCANCEL;EXP_2_NE_0] THEN + REWRITE_TAC(map ARITH_RULE [`64*4=256`;`64*3=192`;`64*2=128`;`64*1=64`]) THEN + + (* Expand q *) + SUBGOAL_THEN + `q = val(word(bigdigit q 0):(64)word) + + 2 EXP 64 * val(word(bigdigit q 1):(64)word) + + 2 EXP 128 * val(word(bigdigit q 2):(64)word) + + 2 EXP 192 * val(word(bigdigit q 3):(64)word)` + (fun thm -> ONCE_REWRITE_TAC [thm]) THENL [ + EXPAND_TAC "q" THEN + REWRITE_TAC[ARITH_RULE `32=8*4`] THEN + GEN_REWRITE_TAC (RAND_CONV o ONCE_DEPTH_CONV) + [GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + REWRITE_TAC[BIGDIGIT_BIGNUM_FROM_MEMORY; ARITH_RULE `0<4/\1<4/\2<4/\3<4`; + WORD_VAL] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN + CONV_TAC (ONCE_DEPTH_CONV NUM_REDUCE_CONV) THEN REWRITE_TAC[WORD_ADD_0] THEN + FAIL_TAC "unreachable"; + + ALL_TAC] THEN + + SUBGOAL_THEN `val (word (0 + bitval carry_s25):64 word) = bitval carry_s25` + (fun thm -> RULE_ASSUM_TAC (REWRITE_RULE[thm])) THENL + [REWRITE_TAC[ADD_CLAUSES; VAL_WORD_BITVAL]; ALL_TAC] THEN + RULE_ASSUM_TAC (REWRITE_RULE[WORD_BITMANIP_SIMP_LEMMAS]) THEN + + DISCARD_READ_QREGS THEN + PROVE_IT);; + +let 
BIGNUM_EMONTREDC_8N_NEON_SUBROUTINE_CORRECT = time prove (* Subroutine-level correctness statement for bignum_emontredc_8n_neon_mc. Execution starts with PC = word pc, SP = stackpointer and the return address in X30, and ends with PC = returnaddress. Preconditions: stackpointer is 16-byte aligned, 8 divides val k, the code and m do not overlap z or the 112 bytes below stackpointer, and z does not overlap those 112 bytes. Besides the ABI-permitted registers and flags, only the 8 * 2 * val k bytes at z and those 112 stack bytes may change. The numeric conclusion is asserted only under the hypothesis that n * val w is congruent to -1 modulo 2 EXP 64. *) + (`!k z m w a n pc stackpointer returnaddress. + aligned 16 stackpointer /\ + ALLPAIRS nonoverlapping + [(word pc,3468); (m,8 * val k)] + [(z,8 * 2 * val k); (word_sub stackpointer (word 112), 112)] /\ + nonoverlapping (z,8 * 2 * val k) + (word_sub stackpointer (word 112),112) /\ + 8 divides val k + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_emontredc_8n_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [k; z; m; w] s /\ + bignum_from_memory (z,2 * val k) s = a /\ + bignum_from_memory (m,val k) s = n) + (\s. read PC s = returnaddress /\ + ((n * val w + 1 == 0) (mod (2 EXP 64)) + ==> n * bignum_from_memory (z,val k) s + a = + 2 EXP (64 * val k) * + (2 EXP (64 * val k) * val(C_RETURN s) + + bignum_from_memory + (word_add z (word(8 * val k)),val k) s))) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(z,8 * 2 * val k); + memory :> bytes(word_sub stackpointer (word 112),112)])`, (* Proof: derive the statement from BIGNUM_EMONTREDC_8N_NEON_CORRECT, stepping through the instructions around the core code. *) + let execth = BIGNUM_EMONTREDC_8N_NEON_EXEC in + let coreth = BIGNUM_EMONTREDC_8N_NEON_CORRECT in (* NOTE(review): coreth is bound here but is not referenced in the visible proof, which names BIGNUM_EMONTREDC_8N_NEON_CORRECT directly. *) + let regs = dest_list `[X19;X20;X21;X22;X23;X24;X25;X26;X27;X28]` in (* registers passed one by one to ENSURES_PRESERVED_TAC further down *) + let sp_tm = `SP` in + let mono2lemma = MESON[] + `(!x. (!y. P x y) ==> (!y. Q x y)) ==> (!x y. P x y) ==> (!x y. 
Q x y)` in + MP_TAC BIGNUM_EMONTREDC_8N_NEON_CORRECT THEN (* the core theorem becomes an antecedent of the goal *) + REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] THEN + REPEAT(MATCH_MP_TAC mono2lemma THEN GEN_TAC) THEN + DISCH_THEN(fun th -> WORD_FORALL_OFFSET_TAC 80 THEN MP_TAC th) THEN (* NOTE(review): the offset 80 together with the 32 subtracted below accounts for the 112-byte stack area of the statement, as the later word 112 subgoal shows; WORD_FORALL_OFFSET_TAC itself is defined outside this view. *) + MATCH_MP_TAC MONO_FORALL THEN GEN_TAC THEN + REWRITE_TAC[NONOVERLAPPING_CLAUSES; PAIRWISE; ALLPAIRS; ALL] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN + DISCH_THEN(fun th -> + REPEAT GEN_TAC THEN + TRY(DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC)) THEN + MP_TAC th) THEN + ASM_REWRITE_TAC[] THEN (* 18446744073709551584 is 2 EXP 64 - 32, so the word_add in the next subgoal is a subtraction of 32 in 64-bit arithmetic *) + SUBGOAL_THEN `word_add stackpointer (word 18446744073709551584):(64)word = + word_sub stackpointer (word 32)` SUBST_ALL_TAC THENL + [CONV_TAC WORD_BLAST;ALL_TAC] THEN + TRY(ANTS_TAC THENL + [REPEAT CONJ_TAC THEN ALIGNED_16_TAC THEN + TRY DISJ2_TAC THEN NONOVERLAPPING_TAC; + ALL_TAC]) THEN + + (* Make nonoverlapping reasoning happy *) + ABBREV_TAC `stackpointer':64 word = word_sub stackpointer (word 32)` THEN + SUBGOAL_THEN `stackpointer:64 word = word_add stackpointer' (word 32)` + SUBST_ALL_TAC THENL [EXPAND_TAC "stackpointer'" THEN CONV_TAC WORD_BLAST; ALL_TAC] THEN + SUBGOAL_THEN `word_add (word_add stackpointer' (word 32)) (word 80) = + word_add stackpointer' (word 112):64 word` + SUBST_ALL_TAC THENL [CONV_TAC WORD_BLAST; ALL_TAC] THEN + + DISCH_THEN(fun th -> + ENSURES_EXISTING_PRESERVED_TAC sp_tm THEN + MAP_EVERY (fun c -> ENSURES_PRESERVED_TAC ("init_"^fst(dest_const c)) c) regs THEN + REWRITE_TAC(!simulation_precanon_thms) THEN ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC execth (1--5) THEN (* presumably the instructions executed before the core code is entered -- TODO confirm against the machine code *) + MP_TAC th) THEN + + (* convert back to the original stackpointer definition to use + ARM_BIGSTEP_TAC *) + ABBREV_TAC `stackpointer'':64 word = word_add stackpointer' (word 32)` THEN + SUBGOAL_THEN `stackpointer':64 word = word_sub stackpointer'' (word 32)` + SUBST_ALL_TAC THENL [EXPAND_TAC "stackpointer''" THEN CONV_TAC WORD_BLAST; ALL_TAC] THEN + + (* ARM_BIGSTEP_TAC erases 'read (memory :> ...) = X27'. 
+ This replacement prevents that from happening. + Probably this is again related to the issue in nonoverlapping... *) + SUBGOAL_THEN + `read (memory :> bytes64 stackpointer'') s5 = + read (memory :> bytes64 (word_add (word_sub stackpointer'' (word 32)) + (word 32))) s5` SUBST_ALL_TAC THENL + [ AP_THM_TAC THEN AP_TERM_TAC THEN AP_TERM_TAC THEN AP_TERM_TAC THEN + CONV_TAC WORD_BLAST; ALL_TAC ] THEN + ARM_BIGSTEP_TAC execth ("s"^string_of_int(6)) THEN (* the state name built here is s6; presumably this single step is where the core theorem antecedent is used -- TODO confirm against the definition of ARM_BIGSTEP_TAC *) + + (* Again introduce the stackpointer - 32 form *) + ABBREV_TAC `stackpointer''':64 word = word_sub stackpointer'' (word 32)` THEN + SUBGOAL_THEN `stackpointer'':64 word = word_add stackpointer''' (word 32)` + SUBST_ALL_TAC THENL [EXPAND_TAC "stackpointer'''" THEN CONV_TAC WORD_BLAST; ALL_TAC] THEN + DISCARD_MATCHING_ASSUMPTIONS [`stackpointer''' = stackpointer''':64 word`] THEN + + REWRITE_TAC(!simulation_precanon_thms) THEN + ARM_STEPS_TAC execth (7--12) THEN (* presumably the remaining instructions after the core code, up to the return -- TODO confirm against the machine code *) + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[]);; diff --git a/arm/proofs/bignum_kmul_16_32_neon.ml b/arm/proofs/bignum_kmul_16_32_neon.ml new file mode 100644 index 00000000..3ffb9919 --- /dev/null +++ b/arm/proofs/bignum_kmul_16_32_neon.ml @@ -0,0 +1,1562 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC + *) + +(* ========================================================================= *) +(* 16x16 -> 32 multiplication, using Karatsuba reduction. 
*) +(* ========================================================================= *) + +(**** print_literal_from_elf "arm/fastmul/bignum_kmul_16_32_neon.o";; + ****) + +let bignum_kmul_16_32_neon_mc = define_assert_from_elf "bignum_kmul_16_32_neon_mc" "arm/fastmul/bignum_kmul_16_32_neon.o" +[ + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf5bf5; (* arm_STP X21 X22 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf63f7; (* arm_STP X23 X24 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf6bf9; (* arm_STP X25 X26 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf73fb; (* arm_STP X27 X28 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf7bfd; (* arm_STP X29 X30 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xaa0003f9; (* arm_MOV X25 X0 *) + 0xaa0103fa; (* arm_MOV X26 X1 *) + 0xaa0203fb; (* arm_MOV X27 X2 *) + 0xaa0303fc; (* arm_MOV X28 X3 *) + 0x940000e5; (* arm_BL (word 916) *) + 0xa9402f4a; (* arm_LDP X10 X11 X26 (Immediate_Offset (iword (&0))) *) + 0xa9442748; (* arm_LDP X8 X9 X26 (Immediate_Offset (iword (&64))) *) + 0xeb08014a; (* arm_SUBS X10 X10 X8 *) + 0xfa09016b; (* arm_SBCS X11 X11 X9 *) + 0xa941374c; (* arm_LDP X12 X13 X26 (Immediate_Offset (iword (&16))) *) + 0xa9452748; (* arm_LDP X8 X9 X26 (Immediate_Offset (iword (&80))) *) + 0xfa08018c; (* arm_SBCS X12 X12 X8 *) + 0xfa0901ad; (* arm_SBCS X13 X13 X9 *) + 0xa9423f4e; (* arm_LDP X14 X15 X26 (Immediate_Offset (iword (&32))) *) + 0xa9462748; (* arm_LDP X8 X9 X26 (Immediate_Offset (iword (&96))) *) + 0xfa0801ce; (* arm_SBCS X14 X14 X8 *) + 0xfa0901ef; (* arm_SBCS X15 X15 X9 *) + 0xa9434750; (* arm_LDP X16 X17 X26 (Immediate_Offset (iword (&48))) *) + 0xa9472748; (* arm_LDP X8 X9 X26 (Immediate_Offset (iword (&112))) *) + 0xfa080210; (* arm_SBCS X16 X16 X8 *) + 0xfa090231; (* arm_SBCS X17 X17 X9 *) + 0xda9f23fd; (* arm_CSETM X29 Condition_CC *) + 0xab1d03bf; (* arm_CMN X29 X29 *) + 0xca1d014a; (* arm_EOR X10 X10 X29 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) 
+ 0xca1d016b; (* arm_EOR X11 X11 X29 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa9002f8a; (* arm_STP X10 X11 X28 (Immediate_Offset (iword (&0))) *) + 0xca1d018c; (* arm_EOR X12 X12 X29 *) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1d01ad; (* arm_EOR X13 X13 X29 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa901378c; (* arm_STP X12 X13 X28 (Immediate_Offset (iword (&16))) *) + 0xca1d01ce; (* arm_EOR X14 X14 X29 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1d01ef; (* arm_EOR X15 X15 X29 *) + 0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xa9023f8e; (* arm_STP X14 X15 X28 (Immediate_Offset (iword (&32))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0xa9034790; (* arm_STP X16 X17 X28 (Immediate_Offset (iword (&48))) *) + 0x91020320; (* arm_ADD X0 X25 (rvalue (word 128)) *) + 0x91010341; (* arm_ADD X1 X26 (rvalue (word 64)) *) + 0x91010362; (* arm_ADD X2 X27 (rvalue (word 64)) *) + 0x940000bb; (* arm_BL (word 748) *) + 0xa9402f6a; (* arm_LDP X10 X11 X27 (Immediate_Offset (iword (&0))) *) + 0xa9442768; (* arm_LDP X8 X9 X27 (Immediate_Offset (iword (&64))) *) + 0xeb0a010a; (* arm_SUBS X10 X8 X10 *) + 0xfa0b012b; (* arm_SBCS X11 X9 X11 *) + 0xa941376c; (* arm_LDP X12 X13 X27 (Immediate_Offset (iword (&16))) *) + 0xa9452768; (* arm_LDP X8 X9 X27 (Immediate_Offset (iword (&80))) *) + 0xfa0c010c; (* arm_SBCS X12 X8 X12 *) + 0xfa0d012d; (* arm_SBCS X13 X9 X13 *) + 0xa9423f6e; (* arm_LDP X14 X15 X27 (Immediate_Offset (iword (&32))) *) + 0xa9462768; (* arm_LDP X8 X9 X27 (Immediate_Offset (iword (&96))) *) + 0xfa0e010e; (* arm_SBCS X14 X8 X14 *) + 0xfa0f012f; (* arm_SBCS X15 X9 X15 *) + 0xa9434770; (* arm_LDP X16 X17 X27 (Immediate_Offset (iword (&48))) *) + 0xa9472768; (* arm_LDP X8 X9 X27 (Immediate_Offset (iword (&112))) *) + 0xfa100110; (* arm_SBCS X16 X8 X16 *) + 0xfa110131; (* arm_SBCS X17 X9 X17 *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC 
*) + 0xab13027f; (* arm_CMN X19 X19 *) + 0xca13014a; (* arm_EOR X10 X10 X19 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xca13016b; (* arm_EOR X11 X11 X19 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa9042f8a; (* arm_STP X10 X11 X28 (Immediate_Offset (iword (&64))) *) + 0xca13018c; (* arm_EOR X12 X12 X19 *) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1301ad; (* arm_EOR X13 X13 X19 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa905378c; (* arm_STP X12 X13 X28 (Immediate_Offset (iword (&80))) *) + 0xca1301ce; (* arm_EOR X14 X14 X19 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1301ef; (* arm_EOR X15 X15 X19 *) + 0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xa9063f8e; (* arm_STP X14 X15 X28 (Immediate_Offset (iword (&96))) *) + 0xca130210; (* arm_EOR X16 X16 X19 *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xca130231; (* arm_EOR X17 X17 X19 *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0xa9074790; (* arm_STP X16 X17 X28 (Immediate_Offset (iword (&112))) *) + 0xca1303bd; (* arm_EOR X29 X29 X19 *) + 0xa9482f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&128))) *) + 0xa944372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&64))) *) + 0xab0c014a; (* arm_ADDS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa9082f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&128))) *) + 0xa9492f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&144))) *) + 0xa945372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&80))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa9092f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&144))) *) + 0xa94a2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&160))) *) + 0xa946372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&96))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa90a2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&160))) *) + 0xa94b2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&176))) 
*) + 0xa947372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&112))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa90b2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&176))) *) + 0xa94c2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&192))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90c2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&192))) *) + 0xa94d2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&208))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90d2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&208))) *) + 0xa94e2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&224))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90e2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&224))) *) + 0xa94f2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&240))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90f2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&240))) *) + 0x91020380; (* arm_ADD X0 X28 (rvalue (word 128)) *) + 0xaa1c03e1; (* arm_MOV X1 X28 *) + 0x91010382; (* arm_ADD X2 X28 (rvalue (word 64)) *) + 0x9400006c; (* arm_BL (word 432) *) + 0xa9400720; (* arm_LDP X0 X1 X25 (Immediate_Offset (iword (&0))) *) + 0xa9484730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&128))) *) + 0xab100000; (* arm_ADDS X0 X0 X16 *) + 0xba110021; (* arm_ADCS X1 X1 X17 *) + 0xa9410f22; (* arm_LDP X2 X3 X25 (Immediate_Offset (iword (&16))) *) + 0xa9494730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&144))) *) + 0xba100042; (* arm_ADCS X2 X2 X16 *) + 0xba110063; (* arm_ADCS X3 X3 X17 *) + 0xa9421724; (* arm_LDP X4 X5 X25 (Immediate_Offset (iword (&32))) *) + 0xa94a4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&160))) *) + 0xba100084; (* arm_ADCS X4 X4 X16 *) + 0xba1100a5; (* arm_ADCS X5 X5 X17 
*) + 0xa9431f26; (* arm_LDP X6 X7 X25 (Immediate_Offset (iword (&48))) *) + 0xa94b4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&176))) *) + 0xba1000c6; (* arm_ADCS X6 X6 X16 *) + 0xba1100e7; (* arm_ADCS X7 X7 X17 *) + 0xa9482728; (* arm_LDP X8 X9 X25 (Immediate_Offset (iword (&128))) *) + 0xa94c4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&192))) *) + 0xba100108; (* arm_ADCS X8 X8 X16 *) + 0xba110129; (* arm_ADCS X9 X9 X17 *) + 0xa9492f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&144))) *) + 0xa94d4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&208))) *) + 0xba10014a; (* arm_ADCS X10 X10 X16 *) + 0xba11016b; (* arm_ADCS X11 X11 X17 *) + 0xa94a372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&160))) *) + 0xa94e4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&224))) *) + 0xba10018c; (* arm_ADCS X12 X12 X16 *) + 0xba1101ad; (* arm_ADCS X13 X13 X17 *) + 0xa94b3f2e; (* arm_LDP X14 X15 X25 (Immediate_Offset (iword (&176))) *) + 0xa94f4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&240))) *) + 0xba1001ce; (* arm_ADCS X14 X14 X16 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0x9a9f37fa; (* arm_CSET X26 Condition_CS *) + 0xab1d03bf; (* arm_CMN X29 X29 *) + 0xa9484790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&128))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba110021; (* arm_ADCS X1 X1 X17 *) + 0xa9040720; (* arm_STP X0 X1 X25 (Immediate_Offset (iword (&64))) *) + 0xa9494790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&144))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba100042; (* arm_ADCS X2 X2 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba110063; (* arm_ADCS X3 X3 X17 *) + 0xa9050f22; (* arm_STP X2 X3 X25 (Immediate_Offset (iword (&80))) *) + 0xa94a4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&160))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba100084; (* arm_ADCS X4 X4 X16 *) + 0xca1d0231; (* 
arm_EOR X17 X17 X29 *) + 0xba1100a5; (* arm_ADCS X5 X5 X17 *) + 0xa9061724; (* arm_STP X4 X5 X25 (Immediate_Offset (iword (&96))) *) + 0xa94b4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&176))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba1000c6; (* arm_ADCS X6 X6 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1100e7; (* arm_ADCS X7 X7 X17 *) + 0xa9071f26; (* arm_STP X6 X7 X25 (Immediate_Offset (iword (&112))) *) + 0xa94c4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&192))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba100108; (* arm_ADCS X8 X8 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba110129; (* arm_ADCS X9 X9 X17 *) + 0xa9082728; (* arm_STP X8 X9 X25 (Immediate_Offset (iword (&128))) *) + 0xa94d4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&208))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba10014a; (* arm_ADCS X10 X10 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba11016b; (* arm_ADCS X11 X11 X17 *) + 0xa9092f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&144))) *) + 0xa94e4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&224))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba10018c; (* arm_ADCS X12 X12 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1101ad; (* arm_ADCS X13 X13 X17 *) + 0xa90a372c; (* arm_STP X12 X13 X25 (Immediate_Offset (iword (&160))) *) + 0xa94f4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&240))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba1001ce; (* arm_ADCS X14 X14 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0xa90b3f2e; (* arm_STP X14 X15 X25 (Immediate_Offset (iword (&176))) *) + 0xba1a03bb; (* arm_ADCS X27 X29 X26 *) + 0x9a1f03bc; (* arm_ADC X28 X29 XZR *) + 0xa94c2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&192))) *) + 0xab1b014a; (* arm_ADDS X10 X10 X27 *) + 0xba1c016b; (* arm_ADCS X11 X11 X28 *) + 0xa90c2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&192))) *) + 0xa94d2f2a; (* 
arm_LDP X10 X11 X25 (Immediate_Offset (iword (&208))) *) + 0xba1c014a; (* arm_ADCS X10 X10 X28 *) + 0xba1c016b; (* arm_ADCS X11 X11 X28 *) + 0xa90d2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&208))) *) + 0xa94e2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&224))) *) + 0xba1c014a; (* arm_ADCS X10 X10 X28 *) + 0xba1c016b; (* arm_ADCS X11 X11 X28 *) + 0xa90e2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&224))) *) + 0xa94f2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&240))) *) + 0xba1c014a; (* arm_ADCS X10 X10 X28 *) + 0xba1c016b; (* arm_ADCS X11 X11 X28 *) + 0xa90f2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&240))) *) + 0xa8c17bfd; (* arm_LDP X29 X30 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c173fb; (* arm_LDP X27 X28 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c16bf9; (* arm_LDP X25 X26 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c163f7; (* arm_LDP X23 X24 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c15bf5; (* arm_LDP X21 X22 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0; (* arm_RET X30 *) + 0xa9401023; (* arm_LDP X3 X4 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc00020; (* arm_LDR Q0 X1 (Immediate_Offset (word 0)) *) + 0xa9402047; (* arm_LDP X7 X8 X2 (Immediate_Offset (iword (&0))) *) + 0x3dc00041; (* arm_LDR Q1 X2 (Immediate_Offset (word 0)) *) + 0xa9411825; (* arm_LDP X5 X6 X1 (Immediate_Offset (iword (&16))) *) + 0x3dc00422; (* arm_LDR Q2 X1 (Immediate_Offset (word 16)) *) + 0xa9412849; (* arm_LDP X9 X10 X2 (Immediate_Offset (iword (&16))) *) + 0x3dc00443; (* arm_LDR Q3 X2 (Immediate_Offset (word 16)) *) + 0x4e801824; (* arm_UZIP1 Q4 Q1 Q0 32 *) + 0x4ea00821; (* arm_REV64_VEC Q1 Q1 32 *) + 0x4e801805; (* arm_UZIP1 Q5 Q0 Q0 32 *) + 0x4ea09c20; (* arm_MUL_VEC Q0 Q1 Q0 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x2ea480a0; (* arm_UMLAL Q0 Q5 Q4 32 *) + 
0x4e083c0b; (* arm_UMOV X11 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0x4e821860; (* arm_UZIP1 Q0 Q3 Q2 32 *) + 0x4ea00861; (* arm_REV64_VEC Q1 Q3 32 *) + 0x4e821843; (* arm_UZIP1 Q3 Q2 Q2 32 *) + 0x4ea29c21; (* arm_MUL_VEC Q1 Q1 Q2 32 *) + 0x6ea02821; (* arm_UADDLP Q1 Q1 32 *) + 0x4f605421; (* arm_SHL_VEC Q1 Q1 32 64 *) + 0x2ea08061; (* arm_UMLAL Q1 Q3 Q0 32 *) + 0x4e083c30; (* arm_UMOV X16 Q1 0 8 *) + 0x4e183c31; (* arm_UMOV X17 Q1 1 8 *) + 0x3dc00820; (* arm_LDR Q0 X1 (Immediate_Offset (word 32)) *) + 0x3dc00841; (* arm_LDR Q1 X2 (Immediate_Offset (word 32)) *) + 0x3dc00c22; (* arm_LDR Q2 X1 (Immediate_Offset (word 48)) *) + 0x3dc00c43; (* arm_LDR Q3 X2 (Immediate_Offset (word 48)) *) + 0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x4e801824; (* arm_UZIP1 Q4 Q1 Q0 32 *) + 0x4ea00821; (* arm_REV64_VEC Q1 Q1 32 *) + 0x4e801805; (* arm_UZIP1 Q5 Q0 Q0 32 *) + 0x4ea09c20; (* arm_MUL_VEC Q0 Q1 Q0 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x2ea480a0; (* arm_UMLAL Q0 Q5 Q4 32 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC 
*) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; 
(* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9421023; (* arm_LDP X3 X4 X1 (Immediate_Offset (iword (&32))) *) + 
0xa900300b; (* arm_STP X11 X12 X0 (Immediate_Offset (iword (&0))) *) + 0xa9422047; (* arm_LDP X7 X8 X2 (Immediate_Offset (iword (&32))) *) + 0xa901380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&16))) *) + 0xa9431825; (* arm_LDP X5 X6 X1 (Immediate_Offset (iword (&48))) *) + 0xa902400f; (* arm_STP X15 X16 X0 (Immediate_Offset (iword (&32))) *) + 0xa9432849; (* arm_LDP X9 X10 X2 (Immediate_Offset (iword (&48))) *) + 0xa9034c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&48))) *) + 0x4e083c0b; (* arm_UMOV X11 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0x4e821860; (* arm_UZIP1 Q0 Q3 Q2 32 *) + 0x4ea00861; (* arm_REV64_VEC Q1 Q3 32 *) + 0x4e821843; (* arm_UZIP1 Q3 Q2 Q2 32 *) + 0x4ea29c21; (* arm_MUL_VEC Q1 Q1 Q2 32 *) + 0x6ea02821; (* arm_UADDLP Q1 Q1 32 *) + 0x4f605421; (* arm_SHL_VEC Q1 Q1 32 64 *) + 0x2ea08061; (* arm_UMLAL Q1 Q3 Q0 32 *) + 0x4e083c30; (* arm_UMOV X16 Q1 0 8 *) + 0x4e183c31; (* arm_UMOV X17 Q1 1 8 *) + 0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xa9425416; (* arm_LDP X22 X21 X0 (Immediate_Offset (iword (&32))) *) + 0xab16016b; (* arm_ADDS X11 X11 X22 *) + 0xba15018c; (* arm_ADCS X12 X12 X21 *) + 0xa9435416; (* arm_LDP X22 X21 X0 (Immediate_Offset (iword (&48))) *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 
0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 
0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV 
X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9405436; (* arm_LDP X22 X21 X1 (Immediate_Offset (iword (&0))) *) + 0xeb160063; (* arm_SUBS X3 X3 X22 *) + 0xfa150084; (* arm_SBCS X4 X4 X21 *) + 0xa9415436; (* arm_LDP X22 X21 X1 (Immediate_Offset (iword (&16))) *) + 0xfa1600a5; (* arm_SBCS X5 X5 X22 *) + 0xfa1500c6; (* arm_SBCS X6 X6 X21 *) + 0xda9f23f8; (* arm_CSETM X24 Condition_CC *) + 0xa904300b; (* arm_STP X11 X12 X0 (Immediate_Offset (iword (&64))) *) + 0xa9405456; (* arm_LDP X22 X21 X2 (Immediate_Offset (iword (&0))) *) + 0xeb0702c7; (* arm_SUBS X7 X22 X7 *) + 0xfa0802a8; (* arm_SBCS X8 X21 X8 *) + 0xa9415456; (* arm_LDP X22 X21 X2 (Immediate_Offset (iword (&16))) *) + 0xfa0902c9; (* arm_SBCS X9 X22 X9 *) + 0xfa0a02aa; (* arm_SBCS X10 X21 X10 *) + 0xda9f23e1; (* arm_CSETM X1 Condition_CC *) + 0xa905380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&80))) *) + 0xca180063; (* arm_EOR X3 X3 X24 *) + 0xeb180063; (* arm_SUBS X3 X3 X24 *) + 0xca180084; (* arm_EOR X4 X4 X24 *) + 0xfa180084; (* arm_SBCS X4 X4 X24 *) + 0xca1800a5; (* arm_EOR X5 X5 X24 *) + 0xfa1800a5; (* arm_SBCS X5 X5 X24 *) + 0xca1800c6; (* arm_EOR X6 X6 X24 *) + 0xda1800c6; (* arm_SBC X6 X6 X24 *) + 0xa906400f; (* arm_STP X15 X16 X0 (Immediate_Offset (iword (&96))) *) + 0xca0100e7; (* arm_EOR X7 X7 X1 *) + 0xeb0100e7; (* arm_SUBS X7 X7 X1 *) + 0xca010108; (* arm_EOR X8 X8 X1 *) + 0xfa010108; (* arm_SBCS X8 X8 X1 *) + 0xca010129; (* arm_EOR X9 X9 X1 *) + 0xfa010129; (* arm_SBCS X9 X9 X1 *) + 0xca01014a; (* arm_EOR X10 X10 X1 *) + 0xda01014a; (* arm_SBC X10 X10 X1 *) + 0xa9074c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&112))) *) + 0xca180021; (* arm_EOR X1 X1 X24 *) + 
0x9b077c6b; (* arm_MUL X11 X3 X7 *) + 0x9b087c8f; (* arm_MUL X15 X4 X8 *) + 0x9b097cb0; (* arm_MUL X16 X5 X9 *) + 0x9b0a7cd1; (* arm_MUL X17 X6 X10 *) + 0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* 
arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) 
+ 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9401003; (* arm_LDP X3 X4 X0 (Immediate_Offset (iword (&0))) *) + 0xa9442007; (* arm_LDP X7 X8 X0 (Immediate_Offset (iword (&64))) *) + 0xab070063; (* arm_ADDS X3 X3 X7 *) + 0xba080084; (* arm_ADCS X4 X4 X8 *) + 0xa9411805; (* arm_LDP X5 X6 X0 (Immediate_Offset (iword (&16))) *) + 0xa9452809; (* arm_LDP X9 X10 X0 (Immediate_Offset (iword (&80))) *) + 0xba0900a5; (* arm_ADCS X5 X5 X9 *) + 0xba0a00c6; (* arm_ADCS X6 X6 X10 *) + 0xa9465414; (* arm_LDP X20 X21 X0 (Immediate_Offset (iword (&96))) *) + 0xba1400e7; (* arm_ADCS X7 X7 X20 *) + 0xba150108; (* arm_ADCS X8 X8 X21 *) + 0xa9475c16; (* arm_LDP X22 X23 X0 (Immediate_Offset (iword (&112))) *) + 0xba160129; (* arm_ADCS X9 X9 X22 *) + 0xba17014a; (* arm_ADCS X10 X10 X23 *) + 0xba1f0038; (* arm_ADCS X24 X1 XZR *) + 0x9a1f0022; (* arm_ADC X2 X1 XZR *) + 0xb100043f; (* arm_CMN X1 (rvalue (word 1)) *) + 0xca01016b; (* arm_EOR X11 X11 X1 *) + 0xba030163; (* 
arm_ADCS X3 X11 X3 *) + 0xca01018c; (* arm_EOR X12 X12 X1 *) + 0xba040184; (* arm_ADCS X4 X12 X4 *) + 0xca0101ad; (* arm_EOR X13 X13 X1 *) + 0xba0501a5; (* arm_ADCS X5 X13 X5 *) + 0xca0101ce; (* arm_EOR X14 X14 X1 *) + 0xba0601c6; (* arm_ADCS X6 X14 X6 *) + 0xca0101ef; (* arm_EOR X15 X15 X1 *) + 0xba0701e7; (* arm_ADCS X7 X15 X7 *) + 0xca010210; (* arm_EOR X16 X16 X1 *) + 0xba080208; (* arm_ADCS X8 X16 X8 *) + 0xca010231; (* arm_EOR X17 X17 X1 *) + 0xba090229; (* arm_ADCS X9 X17 X9 *) + 0xca010273; (* arm_EOR X19 X19 X1 *) + 0xba0a026a; (* arm_ADCS X10 X19 X10 *) + 0xba180294; (* arm_ADCS X20 X20 X24 *) + 0xba0202b5; (* arm_ADCS X21 X21 X2 *) + 0xba0202d6; (* arm_ADCS X22 X22 X2 *) + 0x9a0202f7; (* arm_ADC X23 X23 X2 *) + 0xa9021003; (* arm_STP X3 X4 X0 (Immediate_Offset (iword (&32))) *) + 0xa9031805; (* arm_STP X5 X6 X0 (Immediate_Offset (iword (&48))) *) + 0xa9042007; (* arm_STP X7 X8 X0 (Immediate_Offset (iword (&64))) *) + 0xa9052809; (* arm_STP X9 X10 X0 (Immediate_Offset (iword (&80))) *) + 0xa9065414; (* arm_STP X20 X21 X0 (Immediate_Offset (iword (&96))) *) + 0xa9075c16; (* arm_STP X22 X23 X0 (Immediate_Offset (iword (&112))) *) + 0xd65f03c0 (* arm_RET X30 *) +];; + +let BIGNUM_KMUL_16_32_NEON_EXEC = ARM_MK_EXEC_RULE bignum_kmul_16_32_neon_mc;; + +(* ------------------------------------------------------------------------- *) +(* First of all the correctness lemma for the embedded bignum_mul_8_16 *) +(* ------------------------------------------------------------------------- *) + +let lemma1 = prove + (`!(x0:num) x1 (y0:num) y1. + (if y0 <= y1 + then if x1 <= x0 then word 0 else word 18446744073709551615 + else word_not + (if x1 <= x0 then word 0 else word 18446744073709551615)):int64 = + word_neg(word(bitval(y0 <= y1 <=> x0 < x1)))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + CONV_TAC WORD_REDUCE_CONV);; + +let lemma2 = prove + (`!(x0:int64) (x1:int64) (y0:int64) (y1:int64). 
+ &(val(if val x1 <= val x0 then word_sub x0 x1 + else word_neg (word_sub x0 x1))) * + &(val(if val y0 <= val y1 then word_sub y1 y0 + else word_neg (word_sub y1 y0))):real = + --(&1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) * + (&(val x0) - &(val x1)) * (&(val y1) - &(val y0))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE; WORD_NEG_SUB] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o MATCH_MP (ARITH_RULE + `~(m:num <= n) ==> n <= m /\ ~(m <= n)`))) THEN + ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN + REAL_ARITH_TAC);; + + +(* A lemma that is useful for extracting a 32-bit field from a 128-bit word. *) +let WORD_128_SUBWORD_SUBWORD_32 = prove(`!y. + word_subword (word_subword (y:(128)word) (0,64):(64)word) (0,32):(32)word = + word_subword (y:(128)word) (0,32):(32)word /\ + word_subword (word_subword (y:(128)word) (64,64):(64)word) (0,32):(32)word = + word_subword (y:(128)word) (64,32):(32)word /\ + word_subword (word_subword (y:(128)word) (0,64):(64)word) (32,32):(32)word = + word_subword (y:(128)word) (32,32):(32)word /\ + word_subword (word_subword (y:(128)word) (64,64):(64)word) (32,32):(32)word = + word_subword (y:(128)word) (96,32):(32)word`, + CONV_TAC WORD_BLAST);; + +(* A lemma that is useful for extracting a 32-bit field from a join of two 32-bit words. *) +let WORD_SUBWORD_JOIN_64 = prove(`!(x:(32)word) (y:(32)word). + word_subword (word_join (x:(32)word) (y:(32)word): (64)word) (0,32) = y /\ + word_subword (word_join (x:(32)word) (y:(32)word): (64)word) (32,32) = x`, + CONV_TAC WORD_BLAST);; + +(* A lemma that is useful for extracting a 64-bit field from a join of two 64-bit words. *) +let WORD_SUBWORD_JOIN_128_64 = prove(`!(x:(64)word) (y:(64)word). 
+ word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (0,64) = y /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (64,64) = x`, + CONV_TAC WORD_BLAST);; + +(* A lemma that is useful for extracting a 32-bit field from a join of two 64-bit words. *) +let WORD_SUBWORD_JOIN_128_32 = prove(`!(x:(64)word) (y:(64)word). + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (0,32):(32)word = + word_subword (y:(64)word) (0,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (32,32):(32)word = + word_subword (y:(64)word) (32,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (64,32):(32)word = + word_subword (x:(64)word) (0,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (96,32):(32)word = + word_subword (x:(64)word) (32,32):(32)word`, + CONV_TAC WORD_BLAST);; + +let rewrite_assumptions t tac = SUBGOAL_THEN t + (fun thm -> RULE_ASSUM_TAC (REWRITE_RULE [thm])) THENL + [tac; ALL_TAC];; + +let lemma4 = prove(`!a b c. 
+ ((a + 2 EXP 32 * (b MOD 2 EXP 32 + c MOD 2 EXP 32)) DIV 2 EXP 32) MOD 2 EXP 32 = + ((a + 2 EXP 32 * (b + c)) DIV 2 EXP 32) MOD 2 EXP 32`, + REPEAT STRIP_TAC THEN + (* Instantiate DIVISION for a and 2^32, label its two conjuncts Ha_eq and Ha_lt, and abbreviate the quotient and remainder as ahi and alo. *) MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Ha_" ^ suffix) thm) + (zip (CONJUNCTS ((MP + (SPECL [`a:num`; `2 EXP 32:num`] DIVISION) (ARITH_RULE `~(2 EXP 32 = 0)`)))) + ["eq";"lt"]) THEN + ABBREV_TAC `ahi = a DIV 2 EXP 32` THEN + ABBREV_TAC `alo = a MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + (* Regroup both sides into the shape q * 2^32 + alo so that DIV_UNIQ applies. *) REWRITE_TAC[ARITH_RULE + `(ahi * 2 EXP 32 + alo) + 2 EXP 32 * (b MOD 2 EXP 32 + c MOD 2 EXP 32) = + (ahi + b MOD 2 EXP 32 + c MOD 2 EXP 32) * 2 EXP 32 + alo`] THEN + REWRITE_TAC[ARITH_RULE + `(ahi * 2 EXP 32 + alo) + 2 EXP 32 * (b + c) = + (ahi + b + c) * 2 EXP 32 + alo`] THEN + IMP_REWRITE_TAC[DIV_UNIQ] THEN (* (A * 2^32 + B) DIV 2^32 => A, with the two quotients supplied as witnesses below *) + EXISTS_TAC `(ahi + b MOD 2 EXP 32 + c MOD 2 EXP 32)` THEN SIMP_TAC[] THEN + EXISTS_TAC `(ahi + b + c)` THEN SIMP_TAC[] THEN + CONV_TAC MOD_DOWN_CONV THEN SIMP_TAC[]);; + (* A 64-bit word_mul x y rebuilt from 32-bit pieces: the product of the zero-extended low halves, plus the sum of the two zero-extended 32-bit cross products (high y by low x, low y by high x) shifted left by 32. *) +let WORD_MUL_64_DECOMPOSED_32 = prove(`!(x:(64)word) (y:(64)word). + word_add + (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword y (0,32):(32)word):(64)word)) + (word_shl + (word_add + (word_zx (word_mul (word_subword y (32,32):(32)word) (word_subword x (0,32):(32)word))) + (word_zx (word_mul (word_subword y (0,32):(32)word) (word_subword x (32,32):(32)word)))) + 32) = + word_mul x y`, + REPEAT GEN_TAC THEN + (* word to num, step 1: turn the word equation into an equation between val of both sides *) + REWRITE_TAC[GSYM VAL_EQ] THEN + (* step 2: remove all word_* operations by rewriting with their val theorems *) + REWRITE_TAC [VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD_ZX_GEN; VAL_WORD_SUBWORD; + VAL_WORD; VAL_WORD_SHL] THEN + (* step 3: add the bounds val x < 2^64 and val y < 2^64 as assumptions *) + ASSUME_TAC (ISPECL [`x:(64)word`] VAL_BOUND) THEN + ASSUME_TAC (ISPECL [`y:(64)word`] VAL_BOUND) THEN + RULE_ASSUM_TAC (REWRITE_RULE [DIMINDEX_64]) THEN + (* step 4. 
eliminate dimindex (:N) and simplify *) + REWRITE_TAC[DIMINDEX_32;DIMINDEX_64;DIMINDEX_128;DIV_1;MOD_MOD_REFL; + MOD_MOD_EXP_MIN;ARITH_RULE `2 EXP 0 = 1`; DIV_1] THEN + CONV_TAC(DEPTH_CONV NUM_MIN_CONV) THEN + CONV_TAC MOD_DOWN_CONV THEN + (* split val x into xhi and xlo (quotient and remainder by 2^32, facts labelled Hxeq and Hxlt), and split val y likewise into yhi and ylo *) + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Hx" ^ suffix) thm) + (zip (CONJUNCTS ((MP (SPECL [`(val (x:(64)word)):num`; `2 EXP 32:num`] DIVISION) + (ARITH_RULE `~(2 EXP 32 = 0)`)))) ["eq";"lt"]) THEN + ABBREV_TAC `xhi = (val (x:(64)word)) DIV 2 EXP 32` THEN + ABBREV_TAC `xlo = (val (x:(64)word)) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Hy" ^ suffix) thm) + (zip (CONJUNCTS ((MP (SPECL [`(val (y:(64)word)):num`; `2 EXP 32:num`] DIVISION) + (ARITH_RULE `~(2 EXP 32 = 0)`)))) ["eq";"lt"]) THEN + ABBREV_TAC `yhi = (val (y:(64)word)) DIV 2 EXP 32` THEN + ABBREV_TAC `ylo = (val (y:(64)word)) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + (* lhs: expand the products by distributivity, move the 2^32 factors to the front, then simplify with MOD_MULT_ADD *) + REWRITE_TAC[LEFT_ADD_DISTRIB; RIGHT_ADD_DISTRIB] THEN + REWRITE_TAC[ + ARITH_RULE `y1hi * x1hi * 2 EXP 32 = 2 EXP 32 * y1hi * x1hi`; + ARITH_RULE `(y1hi * 2 EXP 32) * x1hi = 2 EXP 32 * y1hi * x1hi`] THEN + REWRITE_TAC[MOD_MULT_ADD] THEN + (* rhs: merge 2^32 * 2^32 into 2^64, reassociate, then simplify with MOD_MULT_ADD *) + REWRITE_TAC[MULT_ASSOC; ARITH_RULE `2 EXP 32 * 2 EXP 32 = 2 EXP 64`] THEN + REWRITE_TAC[GSYM ADD_ASSOC; GSYM MULT_ASSOC] THEN + REWRITE_TAC[MOD_MULT_ADD] THEN + (* lhs = rhs: split 2^64 back into 2^32 * 2^32, apply MOD_MULT_MOD and lemma4, and finish with a reordering of the summands *) + REWRITE_TAC[ARITH_RULE `2 EXP 64 = 2 EXP 32 * 2 EXP 32`] THEN + REWRITE_TAC[MOD_MULT_MOD] THEN + REWRITE_TAC[ARITH_RULE `2 EXP 32 * p + 2 EXP 32 * q = 2 EXP 32 * (p + q)`; MOD_MULT_ADD] THEN + REWRITE_TAC [lemma4] THEN + REWRITE_TAC [ARITH_RULE + `(xlo * ylo + 2 EXP 32 * (yhi * xlo + ylo * xhi)) DIV 2 EXP 32 = + (2 EXP 32 * xhi * ylo + 2 EXP 32 * xlo * yhi + xlo * ylo) DIV 2 EXP 32`]);; + (* Rewrite every assumption with the 128-bit subword and join lemmas, and with WORD_MUL_64_DECOMPOSED_32, which folds the 32-bit-piece form of a product back into word_mul x y. *) +let simplify_128bit_words = + RULE_ASSUM_TAC (REWRITE_RULE [ + WORD_128_SUBWORD_SUBWORD_32; WORD_SUBWORD_JOIN_64; + WORD_SUBWORD_JOIN_128_64; WORD_SUBWORD_JOIN_128_32; + 
WORD_MUL_64_DECOMPOSED_32]);; + (* Apply simplify_128bit_words, restate each 64-bit word_mul in assumptions in the form word (0 + val a * val b), then run ACCUMULATE_ARITH_TAC on the state named state_name followed by CLARIFY_TAC. *) +let simplify_128bit_words_and_accumulate state_name = + simplify_128bit_words THEN + (* Rewrite word_mul x y into the pattern that ACCUMULATE_ARITH_TAC can recognize. *) + RULE_ASSUM_TAC (REWRITE_RULE [WORD_RULE + `word_mul (a:(64)word) (b:(64)word) = + word (0 + val (a:(64)word) * val (b:(64)word))`]) THEN + ACCUMULATE_ARITH_TAC state_name THEN CLARIFY_TAC;; + (* Adding the constant word n and then the constant word m to x is the same as adding word (n+m) once. *) +let WORD_ADD_ASSOC_CONSTS = prove( + `!(x:(N)word) n m. + (word_add (word_add x (word n)) (word m)) = (word_add x (word (n+m)))`, + CONV_TAC WORD_RULE);; + (* Shared closing tactic for the multiplication subgoals of BIGNUM_KMUL_16_32_NEON_LEMMA: starts from EQUAL_FROM_CONGRUENT_REAL instantiated with 512 and &0, discharges the first side conditions with BOUNDER_TAC and REAL_INTEGER_TAC, then rewrites with the accumulator assumptions (after lemma1, lemma2 and case splits) and finishes with REAL_INTEGER_TAC. *) +let ADK_48_TAC = + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`512`; `&0:real`] THEN + REPLICATE_TAC 2 (CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC]) THEN + CONJ_TAC THENL [REAL_INTEGER_TAC; ALL_TAC] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[lemma1; lemma2] THEN REWRITE_TAC[WORD_XOR_MASK] THEN + REPEAT(COND_CASES_TAC THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT]) THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES; DIMINDEX_64] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC;; + (* Correctness of the internal subroutine entered at pc + 0x3bc: given 8-digit inputs a at x and b at y, it leaves the 16-digit product a * b at z and ends with PC equal to the address held in X30 on entry. *) +let BIGNUM_KMUL_16_32_NEON_LEMMA = prove + (`!z x y a b pc returnaddress. + ALL (nonoverlapping (z,8 * 16)) + [(word pc,2932); (x,8 * 8); (y,8 * 8)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word(pc + 0)) bignum_kmul_16_32_neon_mc /\ + read PC s = word(pc + 0x3bc) /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,8) s = a /\ + bignum_from_memory (y,8) s = b) + (\s. 
read PC s = returnaddress /\ + bignum_from_memory (z,16) s = a * b) + (MAYCHANGE [PC; X1; X2; X3; X4; X5; X6; X7; X8; + X9; X10; X11; X12; X13; X14; X15; X16; + X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 16)] ,, + MAYCHANGE SOME_FLAGS)`, + REWRITE_TAC[ADD_CLAUSES] THEN + MAP_EVERY X_GEN_TAC + [`z:int64`; `x:int64`; `y:int64`; `a:num`; `b:num`; `pc:num`; `returnaddress:int64`] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN + REWRITE_TAC[ALL; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_DIGITIZE_TAC "x_" `bignum_from_memory (x,8) s0` THEN + BIGNUM_DIGITIZE_TAC "y_" `bignum_from_memory (y,8) s0` THEN + (* Split 128-bit reads to word_join of 64-bit low and highs *) + ABBREV_TAC `x_0_1:(128)word = read (memory :> bytes128 x) s0` THEN + rewrite_assumptions `x_0_1 = word_join (x_1:(64)word) (x_0:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["x_0_1"; "x_1"; "x_0"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT]) THEN + ABBREV_TAC `x_2_3:(128)word = read (memory :> bytes128 (word_add x (word 16))) s0` THEN + rewrite_assumptions `x_2_3 = word_join (x_3:(64)word) (x_2:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["x_2_3"; "x_3"; "x_2"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + + ABBREV_TAC `y_0_1:(128)word = read (memory :> bytes128 y) s0` THEN + rewrite_assumptions `y_0_1 = word_join (y_1:(64)word) (y_0:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["y_0_1"; "y_1"; "y_0"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT]) THEN + ABBREV_TAC `y_2_3:(128)word = read (memory :> bytes128 (word_add y (word 16))) s0` THEN + rewrite_assumptions `y_2_3 = word_join (y_3:(64)word) (y_2:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["y_2_3"; "y_3"; "y_2"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + + (*** First ADK block 
multiplying the lower halves ***) + + (* Run the vectorized parts first *) + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (1--16) THEN + simplify_128bit_words_and_accumulate "s16" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (17--17) THEN + simplify_128bit_words_and_accumulate "s17" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (18--25) THEN + simplify_128bit_words_and_accumulate "s25" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (26--26) THEN + simplify_128bit_words_and_accumulate "s26" THEN + + (* Second ADK block multiplying the upper halves with q1 added: + vector loads hoisted *) + + ABBREV_TAC `x_4_5:(128)word = read (memory :> bytes128 (word_add x (word 32))) s26` THEN + rewrite_assumptions `x_4_5 = word_join (x_5:(64)word) (x_4:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["x_4_5"; "x_5"; "x_4"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + ABBREV_TAC `x_6_7:(128)word = read (memory :> bytes128 (word_add x (word 48))) s26` THEN + rewrite_assumptions `x_6_7 = word_join (x_7:(64)word) (x_6:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["x_6_7"; "x_7"; "x_6"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + ABBREV_TAC `y_4_5:(128)word = read (memory :> bytes128 (word_add y (word 32))) s26` THEN + rewrite_assumptions `y_4_5 = word_join (y_5:(64)word) (y_4:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["y_4_5"; "y_5"; "y_4"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + ABBREV_TAC `y_6_7:(128)word = read (memory :> bytes128 (word_add y (word 48))) s26` THEN + rewrite_assumptions `y_6_7 = word_join (y_7:(64)word) (y_6:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["y_6_7"; "y_7"; "y_6"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (27--30) THEN + + (* First ADK block: Run the remaining scalar 
parts (1) *) + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [32;34;36] (31--37) THEN + + (* Second ADK block: multiply using vector instructions, but not move the + results to scalar registers *) + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (38--44) THEN + simplify_128bit_words THEN + + (* First ADK block: Run the remaining scalar parts *) + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [45;46;47;48;49;50;51;52;53;54;55;56;62;67;69;70;76;81;83;84;85;86;87;88;94; + 99;101;102;103;109;114;116;117;118;119;120;126;131;133;134;135;136;142;147; + 149;150;151;152] (45--152) THEN + + MAP_EVERY ABBREV_TAC + [`q0 = bignum_of_wordlist[mullo_s16;sum_s81;sum_s114;sum_s147]`; + `q1 = bignum_of_wordlist[sum_s149;sum_s150;sum_s151;sum_s152]`] THEN + SUBGOAL_THEN + `2 EXP 256 * q1 + q0 = + bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [y_0;y_1;y_2;y_3]` + ASSUME_TAC THENL + [MAP_EVERY EXPAND_TAC ["q0"; "q1"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Second ADK block multiplying the upper halves with q1 added ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (153--161) THEN + simplify_128bit_words_and_accumulate "s161" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (162--162) THEN + simplify_128bit_words_and_accumulate "s162" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (163--170) THEN + simplify_128bit_words_and_accumulate "s170" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC [] (171--171) THEN + simplify_128bit_words_and_accumulate "s171" THEN + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [173;175;177;179;180;181;182;183;184;185;186;187;188;189;190;192;193;195; + 196;197;198;199;200;206;211;213;214;220;225;227;228;229;230;231;232;238; + 243;245;246;247;253;258;260;261;262;263;264;270;275;277;278;279;280;286; + 291;293;294;295;296] + (172--296) THEN + + MAP_EVERY ABBREV_TAC + [`q2 = 
bignum_of_wordlist[sum_s192; sum_s225; sum_s258; sum_s291]`; + `q3 = bignum_of_wordlist[sum_s293; sum_s294; sum_s295; sum_s296]`] THEN + SUBGOAL_THEN + `2 EXP 256 * q3 + q2 = + bignum_of_wordlist [x_4;x_5;x_6;x_7] * + bignum_of_wordlist [y_4;y_5;y_6;y_7] + q1` + ASSUME_TAC THENL + [MAP_EVERY EXPAND_TAC ["q1"; "q2"; "q3"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** The sign-magnitude difference computation ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [298;299;301;302;306;307;309;310;314;316;318;320;323;325;327;329] + (297--330) THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_UNMASK_64]) THEN + + MAP_EVERY ABBREV_TAC + [`sgn <=> ~(carry_s310 <=> carry_s302)`; + `xd = bignum_of_wordlist[sum_s314;sum_s316;sum_s318;sum_s320]`; + `yd = bignum_of_wordlist[sum_s323;sum_s325;sum_s327;sum_s329]`] THEN + + SUBGOAL_THEN + `(&(bignum_of_wordlist[x_4;x_5;x_6;x_7]) - + &(bignum_of_wordlist[x_0;x_1;x_2;x_3])) * + (&(bignum_of_wordlist[y_0;y_1;y_2;y_3]) - + &(bignum_of_wordlist[y_4;y_5;y_6;y_7])):real = + --(&1) pow bitval sgn * &xd * &yd` + ASSUME_TAC THENL + [TRANS_TAC EQ_TRANS + `(--(&1) pow bitval carry_s302 * &xd) * + (--(&1) pow bitval carry_s310 * &yd):real` THEN + CONJ_TAC THENL + [ALL_TAC; + EXPAND_TAC "sgn" THEN REWRITE_TAC[BITVAL_NOT; BITVAL_IFF] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN REWRITE_TAC[bitval] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[]) THEN + CONV_TAC NUM_REDUCE_CONV THEN REAL_ARITH_TAC] THEN + SUBGOAL_THEN + `(carry_s302 <=> + bignum_of_wordlist[x_4;x_5;x_6;x_7] < + bignum_of_wordlist[x_0;x_1;x_2;x_3]) /\ + (carry_s310 <=> + bignum_of_wordlist[y_0;y_1;y_2;y_3] < + bignum_of_wordlist[y_4;y_5;y_6;y_7])` + (CONJUNCTS_THEN SUBST_ALL_TAC) + THENL + [CONJ_TAC THEN MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `256` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + 
ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + BINOP_TAC THEN REWRITE_TAC[bitval] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[real_pow; REAL_MUL_LID] THEN + REWRITE_TAC[REAL_ARITH `x - y:real = --(&1) pow 1 * z <=> y - x = z`] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`256`; `&0:real`] THEN + (CONJ_TAC THENL + [MATCH_MP_TAC(REAL_ARITH + `y:real <= x /\ (&0 <= x /\ x < e) /\ (&0 <= y /\ y < e) + ==> &0 <= x - y /\ x - y < e`) THEN + ASM_SIMP_TAC[REAL_OF_NUM_CLAUSES; LT_IMP_LE; + ARITH_RULE `~(a:num < b) ==> b <= a`] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONJ_TAC THEN BOUNDER_TAC[]; + ALL_TAC] THEN + MAP_EVERY EXPAND_TAC ["xd"; "yd"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; REWRITE_TAC[INTEGER_CLOSED]]) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + ASM_REWRITE_TAC[WORD_XOR_MASK] THEN + REWRITE_TAC[REAL_VAL_WORD_NOT; BITVAL_CLAUSES; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN + + (*** Third ADK block multiplying the absolute differences ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [332;333;334;335;337;339;341;343;344;345;346;347;348;349;350;351;352;353;354;360;365;367;368;374;379;381;382;383;384;385;386;392;397;399;400;401;407;412;414;415;416;417;418;424;429;431;432;433;434;440;445;447;448;449;450] + (331--450) THEN + + SUBGOAL_THEN + `&xd * &yd:real = + &(bignum_of_wordlist + [mullo_s332; sum_s379; sum_s412; sum_s445; + sum_s447; sum_s448; sum_s449; sum_s450])` + SUBST_ALL_TAC THENL + [MAP_EVERY EXPAND_TAC ["xd"; "yd"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + 
ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Clean up the overall sign ***) + + FIRST_X_ASSUM(MP_TAC o GEN_REWRITE_RULE RAND_CONV [WORD_XOR_MASKS]) THEN + ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + + (*** Final accumulation simulation and 16-digit focusing ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [453;454;457;458;460;461;463;464;465;466;469;471;472;473;475;477;479;481;483;484;485;486;487] + (451--494) THEN + + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + DISCARD_STATE_TAC "s493" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`1024`; `&0:real`] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC] THEN CONJ_TAC THENL + [MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + + (*** The core rearrangement we are using ***) + + SUBGOAL_THEN + `&a * &b:real = + (&1 + &2 pow 256) * (&q0 + &2 pow 256 * &q2 + &2 pow 512 * &q3) + + &2 pow 256 * + (&(bignum_of_wordlist [x_4; x_5; x_6; x_7]) - + &(bignum_of_wordlist [x_0; x_1; x_2; x_3])) * + (&(bignum_of_wordlist [y_0; y_1; y_2; y_3]) - + &(bignum_of_wordlist [y_4; y_5; y_6; y_7]))` + SUBST1_TAC THENL + [MAP_EVERY UNDISCH_TAC + [`2 EXP 256 * q1 + q0 = + bignum_of_wordlist[x_0; x_1; x_2; x_3] * + bignum_of_wordlist[y_0; y_1; y_2; y_3]`; + `2 EXP 256 * q3 + q2 = + bignum_of_wordlist[x_4; x_5; x_6; x_7] * + bignum_of_wordlist[y_4; y_5; y_6; y_7] + + q1`] THEN + MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + CONV_TAC REAL_RING; + ASM_REWRITE_TAC[]] THEN + + MAP_EVERY EXPAND_TAC ["q0"; "q2"; "q3"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + 
REWRITE_TAC[WORD_XOR_MASK] THEN COND_CASES_TAC THEN + ASM_REWRITE_TAC[REAL_VAL_WORD_NOT; BITVAL_CLAUSES; DIMINDEX_64] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES] THEN DISCH_TAC THEN + + (*** A bit of manual logic for the carry connections in negative case ***) + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THENL + [SUBGOAL_THEN + `&(bitval carry_s465):real = &(bitval carry_s466)` + SUBST1_TAC THENL [ALL_TAC; REAL_INTEGER_TAC] THEN + POP_ASSUM MP_TAC THEN BOOL_CASES_TAC `carry_s465:bool` THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + REWRITE_TAC[REAL_RAT_REDUCE_CONV `(&2 pow 64 - &1) * &1 + &0`] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC; + ALL_TAC] THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o + filter (is_ratconst o rand o concl) o DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; + (* Tactic that applies BIGNUM_KMUL_16_32_NEON_LEMMA through ARM_SUBROUTINE_SIM_TAC, instantiating its variables from the current state s: z, x, y from X0, X1, X2, a and b from the 8-digit bignums in memory at X1 and X2, and returnaddress from X30. *) +let BIGNUM_KMUL_16_32_NEON_LEMMA_TAC = + ARM_SUBROUTINE_SIM_TAC + (bignum_kmul_16_32_neon_mc,BIGNUM_KMUL_16_32_NEON_EXEC, + 0x0,bignum_kmul_16_32_neon_mc,BIGNUM_KMUL_16_32_NEON_LEMMA) + [`read X0 s`; `read X1 s`; `read X2 s`; + `bignum_from_memory (read X1 s,8) s`; + `bignum_from_memory (read X2 s,8) s`; + `pc:num`; `read X30 s`];; + +(* ------------------------------------------------------------------------- *) +(* Now the main proof. *) +(* ------------------------------------------------------------------------- *) + (* Correctness of the main body, run from pc + 0x18 to pc + 0x3a0: given 16-digit inputs a at x and b at y, it leaves the 32-digit product a * b at z; the buffer t is among the memory it may change. *) +let BIGNUM_KMUL_16_32_NEON_CORRECT = prove + (`!z x y a b t pc. + nonoverlapping (z,8 * 32) (t,8 * 32) /\ + ALLPAIRS nonoverlapping + [(z,8 * 32); (t,8 * 32)] + [(word pc,2932); (x,8 * 16); (y,8 * 16)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_kmul_16_32_neon_mc /\ + read PC s = word(pc + 0x18) /\ + C_ARGUMENTS [z; x; y; t] s /\ + bignum_from_memory (x,16) s = a /\ + bignum_from_memory (y,16) s = b) + (\s. 
read PC s = word (pc + 0x3a0) /\ + bignum_from_memory (z,32) s = a * b) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; + X11; X12; X13; X14; X15; X16; X17; X19; X20; X21; + X22; X23; X24; X25; X26; X27; X28; X29; X30] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 32); + memory :> bytes(t,8 * 32)] ,, + MAYCHANGE SOME_FLAGS)`, + MAP_EVERY X_GEN_TAC + [`z:int64`; `x:int64`; `y:int64`; `a:num`; `b:num`; `t:int64`;`pc:num`] THEN + REWRITE_TAC[ALLPAIRS; ALL; PAIRWISE] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_LDIGITIZE_TAC "x_" `bignum_from_memory (x,16) s0` THEN + BIGNUM_LDIGITIZE_TAC "y_" `bignum_from_memory (y,16) s0` THEN + + (*** First nested 8x8 multiply block ***) + + ARM_STEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC (1--5) THEN + BIGNUM_KMUL_16_32_NEON_LEMMA_TAC 6 THEN + BIGNUM_LDIGITIZE_TAC "l_" `read (memory :> bytes (z,8 * 16)) s6` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x:num = y * z`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(BINOP_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** Sign-difference computation for x ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [9;10;13;14;17;18;21;22] (7--23) THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_UNMASK_64]) THEN + SUBGOAL_THEN + `bignum_of_wordlist [x_0;x_1;x_2;x_3;x_4;x_5;x_6;x_7] < + bignum_of_wordlist [x_8;x_9;x_10;x_11;x_12;x_13;x_14;x_15] <=> + carry_s22` + ASSUME_TAC THENL + [CONV_TAC SYM_CONV THEN + MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `512` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + ARM_ACCSTEPS_TAC 
BIGNUM_KMUL_16_32_NEON_EXEC + [26;28;31;33;36;38;41;43] (24--44) THEN + SUBGOAL_THEN + `&(bignum_from_memory(t,8) s44):real = + abs(&(bignum_of_wordlist [x_0;x_1;x_2;x_3;x_4;x_5;x_6;x_7]) - + &(bignum_of_wordlist [x_8;x_9;x_10;x_11;x_12;x_13;x_14;x_15]))` + MP_TAC THENL + [MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 8`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN CONJ_TAC THEN + MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &x < &y then &y - &x else &x - &y`] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_UNMASK_64; WORD_XOR_MASK] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV(RAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM)] THEN + + (*** Second nested 8x8 multiply ***) + + ARM_STEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC (45--48) THEN + BIGNUM_KMUL_16_32_NEON_LEMMA_TAC 49 THEN + BIGNUM_LDIGITIZE_TAC "h_" + `read (memory :> bytes (word_add z (word 128),8 * 16)) s49` THEN + 
FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x:num = y * z`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(BINOP_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** Sign-difference computation for y ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [52;53;56;57;60;61;64;65] (50--66) THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_UNMASK_64]) THEN + SUBGOAL_THEN + `bignum_of_wordlist [y_8;y_9;y_10;y_11;y_12;y_13;y_14;y_15] < + bignum_of_wordlist [y_0;y_1;y_2;y_3;y_4;y_5;y_6;y_7] <=> + carry_s65` + ASSUME_TAC THENL + [CONV_TAC SYM_CONV THEN + MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `512` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [69;71;74;76;79;81;84;86] (67--88) THEN + SUBGOAL_THEN + `&(bignum_from_memory(word_add t (word 64),8) s88):real = + abs(&(bignum_of_wordlist [y_0;y_1;y_2;y_3;y_4;y_5;y_6;y_7]) - + &(bignum_of_wordlist [y_8;y_9;y_10;y_11;y_12;y_13;y_14;y_15]))` + MP_TAC THENL + [MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 8`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN CONJ_TAC THEN + MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &y < &x then &x - &y else &y - &x`] THEN + 
REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_UNMASK_64; WORD_XOR_MASK] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV(RAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM)] THEN + + (*** Collected sign ***) + + RULE_ASSUM_TAC(REWRITE_RULE[WORD_XOR_MASKS]) THEN + ABBREV_TAC `sgn <=> ~(carry_s22 <=> carry_s65)` THEN + + (*** Computation of H' = H + L_top ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [91;92;96;97;101;102;106;107;110;111;114;115;118;119;122;123] + (89--124) THEN + SUBGOAL_THEN + `bignum_from_memory(word_add z (word 128),16) s124 = + bignum_of_wordlist + [h_0;h_1;h_2;h_3;h_4;h_5;h_6;h_7;h_8;h_9;h_10;h_11;h_12;h_13;h_14;h_15] + + bignum_of_wordlist[l_8;l_9;l_10;l_11;l_12;l_13;l_14;l_15]` + MP_TAC THENL + [REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 16`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN FIRST_X_ASSUM(fun th -> + GEN_REWRITE_TAC (LAND_CONV o LAND_CONV) [SYM th]) THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + ASM_REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN 
+ ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV)) THEN + ASM_REWRITE_TAC[] THEN DISCH_TAC] THEN + + (*** Third and final nested multiply ***) + + ARM_STEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC (125--128) THEN + BIGNUM_KMUL_16_32_NEON_LEMMA_TAC 129 THEN + BIGNUM_LDIGITIZE_TAC "m_" + `read (memory :> bytes (word_add t (word 128),8 * 16)) s129` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x:num = y * z`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(BINOP_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** All remaining accumulation of sub-results ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_16_32_NEON_EXEC + [132; 133; 136; 137; 140; 141; 144; 145; 148; 149; 152; 153; 156; + 157; 160; 161; 166; 168; 172; 174; 178; 180; 184; 186; 190; 192; + 196; 198; 202; 204; 208; 210; 212; 213; 215; 216; 219; 220; 223; + 224; 227; 228] + (130--229) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 32`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + SUBGOAL_THEN + `(&a:real) * &b = + (&(bignum_of_wordlist[l_0; l_1; l_2; l_3; l_4; l_5; l_6; l_7]) + + &2 pow 512 * + &(bignum_of_wordlist + [sum_s91; sum_s92; sum_s96; sum_s97; sum_s101; sum_s102; sum_s106; + sum_s107; sum_s110; sum_s111; sum_s114; sum_s115; sum_s118; sum_s119; + sum_s122; sum_s123])) * + (&2 pow 
512 + &1) + + &2 pow 512 * + --(&1) pow bitval sgn * + &(bignum_of_wordlist + [m_0; m_1; m_2; m_3; m_4; m_5; m_6; m_7; m_8; m_9; m_10; m_11; m_12; + m_13; m_14; m_15])` + SUBST1_TAC THENL + [ASM_REWRITE_TAC[] THEN REWRITE_TAC[REAL_OF_NUM_CLAUSES; ARITH_RULE + `l + e * (h + m):num = (l + e * m) + e * h`] THEN + REWRITE_TAC[GSYM(BIGNUM_OF_WORDLIST_SPLIT_RULE(8,8))] THEN + REPEAT(FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (ARITH_RULE + `w * z:num = y ==> y = w * z`))) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + REPEAT(FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (MESON[] + `abs x:real = y ==> y = abs x`))) THEN + ONCE_REWRITE_TAC[MESON[REAL_ABS_NEG] + `abs x * abs y:real = abs x * abs(--y)`] THEN + REWRITE_TAC[REAL_NEG_SUB; REAL_ARITH + `abs(x - x'):real = if x < x' then x' - x else x - x'`] THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT] THEN + MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[BIGNUM_OF_WORDLIST_SPLIT_RULE(8,8)] THEN + EXPAND_TAC "sgn" THEN POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REAL_ARITH_TAC; + ALL_TAC] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_LEXPAND_CONV) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_XOR_MASK] THEN POP_ASSUM_LIST(K ALL_TAC) THEN + BOOL_CASES_TAC `sgn:bool` THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + REWRITE_TAC[COND_SWAP; GSYM WORD_BITVAL; VAL_WORD_BITVAL] THEN STRIP_TAC THEN + ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o + filter (free_in `carry_s212:bool` o concl)) + THENL + [ASM_CASES_TAC `carry_s212:bool` THEN 
ASM_REWRITE_TAC[BITVAL_CLAUSES]; + ALL_TAC] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; + (* Correctness of the whole subroutine, from entry at pc with the return address in X30 until PC equals that return address: derived from BIGNUM_KMUL_16_32_NEON_CORRECT by ARM_ADD_RETURN_STACK_TAC with the register list X19-X30 and 96 bytes of stack below stackpointer (presumably the registers the code saves and restores on the stack; confirm against the tactic). *) +let BIGNUM_KMUL_16_32_NEON_SUBROUTINE_CORRECT = prove + (`!z x y a b t pc stackpointer returnaddress. + aligned 16 stackpointer /\ + PAIRWISE nonoverlapping + [(z,8 * 32); (t,8 * 32); (word_sub stackpointer (word 96),96)] /\ + ALLPAIRS nonoverlapping + [(z,8 * 32); (t,8 * 32); (word_sub stackpointer (word 96),96)] + [(word pc,2932); (x,8 * 16); (y,8 * 16)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_kmul_16_32_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y; t] s /\ + bignum_from_memory (x,16) s = a /\ + bignum_from_memory (y,16) s = b) + (\s. read PC s = returnaddress /\ + bignum_from_memory (z,32) s = a * b) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; + X11; X12; X13; X14; X15; X16; X17] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 32); + memory :> bytes(t,8 * 32); + memory :> bytes(word_sub stackpointer (word 96),96)] ,, + MAYCHANGE SOME_FLAGS)`, + ARM_ADD_RETURN_STACK_TAC + BIGNUM_KMUL_16_32_NEON_EXEC BIGNUM_KMUL_16_32_NEON_CORRECT + `[X19;X20;X21;X22;X23;X24;X25;X26;X27;X28;X29;X30]` 96);; diff --git a/arm/proofs/bignum_kmul_32_64_neon.ml b/arm/proofs/bignum_kmul_32_64_neon.ml new file mode 100644 index 00000000..b99d4029 --- /dev/null +++ b/arm/proofs/bignum_kmul_32_64_neon.ml @@ -0,0 +1,2422 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC + *) + +(* ========================================================================= *) +(* 32x32 -> 64 multiplication, using Karatsuba reduction. 
*) +(* ========================================================================= *) + +(**** print_literal_from_elf "arm/fastmul/bignum_kmul_32_64_neon.o";; + ****) + +let bignum_kmul_32_64_neon_mc = define_assert_from_elf "bignum_kmul_32_64_neon_mc" "arm/fastmul/bignum_kmul_32_64_neon.o" +[ + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf5bf5; (* arm_STP X21 X22 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf63f7; (* arm_STP X23 X24 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf6bf9; (* arm_STP X25 X26 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf73fb; (* arm_STP X27 X28 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf7bfd; (* arm_STP X29 X30 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xaa0003f3; (* arm_MOV X19 X0 *) + 0xaa0103f4; (* arm_MOV X20 X1 *) + 0xaa0203f5; (* arm_MOV X21 X2 *) + 0xaa0303f6; (* arm_MOV X22 X3 *) + 0x940001d3; (* arm_BL (word 1868) *) + 0x91040260; (* arm_ADD X0 X19 (rvalue (word 256)) *) + 0x91020281; (* arm_ADD X1 X20 (rvalue (word 128)) *) + 0x910202a2; (* arm_ADD X2 X21 (rvalue (word 128)) *) + 0xaa1603e3; (* arm_MOV X3 X22 *) + 0x940001ce; (* arm_BL (word 1848) *) + 0xa9480680; (* arm_LDP X0 X1 X20 (Immediate_Offset (iword (&128))) *) + 0xa9404690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&0))) *) + 0xeb100000; (* arm_SUBS X0 X0 X16 *) + 0xfa110021; (* arm_SBCS X1 X1 X17 *) + 0xa9490e82; (* arm_LDP X2 X3 X20 (Immediate_Offset (iword (&144))) *) + 0xa9414690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&16))) *) + 0xfa100042; (* arm_SBCS X2 X2 X16 *) + 0xfa110063; (* arm_SBCS X3 X3 X17 *) + 0xa94a1684; (* arm_LDP X4 X5 X20 (Immediate_Offset (iword (&160))) *) + 0xa9424690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&32))) *) + 0xfa100084; (* arm_SBCS X4 X4 X16 *) + 0xfa1100a5; (* arm_SBCS X5 X5 X17 *) + 0xa94b1e86; (* arm_LDP X6 X7 X20 (Immediate_Offset (iword (&176))) *) + 0xa9434690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&48))) *) + 
0xfa1000c6; (* arm_SBCS X6 X6 X16 *) + 0xfa1100e7; (* arm_SBCS X7 X7 X17 *) + 0xa94c2688; (* arm_LDP X8 X9 X20 (Immediate_Offset (iword (&192))) *) + 0xa9444690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&64))) *) + 0xfa100108; (* arm_SBCS X8 X8 X16 *) + 0xfa110129; (* arm_SBCS X9 X9 X17 *) + 0xa94d2e8a; (* arm_LDP X10 X11 X20 (Immediate_Offset (iword (&208))) *) + 0xa9454690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&80))) *) + 0xfa10014a; (* arm_SBCS X10 X10 X16 *) + 0xfa11016b; (* arm_SBCS X11 X11 X17 *) + 0xa94e368c; (* arm_LDP X12 X13 X20 (Immediate_Offset (iword (&224))) *) + 0xa9464690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&96))) *) + 0xfa10018c; (* arm_SBCS X12 X12 X16 *) + 0xfa1101ad; (* arm_SBCS X13 X13 X17 *) + 0xa94f3e8e; (* arm_LDP X14 X15 X20 (Immediate_Offset (iword (&240))) *) + 0xa9474690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&112))) *) + 0xfa1001ce; (* arm_SBCS X14 X14 X16 *) + 0xfa1101ef; (* arm_SBCS X15 X15 X17 *) + 0xda1f03f4; (* arm_NGC X20 XZR *) + 0xab14029f; (* arm_CMN X20 X20 *) + 0xca140000; (* arm_EOR X0 X0 X20 *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xca140021; (* arm_EOR X1 X1 X20 *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa90006c0; (* arm_STP X0 X1 X22 (Immediate_Offset (iword (&0))) *) + 0xca140042; (* arm_EOR X2 X2 X20 *) + 0xba1f0042; (* arm_ADCS X2 X2 XZR *) + 0xca140063; (* arm_EOR X3 X3 X20 *) + 0xba1f0063; (* arm_ADCS X3 X3 XZR *) + 0xa9010ec2; (* arm_STP X2 X3 X22 (Immediate_Offset (iword (&16))) *) + 0xca140084; (* arm_EOR X4 X4 X20 *) + 0xba1f0084; (* arm_ADCS X4 X4 XZR *) + 0xca1400a5; (* arm_EOR X5 X5 X20 *) + 0xba1f00a5; (* arm_ADCS X5 X5 XZR *) + 0xa90216c4; (* arm_STP X4 X5 X22 (Immediate_Offset (iword (&32))) *) + 0xca1400c6; (* arm_EOR X6 X6 X20 *) + 0xba1f00c6; (* arm_ADCS X6 X6 XZR *) + 0xca1400e7; (* arm_EOR X7 X7 X20 *) + 0xba1f00e7; (* arm_ADCS X7 X7 XZR *) + 0xa9031ec6; (* arm_STP X6 X7 X22 (Immediate_Offset (iword (&48))) *) + 0xca140108; (* arm_EOR X8 X8 X20 *) + 
0xba1f0108; (* arm_ADCS X8 X8 XZR *) + 0xca140129; (* arm_EOR X9 X9 X20 *) + 0xba1f0129; (* arm_ADCS X9 X9 XZR *) + 0xa90426c8; (* arm_STP X8 X9 X22 (Immediate_Offset (iword (&64))) *) + 0xca14014a; (* arm_EOR X10 X10 X20 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xca14016b; (* arm_EOR X11 X11 X20 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa9052eca; (* arm_STP X10 X11 X22 (Immediate_Offset (iword (&80))) *) + 0xca14018c; (* arm_EOR X12 X12 X20 *) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1401ad; (* arm_EOR X13 X13 X20 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa90636cc; (* arm_STP X12 X13 X22 (Immediate_Offset (iword (&96))) *) + 0xca1401ce; (* arm_EOR X14 X14 X20 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1401ef; (* arm_EOR X15 X15 X20 *) + 0x9a1f01ef; (* arm_ADC X15 X15 XZR *) + 0xa9073ece; (* arm_STP X14 X15 X22 (Immediate_Offset (iword (&112))) *) + 0xa94006a0; (* arm_LDP X0 X1 X21 (Immediate_Offset (iword (&0))) *) + 0xa94846b0; (* arm_LDP X16 X17 X21 (Immediate_Offset (iword (&128))) *) + 0xeb100000; (* arm_SUBS X0 X0 X16 *) + 0xfa110021; (* arm_SBCS X1 X1 X17 *) + 0xa9410ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&16))) *) + 0xa94946b0; (* arm_LDP X16 X17 X21 (Immediate_Offset (iword (&144))) *) + 0xfa100042; (* arm_SBCS X2 X2 X16 *) + 0xfa110063; (* arm_SBCS X3 X3 X17 *) + 0xa94216a4; (* arm_LDP X4 X5 X21 (Immediate_Offset (iword (&32))) *) + 0xa94a46b0; (* arm_LDP X16 X17 X21 (Immediate_Offset (iword (&160))) *) + 0xfa100084; (* arm_SBCS X4 X4 X16 *) + 0xfa1100a5; (* arm_SBCS X5 X5 X17 *) + 0xa9431ea6; (* arm_LDP X6 X7 X21 (Immediate_Offset (iword (&48))) *) + 0xa94b46b0; (* arm_LDP X16 X17 X21 (Immediate_Offset (iword (&176))) *) + 0xfa1000c6; (* arm_SBCS X6 X6 X16 *) + 0xfa1100e7; (* arm_SBCS X7 X7 X17 *) + 0xa94426a8; (* arm_LDP X8 X9 X21 (Immediate_Offset (iword (&64))) *) + 0xa94c46b0; (* arm_LDP X16 X17 X21 (Immediate_Offset (iword (&192))) *) + 0xfa100108; (* arm_SBCS X8 X8 X16 *) + 0xfa110129; (* arm_SBCS X9 X9 
X17 *) + 0xa9452eaa; (* arm_LDP X10 X11 X21 (Immediate_Offset (iword (&80))) *) + 0xa94d46b0; (* arm_LDP X16 X17 X21 (Immediate_Offset (iword (&208))) *) + 0xfa10014a; (* arm_SBCS X10 X10 X16 *) + 0xfa11016b; (* arm_SBCS X11 X11 X17 *) + 0xa94636ac; (* arm_LDP X12 X13 X21 (Immediate_Offset (iword (&96))) *) + 0xa94e46b0; (* arm_LDP X16 X17 X21 (Immediate_Offset (iword (&224))) *) + 0xfa10018c; (* arm_SBCS X12 X12 X16 *) + 0xfa1101ad; (* arm_SBCS X13 X13 X17 *) + 0xa9473eae; (* arm_LDP X14 X15 X21 (Immediate_Offset (iword (&112))) *) + 0xa94f46b0; (* arm_LDP X16 X17 X21 (Immediate_Offset (iword (&240))) *) + 0xfa1001ce; (* arm_SBCS X14 X14 X16 *) + 0xfa1101ef; (* arm_SBCS X15 X15 X17 *) + 0xda1f03f5; (* arm_NGC X21 XZR *) + 0xab1502bf; (* arm_CMN X21 X21 *) + 0xca150000; (* arm_EOR X0 X0 X21 *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xca150021; (* arm_EOR X1 X1 X21 *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa90806c0; (* arm_STP X0 X1 X22 (Immediate_Offset (iword (&128))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba1f0042; (* arm_ADCS X2 X2 XZR *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba1f0063; (* arm_ADCS X3 X3 XZR *) + 0xa9090ec2; (* arm_STP X2 X3 X22 (Immediate_Offset (iword (&144))) *) + 0xca150084; (* arm_EOR X4 X4 X21 *) + 0xba1f0084; (* arm_ADCS X4 X4 XZR *) + 0xca1500a5; (* arm_EOR X5 X5 X21 *) + 0xba1f00a5; (* arm_ADCS X5 X5 XZR *) + 0xa90a16c4; (* arm_STP X4 X5 X22 (Immediate_Offset (iword (&160))) *) + 0xca1500c6; (* arm_EOR X6 X6 X21 *) + 0xba1f00c6; (* arm_ADCS X6 X6 XZR *) + 0xca1500e7; (* arm_EOR X7 X7 X21 *) + 0xba1f00e7; (* arm_ADCS X7 X7 XZR *) + 0xa90b1ec6; (* arm_STP X6 X7 X22 (Immediate_Offset (iword (&176))) *) + 0xca150108; (* arm_EOR X8 X8 X21 *) + 0xba1f0108; (* arm_ADCS X8 X8 XZR *) + 0xca150129; (* arm_EOR X9 X9 X21 *) + 0xba1f0129; (* arm_ADCS X9 X9 XZR *) + 0xa90c26c8; (* arm_STP X8 X9 X22 (Immediate_Offset (iword (&192))) *) + 0xca15014a; (* arm_EOR X10 X10 X21 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xca15016b; (* 
arm_EOR X11 X11 X21 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90d2eca; (* arm_STP X10 X11 X22 (Immediate_Offset (iword (&208))) *) + 0xca15018c; (* arm_EOR X12 X12 X21 *) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1501ad; (* arm_EOR X13 X13 X21 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa90e36cc; (* arm_STP X12 X13 X22 (Immediate_Offset (iword (&224))) *) + 0xca1501ce; (* arm_EOR X14 X14 X21 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1501ef; (* arm_EOR X15 X15 X21 *) + 0x9a1f01ef; (* arm_ADC X15 X15 XZR *) + 0xa90f3ece; (* arm_STP X14 X15 X22 (Immediate_Offset (iword (&240))) *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xa9500660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9480e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&128))) *) + 0xab020000; (* arm_ADDS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9100660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9510660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9490e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&144))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9110660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9520660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa94a0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&160))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9120660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9530660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa94b0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&176))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9130660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9540660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa94c0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&192))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) 
+ 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9140660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9550660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa94d0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&208))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9150660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9560660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa94e0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&224))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9160660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa9570660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa94f0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&240))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9170660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa9580660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&384))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa9180660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&384))) *) + 0xa9590660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&400))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa9190660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&400))) *) + 0xa95a0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&416))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91a0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&416))) *) + 0xa95b0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&432))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91b0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&432))) *) + 0xa95c0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&448))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; 
(* arm_ADCS X1 X1 XZR *) + 0xa91c0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&448))) *) + 0xa95d0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&464))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91d0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&464))) *) + 0xa95e0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&480))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91e0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&480))) *) + 0xa95f0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&496))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0x9a1f0021; (* arm_ADC X1 X1 XZR *) + 0xa91f0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&496))) *) + 0x910402c0; (* arm_ADD X0 X22 (rvalue (word 256)) *) + 0xaa1603e1; (* arm_MOV X1 X22 *) + 0x910202c2; (* arm_ADD X2 X22 (rvalue (word 128)) *) + 0x910802c3; (* arm_ADD X3 X22 (rvalue (word 512)) *) + 0x940000ec; (* arm_BL (word 944) *) + 0xa9500660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9400e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&0))) *) + 0xab020000; (* arm_ADDS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9080660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&128))) *) + 0xa9510660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9410e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&16))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9090660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&144))) *) + 0xa9520660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9420e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&32))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90a0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&160))) *) + 0xa9530660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9430e62; (* arm_LDP X2 X3 X19 (Immediate_Offset 
(iword (&48))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90b0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&176))) *) + 0xa9540660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9440e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&64))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90c0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&192))) *) + 0xa9550660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9450e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&80))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90d0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&208))) *) + 0xa9560660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa9460e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&96))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90e0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&224))) *) + 0xa9570660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa9470e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&112))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90f0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&240))) *) + 0xa9500660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9580e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&384))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9100660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9510660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9590e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&400))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9110660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9520660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 
0xa95a0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&416))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9120660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9530660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa95b0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&432))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9130660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9540660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa95c0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&448))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9140660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9550660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa95d0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&464))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9150660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9560660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa95e0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&480))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9160660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa9570660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa95f0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&496))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9170660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0x9a9f37f4; (* arm_CSET X20 Condition_CS *) + 0xab1502bf; (* arm_CMN X21 X21 *) + 0xa9480660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&128))) *) + 0xa9500ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&256))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* 
arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9080660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&128))) *) + 0xa9490660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&144))) *) + 0xa9510ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&272))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9090660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&144))) *) + 0xa94a0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&160))) *) + 0xa9520ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&288))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90a0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&160))) *) + 0xa94b0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&176))) *) + 0xa9530ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&304))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90b0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&176))) *) + 0xa94c0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&192))) *) + 0xa9540ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&320))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90c0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&192))) *) + 0xa94d0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&208))) *) + 0xa9550ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&336))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90d0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&208))) *) + 0xa94e0660; (* arm_LDP X0 
X1 X19 (Immediate_Offset (iword (&224))) *) + 0xa9560ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&352))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90e0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&224))) *) + 0xa94f0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&240))) *) + 0xa9570ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&368))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90f0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&240))) *) + 0xa9500660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9580ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&384))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9100660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9510660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9590ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&400))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9110660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9520660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa95a0ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&416))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9120660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9530660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa95b0ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&432))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 
0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9130660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9540660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa95c0ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&448))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9140660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9550660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa95d0ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&464))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9150660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9560660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa95e0ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&480))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9160660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa9570660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa95f0ec2; (* arm_LDP X2 X3 X22 (Immediate_Offset (iword (&496))) *) + 0xca150042; (* arm_EOR X2 X2 X21 *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xca150063; (* arm_EOR X3 X3 X21 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9170660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xba1402b4; (* arm_ADCS X20 X21 X20 *) + 0x9a1f02b0; (* arm_ADC X16 X21 XZR *) + 0xa9580660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&384))) *) + 0xab140000; (* arm_ADDS X0 X0 X20 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa9180660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&384))) *) + 0xa9590660; (* arm_LDP X0 X1 X19 
(Immediate_Offset (iword (&400))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa9190660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&400))) *) + 0xa95a0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&416))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91a0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&416))) *) + 0xa95b0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&432))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91b0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&432))) *) + 0xa95c0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&448))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91c0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&448))) *) + 0xa95d0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&464))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91d0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&464))) *) + 0xa95e0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&480))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91e0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&480))) *) + 0xa95f0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&496))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0x9a100021; (* arm_ADC X1 X1 X16 *) + 0xa91f0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&496))) *) + 0xa8c17bfd; (* arm_LDP X29 X30 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c173fb; (* arm_LDP X27 X28 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c16bf9; (* arm_LDP X25 X26 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c163f7; (* arm_LDP X23 X24 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c15bf5; (* arm_LDP X21 X22 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0; (* arm_RET 
X30 *) + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf5bf5; (* arm_STP X21 X22 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf7bf7; (* arm_STP X23 X30 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xaa0003f9; (* arm_MOV X25 X0 *) + 0xaa0103fa; (* arm_MOV X26 X1 *) + 0xaa0203fb; (* arm_MOV X27 X2 *) + 0xaa0303fc; (* arm_MOV X28 X3 *) + 0x940000e2; (* arm_BL (word 904) *) + 0xa9402f4a; (* arm_LDP X10 X11 X26 (Immediate_Offset (iword (&0))) *) + 0xa9442748; (* arm_LDP X8 X9 X26 (Immediate_Offset (iword (&64))) *) + 0xeb08014a; (* arm_SUBS X10 X10 X8 *) + 0xfa09016b; (* arm_SBCS X11 X11 X9 *) + 0xa941374c; (* arm_LDP X12 X13 X26 (Immediate_Offset (iword (&16))) *) + 0xa9452748; (* arm_LDP X8 X9 X26 (Immediate_Offset (iword (&80))) *) + 0xfa08018c; (* arm_SBCS X12 X12 X8 *) + 0xfa0901ad; (* arm_SBCS X13 X13 X9 *) + 0xa9423f4e; (* arm_LDP X14 X15 X26 (Immediate_Offset (iword (&32))) *) + 0xa9462748; (* arm_LDP X8 X9 X26 (Immediate_Offset (iword (&96))) *) + 0xfa0801ce; (* arm_SBCS X14 X14 X8 *) + 0xfa0901ef; (* arm_SBCS X15 X15 X9 *) + 0xa9434750; (* arm_LDP X16 X17 X26 (Immediate_Offset (iword (&48))) *) + 0xa9472748; (* arm_LDP X8 X9 X26 (Immediate_Offset (iword (&112))) *) + 0xfa080210; (* arm_SBCS X16 X16 X8 *) + 0xfa090231; (* arm_SBCS X17 X17 X9 *) + 0xda9f23fd; (* arm_CSETM X29 Condition_CC *) + 0xab1d03bf; (* arm_CMN X29 X29 *) + 0xca1d014a; (* arm_EOR X10 X10 X29 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xca1d016b; (* arm_EOR X11 X11 X29 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa9002f8a; (* arm_STP X10 X11 X28 (Immediate_Offset (iword (&0))) *) + 0xca1d018c; (* arm_EOR X12 X12 X29 *) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1d01ad; (* arm_EOR X13 X13 X29 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa901378c; (* arm_STP X12 X13 X28 (Immediate_Offset (iword (&16))) *) + 0xca1d01ce; (* arm_EOR X14 X14 X29 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1d01ef; (* arm_EOR X15 X15 X29 *) + 
0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xa9023f8e; (* arm_STP X14 X15 X28 (Immediate_Offset (iword (&32))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0xa9034790; (* arm_STP X16 X17 X28 (Immediate_Offset (iword (&48))) *) + 0x91020320; (* arm_ADD X0 X25 (rvalue (word 128)) *) + 0x91010341; (* arm_ADD X1 X26 (rvalue (word 64)) *) + 0x91010362; (* arm_ADD X2 X27 (rvalue (word 64)) *) + 0x940000b8; (* arm_BL (word 736) *) + 0xa9402f6a; (* arm_LDP X10 X11 X27 (Immediate_Offset (iword (&0))) *) + 0xa9442768; (* arm_LDP X8 X9 X27 (Immediate_Offset (iword (&64))) *) + 0xeb0a010a; (* arm_SUBS X10 X8 X10 *) + 0xfa0b012b; (* arm_SBCS X11 X9 X11 *) + 0xa941376c; (* arm_LDP X12 X13 X27 (Immediate_Offset (iword (&16))) *) + 0xa9452768; (* arm_LDP X8 X9 X27 (Immediate_Offset (iword (&80))) *) + 0xfa0c010c; (* arm_SBCS X12 X8 X12 *) + 0xfa0d012d; (* arm_SBCS X13 X9 X13 *) + 0xa9423f6e; (* arm_LDP X14 X15 X27 (Immediate_Offset (iword (&32))) *) + 0xa9462768; (* arm_LDP X8 X9 X27 (Immediate_Offset (iword (&96))) *) + 0xfa0e010e; (* arm_SBCS X14 X8 X14 *) + 0xfa0f012f; (* arm_SBCS X15 X9 X15 *) + 0xa9434770; (* arm_LDP X16 X17 X27 (Immediate_Offset (iword (&48))) *) + 0xa9472768; (* arm_LDP X8 X9 X27 (Immediate_Offset (iword (&112))) *) + 0xfa100110; (* arm_SBCS X16 X8 X16 *) + 0xfa110131; (* arm_SBCS X17 X9 X17 *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xab13027f; (* arm_CMN X19 X19 *) + 0xca13014a; (* arm_EOR X10 X10 X19 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xca13016b; (* arm_EOR X11 X11 X19 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa9042f8a; (* arm_STP X10 X11 X28 (Immediate_Offset (iword (&64))) *) + 0xca13018c; (* arm_EOR X12 X12 X19 *) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1301ad; (* arm_EOR X13 X13 X19 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa905378c; (* arm_STP X12 X13 X28 (Immediate_Offset (iword (&80))) *) + 
0xca1301ce; (* arm_EOR X14 X14 X19 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1301ef; (* arm_EOR X15 X15 X19 *) + 0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xa9063f8e; (* arm_STP X14 X15 X28 (Immediate_Offset (iword (&96))) *) + 0xca130210; (* arm_EOR X16 X16 X19 *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xca130231; (* arm_EOR X17 X17 X19 *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0xa9074790; (* arm_STP X16 X17 X28 (Immediate_Offset (iword (&112))) *) + 0xca1303bd; (* arm_EOR X29 X29 X19 *) + 0xa9482f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&128))) *) + 0xa944372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&64))) *) + 0xab0c014a; (* arm_ADDS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa9082f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&128))) *) + 0xa9492f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&144))) *) + 0xa945372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&80))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa9092f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&144))) *) + 0xa94a2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&160))) *) + 0xa946372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&96))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa90a2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&160))) *) + 0xa94b2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&176))) *) + 0xa947372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&112))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa90b2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&176))) *) + 0xa94c2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&192))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90c2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&192))) *) + 0xa94d2f2a; (* arm_LDP X10 X11 X25 
(Immediate_Offset (iword (&208))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90d2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&208))) *) + 0xa94e2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&224))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90e2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&224))) *) + 0xa94f2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&240))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90f2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&240))) *) + 0x91020380; (* arm_ADD X0 X28 (rvalue (word 128)) *) + 0xaa1c03e1; (* arm_MOV X1 X28 *) + 0x91010382; (* arm_ADD X2 X28 (rvalue (word 64)) *) + 0x94000069; (* arm_BL (word 420) *) + 0xa9400720; (* arm_LDP X0 X1 X25 (Immediate_Offset (iword (&0))) *) + 0xa9484730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&128))) *) + 0xab100000; (* arm_ADDS X0 X0 X16 *) + 0xba110021; (* arm_ADCS X1 X1 X17 *) + 0xa9410f22; (* arm_LDP X2 X3 X25 (Immediate_Offset (iword (&16))) *) + 0xa9494730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&144))) *) + 0xba100042; (* arm_ADCS X2 X2 X16 *) + 0xba110063; (* arm_ADCS X3 X3 X17 *) + 0xa9421724; (* arm_LDP X4 X5 X25 (Immediate_Offset (iword (&32))) *) + 0xa94a4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&160))) *) + 0xba100084; (* arm_ADCS X4 X4 X16 *) + 0xba1100a5; (* arm_ADCS X5 X5 X17 *) + 0xa9431f26; (* arm_LDP X6 X7 X25 (Immediate_Offset (iword (&48))) *) + 0xa94b4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&176))) *) + 0xba1000c6; (* arm_ADCS X6 X6 X16 *) + 0xba1100e7; (* arm_ADCS X7 X7 X17 *) + 0xa9482728; (* arm_LDP X8 X9 X25 (Immediate_Offset (iword (&128))) *) + 0xa94c4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&192))) *) + 0xba100108; (* arm_ADCS X8 X8 X16 *) + 0xba110129; (* arm_ADCS X9 X9 X17 *) + 0xa9492f2a; (* arm_LDP X10 X11 X25 
(Immediate_Offset (iword (&144))) *) + 0xa94d4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&208))) *) + 0xba10014a; (* arm_ADCS X10 X10 X16 *) + 0xba11016b; (* arm_ADCS X11 X11 X17 *) + 0xa94a372c; (* arm_LDP X12 X13 X25 (Immediate_Offset (iword (&160))) *) + 0xa94e4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&224))) *) + 0xba10018c; (* arm_ADCS X12 X12 X16 *) + 0xba1101ad; (* arm_ADCS X13 X13 X17 *) + 0xa94b3f2e; (* arm_LDP X14 X15 X25 (Immediate_Offset (iword (&176))) *) + 0xa94f4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&240))) *) + 0xba1001ce; (* arm_ADCS X14 X14 X16 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0x9a9f37fa; (* arm_CSET X26 Condition_CS *) + 0xab1d03bf; (* arm_CMN X29 X29 *) + 0xa9484790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&128))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba110021; (* arm_ADCS X1 X1 X17 *) + 0xa9040720; (* arm_STP X0 X1 X25 (Immediate_Offset (iword (&64))) *) + 0xa9494790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&144))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba100042; (* arm_ADCS X2 X2 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba110063; (* arm_ADCS X3 X3 X17 *) + 0xa9050f22; (* arm_STP X2 X3 X25 (Immediate_Offset (iword (&80))) *) + 0xa94a4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&160))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba100084; (* arm_ADCS X4 X4 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1100a5; (* arm_ADCS X5 X5 X17 *) + 0xa9061724; (* arm_STP X4 X5 X25 (Immediate_Offset (iword (&96))) *) + 0xa94b4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&176))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba1000c6; (* arm_ADCS X6 X6 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1100e7; (* arm_ADCS X7 X7 X17 *) + 0xa9071f26; (* arm_STP X6 X7 X25 (Immediate_Offset (iword (&112))) *) + 0xa94c4790; (* arm_LDP X16 X17 X28 
(Immediate_Offset (iword (&192))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba100108; (* arm_ADCS X8 X8 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba110129; (* arm_ADCS X9 X9 X17 *) + 0xa9082728; (* arm_STP X8 X9 X25 (Immediate_Offset (iword (&128))) *) + 0xa94d4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&208))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba10014a; (* arm_ADCS X10 X10 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba11016b; (* arm_ADCS X11 X11 X17 *) + 0xa9092f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&144))) *) + 0xa94e4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&224))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba10018c; (* arm_ADCS X12 X12 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1101ad; (* arm_ADCS X13 X13 X17 *) + 0xa90a372c; (* arm_STP X12 X13 X25 (Immediate_Offset (iword (&160))) *) + 0xa94f4790; (* arm_LDP X16 X17 X28 (Immediate_Offset (iword (&240))) *) + 0xca1d0210; (* arm_EOR X16 X16 X29 *) + 0xba1001ce; (* arm_ADCS X14 X14 X16 *) + 0xca1d0231; (* arm_EOR X17 X17 X29 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0xa90b3f2e; (* arm_STP X14 X15 X25 (Immediate_Offset (iword (&176))) *) + 0xba1a03bb; (* arm_ADCS X27 X29 X26 *) + 0x9a1f03bc; (* arm_ADC X28 X29 XZR *) + 0xa94c2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&192))) *) + 0xab1b014a; (* arm_ADDS X10 X10 X27 *) + 0xba1c016b; (* arm_ADCS X11 X11 X28 *) + 0xa90c2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&192))) *) + 0xa94d2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&208))) *) + 0xba1c014a; (* arm_ADCS X10 X10 X28 *) + 0xba1c016b; (* arm_ADCS X11 X11 X28 *) + 0xa90d2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&208))) *) + 0xa94e2f2a; (* arm_LDP X10 X11 X25 (Immediate_Offset (iword (&224))) *) + 0xba1c014a; (* arm_ADCS X10 X10 X28 *) + 0xba1c016b; (* arm_ADCS X11 X11 X28 *) + 0xa90e2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&224))) *) + 0xa94f2f2a; (* arm_LDP X10 
X11 X25 (Immediate_Offset (iword (&240))) *) + 0xba1c014a; (* arm_ADCS X10 X10 X28 *) + 0xba1c016b; (* arm_ADCS X11 X11 X28 *) + 0xa90f2f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&240))) *) + 0xa8c17bf7; (* arm_LDP X23 X30 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c15bf5; (* arm_LDP X21 X22 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0; (* arm_RET X30 *) + 0xa9401023; (* arm_LDP X3 X4 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc00020; (* arm_LDR Q0 X1 (Immediate_Offset (word 0)) *) + 0xa9402047; (* arm_LDP X7 X8 X2 (Immediate_Offset (iword (&0))) *) + 0x3dc00041; (* arm_LDR Q1 X2 (Immediate_Offset (word 0)) *) + 0xa9411825; (* arm_LDP X5 X6 X1 (Immediate_Offset (iword (&16))) *) + 0x3dc00422; (* arm_LDR Q2 X1 (Immediate_Offset (word 16)) *) + 0xa9412849; (* arm_LDP X9 X10 X2 (Immediate_Offset (iword (&16))) *) + 0x3dc00443; (* arm_LDR Q3 X2 (Immediate_Offset (word 16)) *) + 0x4e801824; (* arm_UZIP1 Q4 Q1 Q0 32 *) + 0x4ea00821; (* arm_REV64_VEC Q1 Q1 32 *) + 0x4e801805; (* arm_UZIP1 Q5 Q0 Q0 32 *) + 0x4ea09c20; (* arm_MUL_VEC Q0 Q1 Q0 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x2ea480a0; (* arm_UMLAL Q0 Q5 Q4 32 *) + 0x4e083c0b; (* arm_UMOV X11 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0x4e821860; (* arm_UZIP1 Q0 Q3 Q2 32 *) + 0x4ea00861; (* arm_REV64_VEC Q1 Q3 32 *) + 0x4e821843; (* arm_UZIP1 Q3 Q2 Q2 32 *) + 0x4ea29c21; (* arm_MUL_VEC Q1 Q1 Q2 32 *) + 0x6ea02821; (* arm_UADDLP Q1 Q1 32 *) + 0x4f605421; (* arm_SHL_VEC Q1 Q1 32 64 *) + 0x2ea08061; (* arm_UMLAL Q1 Q3 Q0 32 *) + 0x4e083c30; (* arm_UMOV X16 Q1 0 8 *) + 0x4e183c31; (* arm_UMOV X17 Q1 1 8 *) + 0x3dc00820; (* arm_LDR Q0 X1 (Immediate_Offset (word 32)) *) + 0x3dc00841; (* arm_LDR Q1 X2 (Immediate_Offset (word 32)) *) + 0x3dc00c22; (* arm_LDR Q2 X1 (Immediate_Offset (word 48)) *) + 0x3dc00c43; (* arm_LDR Q3 X2 (Immediate_Offset (word 48)) *) + 
0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x4e801824; (* arm_UZIP1 Q4 Q1 Q0 32 *) + 0x4ea00821; (* arm_REV64_VEC Q1 Q1 32 *) + 0x4e801805; (* arm_UZIP1 Q5 Q0 Q0 32 *) + 0x4ea09c20; (* arm_MUL_VEC Q0 Q1 Q0 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x2ea480a0; (* arm_UMLAL Q0 Q5 Q4 32 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; 
(* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS 
X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9421023; (* arm_LDP X3 X4 X1 (Immediate_Offset (iword (&32))) *) + 0xa900300b; (* arm_STP X11 X12 X0 (Immediate_Offset (iword (&0))) *) + 0xa9422047; (* arm_LDP X7 X8 X2 (Immediate_Offset (iword (&32))) *) + 0xa901380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&16))) *) + 0xa9431825; (* arm_LDP X5 X6 X1 (Immediate_Offset (iword (&48))) *) + 0xa902400f; (* arm_STP X15 X16 X0 (Immediate_Offset (iword (&32))) *) + 0xa9432849; (* arm_LDP X9 X10 X2 (Immediate_Offset (iword (&48))) *) + 0xa9034c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&48))) *) + 0x4e083c0b; (* arm_UMOV X11 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0x4e821860; (* arm_UZIP1 Q0 Q3 Q2 32 *) + 0x4ea00861; (* arm_REV64_VEC Q1 Q3 32 *) + 0x4e821843; (* arm_UZIP1 Q3 Q2 Q2 32 *) + 
0x4ea29c21; (* arm_MUL_VEC Q1 Q1 Q2 32 *) + 0x6ea02821; (* arm_UADDLP Q1 Q1 32 *) + 0x4f605421; (* arm_SHL_VEC Q1 Q1 32 64 *) + 0x2ea08061; (* arm_UMLAL Q1 Q3 Q0 32 *) + 0x4e083c30; (* arm_UMOV X16 Q1 0 8 *) + 0x4e183c31; (* arm_UMOV X17 Q1 1 8 *) + 0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xa9425416; (* arm_LDP X22 X21 X0 (Immediate_Offset (iword (&32))) *) + 0xab16016b; (* arm_ADDS X11 X11 X22 *) + 0xba15018c; (* arm_ADCS X12 X12 X21 *) + 0xa9435416; (* arm_LDP X22 X21 X0 (Immediate_Offset (iword (&48))) *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* 
arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* 
arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9405436; (* arm_LDP X22 X21 X1 (Immediate_Offset (iword (&0))) *) + 0xeb160063; (* arm_SUBS X3 X3 X22 *) + 0xfa150084; (* arm_SBCS X4 X4 X21 *) + 0xa9415436; (* arm_LDP X22 X21 X1 (Immediate_Offset (iword (&16))) *) + 0xfa1600a5; (* arm_SBCS X5 X5 X22 *) + 0xfa1500c6; (* arm_SBCS X6 X6 X21 *) + 0xda9f23f8; (* arm_CSETM X24 Condition_CC *) + 
0xa904300b; (* arm_STP X11 X12 X0 (Immediate_Offset (iword (&64))) *) + 0xa9405456; (* arm_LDP X22 X21 X2 (Immediate_Offset (iword (&0))) *) + 0xeb0702c7; (* arm_SUBS X7 X22 X7 *) + 0xfa0802a8; (* arm_SBCS X8 X21 X8 *) + 0xa9415456; (* arm_LDP X22 X21 X2 (Immediate_Offset (iword (&16))) *) + 0xfa0902c9; (* arm_SBCS X9 X22 X9 *) + 0xfa0a02aa; (* arm_SBCS X10 X21 X10 *) + 0xda9f23e1; (* arm_CSETM X1 Condition_CC *) + 0xa905380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&80))) *) + 0xca180063; (* arm_EOR X3 X3 X24 *) + 0xeb180063; (* arm_SUBS X3 X3 X24 *) + 0xca180084; (* arm_EOR X4 X4 X24 *) + 0xfa180084; (* arm_SBCS X4 X4 X24 *) + 0xca1800a5; (* arm_EOR X5 X5 X24 *) + 0xfa1800a5; (* arm_SBCS X5 X5 X24 *) + 0xca1800c6; (* arm_EOR X6 X6 X24 *) + 0xda1800c6; (* arm_SBC X6 X6 X24 *) + 0xa906400f; (* arm_STP X15 X16 X0 (Immediate_Offset (iword (&96))) *) + 0xca0100e7; (* arm_EOR X7 X7 X1 *) + 0xeb0100e7; (* arm_SUBS X7 X7 X1 *) + 0xca010108; (* arm_EOR X8 X8 X1 *) + 0xfa010108; (* arm_SBCS X8 X8 X1 *) + 0xca010129; (* arm_EOR X9 X9 X1 *) + 0xfa010129; (* arm_SBCS X9 X9 X1 *) + 0xca01014a; (* arm_EOR X10 X10 X1 *) + 0xda01014a; (* arm_SBC X10 X10 X1 *) + 0xa9074c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&112))) *) + 0xca180021; (* arm_EOR X1 X1 X24 *) + 0x9b077c6b; (* arm_MUL X11 X3 X7 *) + 0x9b087c8f; (* arm_MUL X15 X4 X8 *) + 0x9b097cb0; (* arm_MUL X16 X5 X9 *) + 0x9b0a7cd1; (* arm_MUL X17 X6 X10 *) + 0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 
X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) 
*) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* 
arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9401003; (* arm_LDP X3 X4 X0 (Immediate_Offset (iword (&0))) *) + 0xa9442007; (* arm_LDP X7 X8 X0 (Immediate_Offset (iword (&64))) *) + 0xab070063; (* arm_ADDS X3 X3 X7 *) + 0xba080084; (* arm_ADCS X4 X4 X8 *) + 0xa9411805; (* arm_LDP X5 X6 X0 (Immediate_Offset (iword (&16))) *) + 0xa9452809; (* arm_LDP X9 X10 X0 (Immediate_Offset (iword (&80))) *) + 0xba0900a5; (* arm_ADCS X5 X5 X9 *) + 0xba0a00c6; (* arm_ADCS X6 X6 X10 *) + 0xa9465414; (* arm_LDP X20 X21 X0 (Immediate_Offset (iword (&96))) *) + 0xba1400e7; (* arm_ADCS X7 X7 X20 *) + 0xba150108; (* arm_ADCS X8 X8 X21 *) + 0xa9475c16; (* arm_LDP X22 X23 X0 (Immediate_Offset (iword (&112))) *) + 0xba160129; (* arm_ADCS X9 X9 X22 *) + 0xba17014a; (* arm_ADCS X10 X10 X23 *) + 0xba1f0038; (* arm_ADCS X24 X1 XZR *) + 0x9a1f0022; (* arm_ADC X2 X1 XZR *) + 0xb100043f; (* arm_CMN X1 (rvalue (word 1)) *) + 0xca01016b; (* arm_EOR X11 X11 X1 *) + 0xba030163; (* arm_ADCS X3 X11 X3 *) + 0xca01018c; (* arm_EOR X12 X12 X1 *) + 0xba040184; (* arm_ADCS X4 X12 X4 *) + 0xca0101ad; (* arm_EOR X13 X13 X1 *) + 0xba0501a5; (* arm_ADCS X5 X13 X5 *) + 0xca0101ce; (* arm_EOR X14 X14 X1 *) + 0xba0601c6; (* arm_ADCS X6 X14 X6 *) + 0xca0101ef; (* arm_EOR X15 X15 X1 *) + 0xba0701e7; (* arm_ADCS X7 X15 X7 *) + 0xca010210; (* arm_EOR X16 X16 X1 *) + 0xba080208; (* arm_ADCS X8 X16 X8 *) + 0xca010231; (* arm_EOR X17 X17 X1 *) + 0xba090229; (* arm_ADCS X9 X17 X9 *) + 0xca010273; (* arm_EOR X19 X19 X1 *) + 0xba0a026a; (* arm_ADCS X10 X19 X10 *) + 0xba180294; (* arm_ADCS X20 X20 X24 *) + 0xba0202b5; (* arm_ADCS X21 X21 X2 *) + 0xba0202d6; (* arm_ADCS X22 X22 X2 *) + 0x9a0202f7; (* 
arm_ADC X23 X23 X2 *) + 0xa9021003; (* arm_STP X3 X4 X0 (Immediate_Offset (iword (&32))) *) + 0xa9031805; (* arm_STP X5 X6 X0 (Immediate_Offset (iword (&48))) *) + 0xa9042007; (* arm_STP X7 X8 X0 (Immediate_Offset (iword (&64))) *) + 0xa9052809; (* arm_STP X9 X10 X0 (Immediate_Offset (iword (&80))) *) + 0xa9065414; (* arm_STP X20 X21 X0 (Immediate_Offset (iword (&96))) *) + 0xa9075c16; (* arm_STP X22 X23 X0 (Immediate_Offset (iword (&112))) *) + 0xd65f03c0 (* arm_RET X30 *) +];; + +(* Execution theorem obtained by applying ARM_MK_EXEC_RULE to the machine-code list bignum_kmul_32_64_neon_mc defined above. *) let BIGNUM_KMUL_32_64_NEON_EXEC = ARM_MK_EXEC_RULE bignum_kmul_32_64_neon_mc;; + +(* ------------------------------------------------------------------------- *) +(* First of all the correctness lemma for the embedded bignum_kmul_16_32 *) +(* ------------------------------------------------------------------------- *) + +(* lemma1: the nested conditional that yields either word 0 or the all-ones int64 constant 18446744073709551615 (2^64 - 1), complemented with word_not when y0 <= y1 fails, collapses to word_neg (word (bitval (y0 <= y1 <=> x0 < x1))). Proved by case-splitting on every condition and evaluating the resulting word constants. *) let lemma1 = prove + (`!(x0:num) x1 (y0:num) y1. + (if y0 <= y1 + then if x1 <= x0 then word 0 else word 18446744073709551615 + else word_not + (if x1 <= x0 then word 0 else word 18446744073709551615)):int64 = + word_neg(word(bitval(y0 <= y1 <=> x0 < x1)))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + CONV_TAC WORD_REDUCE_CONV);; + +(* lemma2: over the reals, the product of the two conditionally negated word differences (the absolute differences of val x0, val x1 and of val y1, val y0) equals the sign factor -1 raised to bitval (val y0 <= val y1 <=> val x0 < val x1) times the signed product (val x0 - val x1) * (val y1 - val y0). *) let lemma2 = prove + (`!(x0:int64) (x1:int64) (y0:int64) (y1:int64). + &(val(if val x1 <= val x0 then word_sub x0 x1 + else word_neg (word_sub x0 x1))) * + &(val(if val y0 <= val y1 then word_sub y1 y0 + else word_neg (word_sub y1 y0))):real = + --(&1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) * + (&(val x0) - &(val x1)) * (&(val y1) - &(val y0))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE; WORD_NEG_SUB] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o MATCH_MP (ARITH_RULE + `~(m:num <= n) ==> n <= m /\ ~(m <= n)`))) THEN + ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN + REAL_ARITH_TAC);; + +(* A lemma that is useful for extracting a 32-bit field from a 128-bit word. 
*) +let WORD_128_SUBWORD_SUBWORD_32 = prove(`!y. + word_subword (word_subword (y:(128)word) (0,64):(64)word) (0,32):(32)word = + word_subword (y:(128)word) (0,32):(32)word /\ + word_subword (word_subword (y:(128)word) (64,64):(64)word) (0,32):(32)word = + word_subword (y:(128)word) (64,32):(32)word /\ + word_subword (word_subword (y:(128)word) (0,64):(64)word) (32,32):(32)word = + word_subword (y:(128)word) (32,32):(32)word /\ + word_subword (word_subword (y:(128)word) (64,64):(64)word) (32,32):(32)word = + word_subword (y:(128)word) (96,32):(32)word`, + CONV_TAC WORD_BLAST);; + +(* Extracting a 32-bit half from the 64-bit word_join of two 32-bit words: position (0,32) yields the second argument y and position (32,32) yields the first argument x. *) +let WORD_SUBWORD_JOIN_64 = prove(`!(x:(32)word) (y:(32)word). + word_subword (word_join (x:(32)word) (y:(32)word): (64)word) (0,32) = y /\ + word_subword (word_join (x:(32)word) (y:(32)word): (64)word) (32,32) = x`, + CONV_TAC WORD_BLAST);; + +(* Extracting a 64-bit half from the 128-bit word_join of two 64-bit words: position (0,64) yields the second argument y and position (64,64) yields the first argument x. *) +let WORD_SUBWORD_JOIN_128_64 = prove(`!(x:(64)word) (y:(64)word). + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (0,64) = y /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (64,64) = x`, + CONV_TAC WORD_BLAST);; + +(* Extracting a 32-bit field from the 128-bit word_join of two 64-bit words: positions (0,32) and (32,32) are the corresponding fields of y, while positions (64,32) and (96,32) are fields (0,32) and (32,32) of x. *) +let WORD_SUBWORD_JOIN_128_32 = prove(`!(x:(64)word) (y:(64)word). 
+ word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (0,32):(32)word = + word_subword (y:(64)word) (0,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (32,32):(32)word = + word_subword (y:(64)word) (32,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (64,32):(32)word = + word_subword (x:(64)word) (0,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (96,32):(32)word = + word_subword (x:(64)word) (32,32):(32)word`, + CONV_TAC WORD_BLAST);; + +(* rewrite_assumptions t tac: state t as a subgoal to be proved by tac; in the remaining goal, rewrite every assumption with the resulting theorem. *) let rewrite_assumptions t tac = SUBGOAL_THEN t + (fun thm -> RULE_ASSUM_TAC (REWRITE_RULE [thm])) THENL + [tac; ALL_TAC];; + +(* lemma4: inside ((a + 2^32 * (b + c)) DIV 2^32) MOD 2^32 the summands b and c may each be reduced modulo 2^32 without changing the result. The proof splits a into ahi * 2^32 + alo using DIVISION and then cancels the division by 2^32. *) let lemma4 = prove(`!a b c. + ((a + 2 EXP 32 * (b MOD 2 EXP 32 + c MOD 2 EXP 32)) DIV 2 EXP 32) MOD 2 EXP 32 = + ((a + 2 EXP 32 * (b + c)) DIV 2 EXP 32) MOD 2 EXP 32`, + REPEAT STRIP_TAC THEN + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Ha_" ^ suffix) thm) + (zip (CONJUNCTS ((MP + (SPECL [`a:num`; `2 EXP 32:num`] DIVISION) (ARITH_RULE `~(2 EXP 32 = 0)`)))) + ["eq";"lt"]) THEN + ABBREV_TAC `ahi = a DIV 2 EXP 32` THEN + ABBREV_TAC `alo = a MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + REWRITE_TAC[ARITH_RULE + `(ahi * 2 EXP 32 + alo) + 2 EXP 32 * (b MOD 2 EXP 32 + c MOD 2 EXP 32) = + (ahi + b MOD 2 EXP 32 + c MOD 2 EXP 32) * 2 EXP 32 + alo`] THEN + REWRITE_TAC[ARITH_RULE + `(ahi * 2 EXP 32 + alo) + 2 EXP 32 * (b + c) = + (ahi + b + c) * 2 EXP 32 + alo`] THEN + IMP_REWRITE_TAC[DIV_UNIQ] THEN (* (A * 2^32 + B) / 2^32 => A *) + EXISTS_TAC `(ahi + b MOD 2 EXP 32 + c MOD 2 EXP 32)` THEN SIMP_TAC[] THEN + EXISTS_TAC `(ahi + b + c)` THEN SIMP_TAC[] THEN + CONV_TAC MOD_DOWN_CONV THEN SIMP_TAC[]);; + +(* WORD_MUL_64_DECOMPOSED_32: a 64-bit word_mul x y rebuilt from 32-bit pieces, namely the 64-bit product of the zero-extended low halves plus the sum of the two zero-extended 32-bit cross products shifted left by 32. *) let WORD_MUL_64_DECOMPOSED_32 = prove(`!(x:(64)word) (y:(64)word). 
+ word_add + (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword y (0,32):(32)word):(64)word)) + (word_shl + (word_add + (word_zx (word_mul (word_subword y (32,32):(32)word) (word_subword x (0,32):(32)word))) + (word_zx (word_mul (word_subword y (0,32):(32)word) (word_subword x (32,32):(32)word)))) + 32) = + word_mul x y`, + REPEAT GEN_TAC THEN + (* word to num, step 1: turn the word equation x = y into val x = val y *) + REWRITE_TAC[GSYM VAL_EQ] THEN + (* step 2: unfold every word operation into arithmetic on val *) + REWRITE_TAC [VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD_ZX_GEN; VAL_WORD_SUBWORD; + VAL_WORD; VAL_WORD_SHL] THEN + (* step 3: add the bounds val x < 2^64 and val y < 2^64 as assumptions *) + ASSUME_TAC (ISPECL [`x:(64)word`] VAL_BOUND) THEN + ASSUME_TAC (ISPECL [`y:(64)word`] VAL_BOUND) THEN + RULE_ASSUM_TAC (REWRITE_RULE [DIMINDEX_64]) THEN + (* step 4: eliminate dimindex (:N) and simplify the nested MOD and MIN terms *) + REWRITE_TAC[DIMINDEX_32;DIMINDEX_64;DIMINDEX_128;DIV_1;MOD_MOD_REFL; + MOD_MOD_EXP_MIN;ARITH_RULE `2 EXP 0 = 1`; DIV_1] THEN + CONV_TAC(DEPTH_CONV NUM_MIN_CONV) THEN + CONV_TAC MOD_DOWN_CONV THEN + (* split val x into xhi * 2^32 + xlo via DIVISION, labelling the two facts Hxeq and Hxlt; val y is split the same way into yhi and ylo below *) + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Hx" ^ suffix) thm) + (zip (CONJUNCTS ((MP (SPECL [`(val (x:(64)word)):num`; `2 EXP 32:num`] DIVISION) + (ARITH_RULE `~(2 EXP 32 = 0)`)))) ["eq";"lt"]) THEN + ABBREV_TAC `xhi = (val (x:(64)word)) DIV 2 EXP 32` THEN + ABBREV_TAC `xlo = (val (x:(64)word)) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Hy" ^ suffix) thm) + (zip (CONJUNCTS ((MP (SPECL [`(val (y:(64)word)):num`; `2 EXP 32:num`] DIVISION) + (ARITH_RULE `~(2 EXP 32 = 0)`)))) ["eq";"lt"]) THEN + ABBREV_TAC `yhi = (val (y:(64)word)) DIV 2 EXP 32` THEN + ABBREV_TAC `ylo = (val (y:(64)word)) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + (* lhs: distribute the products and move each 2 EXP 32 factor to the front *) + REWRITE_TAC[LEFT_ADD_DISTRIB; RIGHT_ADD_DISTRIB] THEN + REWRITE_TAC[ + ARITH_RULE `y1hi * x1hi * 2 EXP 32 = 2 EXP 32 * y1hi * x1hi`; + ARITH_RULE `(y1hi * 2 EXP 32) * x1hi = 2 EXP 32 * y1hi * x1hi`] 
+ THEN + REWRITE_TAC[MOD_MULT_ADD] THEN + (* rhs *) + REWRITE_TAC[MULT_ASSOC; ARITH_RULE `2 EXP 32 * 2 EXP 32 = 2 EXP 64`] THEN + REWRITE_TAC[GSYM ADD_ASSOC; GSYM MULT_ASSOC] THEN + REWRITE_TAC[MOD_MULT_ADD] THEN + (* lhs = rhs *) + REWRITE_TAC[ARITH_RULE `2 EXP 64 = 2 EXP 32 * 2 EXP 32`] THEN + REWRITE_TAC[MOD_MULT_MOD] THEN + REWRITE_TAC[ARITH_RULE `2 EXP 32 * p + 2 EXP 32 * q = 2 EXP 32 * (p + q)`; MOD_MULT_ADD] THEN + REWRITE_TAC [lemma4] THEN + REWRITE_TAC [ARITH_RULE + `(xlo * ylo + 2 EXP 32 * (yhi * xlo + ylo * xhi)) DIV 2 EXP 32 = + (2 EXP 32 * xhi * ylo + 2 EXP 32 * xlo * yhi + xlo * ylo) DIV 2 EXP 32`]);; +(* Tactic: rewrite every assumption with the 128-bit subword and word_join lemmas listed below plus WORD_MUL_64_DECOMPOSED_32. *) +let simplify_128bit_words = + RULE_ASSUM_TAC (REWRITE_RULE [ + WORD_128_SUBWORD_SUBWORD_32; WORD_SUBWORD_JOIN_64; + WORD_SUBWORD_JOIN_128_64; WORD_SUBWORD_JOIN_128_32; + WORD_MUL_64_DECOMPOSED_32]);; +(* Tactic: run simplify_128bit_words, restate 64-bit word_mul in the assumptions as word (0 + val a * val b), then call ACCUMULATE_ARITH_TAC on state_name followed by CLARIFY_TAC. *) +let simplify_128bit_words_and_accumulate state_name = + simplify_128bit_words THEN + (* Rewrite word_mul x y into the pattern that ACCUMULATE_ARITH_TAC can recognize. *) + RULE_ASSUM_TAC (REWRITE_RULE [WORD_RULE + `word_mul (a:(64)word) (b:(64)word) = + word (0 + val (a:(64)word) * val (b:(64)word))`]) THEN + ACCUMULATE_ARITH_TAC state_name THEN CLARIFY_TAC;; +(* Lemma: adding word n and then word m to x equals adding the single constant word (n+m); proved by WORD_RULE. *) +let WORD_ADD_ASSOC_CONSTS = prove( + `!(x:(N)word) n m. 
+ (word_add (word_add x (word n)) (word m)) = (word_add x (word (n+m)))`, + CONV_TAC WORD_RULE);; +(* Tactic used to close the product equation of each ADK block: applies EQUAL_FROM_CONGRUENT_REAL with witnesses 512 and &0, discharges the two bound conditions with BOUNDER_TAC, then derives the remaining goal from the accumulator equations after case-splitting on the conditionals. *) +let ADK_48_TAC = + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`512`; `&0:real`] THEN + REPLICATE_TAC 2 (CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC]) THEN + CONJ_TAC THENL [REAL_INTEGER_TAC; ALL_TAC] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[lemma1; lemma2] THEN REWRITE_TAC[WORD_XOR_MASK] THEN + REPEAT(COND_CASES_TAC THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT]) THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES; DIMINDEX_64] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC;; +(* Specification of the internal 8x8-digit multiply routine inside bignum_kmul_32_64_neon_mc: entered at pc + 0xb18 with the return address in X30; on return z holds the 16-digit product a * b. *) +let LOCAL_MUL_8_16_NEON_CORRECT = prove + (`!z x y a b pc returnaddress. + ALL (nonoverlapping (z,8 * 16)) + [(word pc,4816); (x,8 * 8); (y,8 * 8)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word(pc + 0)) bignum_kmul_32_64_neon_mc /\ + read PC s = word(pc + 0xb18) /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,8) s = a /\ + bignum_from_memory (y,8) s = b) + (\s. 
read PC s = returnaddress /\ + bignum_from_memory (z,16) s = a * b) + (MAYCHANGE [PC; X1; X2; X3; X4; X5; X6; X7; X8; + X9; X10; X11; X12; X13; X14; X15; X16; + X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 16)] ,, + MAYCHANGE SOME_FLAGS)`, + REWRITE_TAC[ADD_CLAUSES] THEN + MAP_EVERY X_GEN_TAC + [`z:int64`; `x:int64`; `y:int64`; `a:num`; `b:num`; `pc:num`; `returnaddress:int64`] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN + REWRITE_TAC[ALL; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_DIGITIZE_TAC "x_" `bignum_from_memory (x,8) s0` THEN + BIGNUM_DIGITIZE_TAC "y_" `bignum_from_memory (y,8) s0` THEN + (* Split 128-bit reads to word_join of 64-bit low and highs *) + ABBREV_TAC `x_0_1:(128)word = read (memory :> bytes128 x) s0` THEN + rewrite_assumptions `x_0_1 = word_join (x_1:(64)word) (x_0:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["x_0_1"; "x_1"; "x_0"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT]) THEN + ABBREV_TAC `x_2_3:(128)word = read (memory :> bytes128 (word_add x (word 16))) s0` THEN + rewrite_assumptions `x_2_3 = word_join (x_3:(64)word) (x_2:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["x_2_3"; "x_3"; "x_2"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + + ABBREV_TAC `y_0_1:(128)word = read (memory :> bytes128 y) s0` THEN + rewrite_assumptions `y_0_1 = word_join (y_1:(64)word) (y_0:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["y_0_1"; "y_1"; "y_0"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT]) THEN + ABBREV_TAC `y_2_3:(128)word = read (memory :> bytes128 (word_add y (word 16))) s0` THEN + rewrite_assumptions `y_2_3 = word_join (y_3:(64)word) (y_2:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["y_2_3"; "y_3"; "y_2"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + + (*** First ADK block 
multiplying the lower halves ***) + + (* Run the vectorized parts first *) + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (1--16) THEN + simplify_128bit_words_and_accumulate "s16" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (17--17) THEN + simplify_128bit_words_and_accumulate "s17" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (18--25) THEN + simplify_128bit_words_and_accumulate "s25" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (26--26) THEN + simplify_128bit_words_and_accumulate "s26" THEN + + (* Second ADK block multiplying the upper halves with q1 added: + vector loads hoisted *) + + ABBREV_TAC `x_4_5:(128)word = read (memory :> bytes128 (word_add x (word 32))) s26` THEN + rewrite_assumptions `x_4_5 = word_join (x_5:(64)word) (x_4:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["x_4_5"; "x_5"; "x_4"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + ABBREV_TAC `x_6_7:(128)word = read (memory :> bytes128 (word_add x (word 48))) s26` THEN + rewrite_assumptions `x_6_7 = word_join (x_7:(64)word) (x_6:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["x_6_7"; "x_7"; "x_6"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + ABBREV_TAC `y_4_5:(128)word = read (memory :> bytes128 (word_add y (word 32))) s26` THEN + rewrite_assumptions `y_4_5 = word_join (y_5:(64)word) (y_4:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["y_4_5"; "y_5"; "y_4"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + ABBREV_TAC `y_6_7:(128)word = read (memory :> bytes128 (word_add y (word 48))) s26` THEN + rewrite_assumptions `y_6_7 = word_join (y_7:(64)word) (y_6:(64)word):(128)word` + (MAP_EVERY EXPAND_TAC ["y_6_7"; "y_7"; "y_6"] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC) THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (27--30) THEN + + (* First ADK block: Run the remaining scalar 
parts (1) *) + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [32;34;36] (31--37) THEN + + (* Second ADK block: multiply using vector instructions, but not move the + results to scalar registers *) + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (38--44) THEN + simplify_128bit_words THEN + + (* First ADK block: Run the remaining scalar parts *) + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [45;46;47;48;49;50;51;52;53;54;55;56;62;67;69;70;76;81;83;84;85;86;87;88;94; + 99;101;102;103;109;114;116;117;118;119;120;126;131;133;134;135;136;142;147; + 149;150;151;152] (45--152) THEN + + MAP_EVERY ABBREV_TAC + [`q0 = bignum_of_wordlist[mullo_s16;sum_s81;sum_s114;sum_s147]`; + `q1 = bignum_of_wordlist[sum_s149;sum_s150;sum_s151;sum_s152]`] THEN + SUBGOAL_THEN + `2 EXP 256 * q1 + q0 = + bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [y_0;y_1;y_2;y_3]` + ASSUME_TAC THENL + [MAP_EVERY EXPAND_TAC ["q0"; "q1"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Second ADK block multiplying the upper halves with q1 added ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (153--161) THEN + simplify_128bit_words_and_accumulate "s161" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (162--162) THEN + simplify_128bit_words_and_accumulate "s162" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (163--170) THEN + simplify_128bit_words_and_accumulate "s170" THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [] (171--171) THEN + simplify_128bit_words_and_accumulate "s171" THEN + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [173;175;177;179;180;181;182;183;184;185;186;187;188;189;190;192;193;195; + 196;197;198;199;200;206;211;213;214;220;225;227;228;229;230;231;232;238; + 243;245;246;247;253;258;260;261;262;263;264;270;275;277;278;279;280;286; + 291;293;294;295;296] + (172--296) THEN + + MAP_EVERY ABBREV_TAC + [`q2 = 
bignum_of_wordlist[sum_s192; sum_s225; sum_s258; sum_s291]`; + `q3 = bignum_of_wordlist[sum_s293; sum_s294; sum_s295; sum_s296]`] THEN + SUBGOAL_THEN + `2 EXP 256 * q3 + q2 = + bignum_of_wordlist [x_4;x_5;x_6;x_7] * + bignum_of_wordlist [y_4;y_5;y_6;y_7] + q1` + ASSUME_TAC THENL + [MAP_EVERY EXPAND_TAC ["q1"; "q2"; "q3"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** The sign-magnitude difference computation ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [298;299;301;302;306;307;309;310;314;316;318;320;323;325;327;329] + (297--330) THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_UNMASK_64]) THEN + + MAP_EVERY ABBREV_TAC + [`sgn <=> ~(carry_s310 <=> carry_s302)`; + `xd = bignum_of_wordlist[sum_s314;sum_s316;sum_s318;sum_s320]`; + `yd = bignum_of_wordlist[sum_s323;sum_s325;sum_s327;sum_s329]`] THEN + + SUBGOAL_THEN + `(&(bignum_of_wordlist[x_4;x_5;x_6;x_7]) - + &(bignum_of_wordlist[x_0;x_1;x_2;x_3])) * + (&(bignum_of_wordlist[y_0;y_1;y_2;y_3]) - + &(bignum_of_wordlist[y_4;y_5;y_6;y_7])):real = + --(&1) pow bitval sgn * &xd * &yd` + ASSUME_TAC THENL + [TRANS_TAC EQ_TRANS + `(--(&1) pow bitval carry_s302 * &xd) * + (--(&1) pow bitval carry_s310 * &yd):real` THEN + CONJ_TAC THENL + [ALL_TAC; + EXPAND_TAC "sgn" THEN REWRITE_TAC[BITVAL_NOT; BITVAL_IFF] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN REWRITE_TAC[bitval] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[]) THEN + CONV_TAC NUM_REDUCE_CONV THEN REAL_ARITH_TAC] THEN + SUBGOAL_THEN + `(carry_s302 <=> + bignum_of_wordlist[x_4;x_5;x_6;x_7] < + bignum_of_wordlist[x_0;x_1;x_2;x_3]) /\ + (carry_s310 <=> + bignum_of_wordlist[y_0;y_1;y_2;y_3] < + bignum_of_wordlist[y_4;y_5;y_6;y_7])` + (CONJUNCTS_THEN SUBST_ALL_TAC) + THENL + [CONJ_TAC THEN MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `256` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + 
ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + BINOP_TAC THEN REWRITE_TAC[bitval] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[real_pow; REAL_MUL_LID] THEN + REWRITE_TAC[REAL_ARITH `x - y:real = --(&1) pow 1 * z <=> y - x = z`] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`256`; `&0:real`] THEN + (CONJ_TAC THENL + [MATCH_MP_TAC(REAL_ARITH + `y:real <= x /\ (&0 <= x /\ x < e) /\ (&0 <= y /\ y < e) + ==> &0 <= x - y /\ x - y < e`) THEN + ASM_SIMP_TAC[REAL_OF_NUM_CLAUSES; LT_IMP_LE; + ARITH_RULE `~(a:num < b) ==> b <= a`] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONJ_TAC THEN BOUNDER_TAC[]; + ALL_TAC] THEN + MAP_EVERY EXPAND_TAC ["xd"; "yd"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; REWRITE_TAC[INTEGER_CLOSED]]) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + ASM_REWRITE_TAC[WORD_XOR_MASK] THEN + REWRITE_TAC[REAL_VAL_WORD_NOT; BITVAL_CLAUSES; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN + + (*** Third ADK block multiplying the absolute differences ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [332;333;334;335;337;339;341;343;344;345;346;347;348;349;350;351;352;353;354;360;365;367;368;374;379;381;382;383;384;385;386;392;397;399;400;401;407;412;414;415;416;417;418;424;429;431;432;433;434;440;445;447;448;449;450] + (331--450) THEN + + SUBGOAL_THEN + `&xd * &yd:real = + &(bignum_of_wordlist + [mullo_s332; sum_s379; sum_s412; sum_s445; + sum_s447; sum_s448; sum_s449; sum_s450])` + SUBST_ALL_TAC THENL + [MAP_EVERY EXPAND_TAC ["xd"; "yd"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + 
ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Clean up the overall sign ***) + + FIRST_X_ASSUM(MP_TAC o GEN_REWRITE_RULE RAND_CONV [WORD_XOR_MASKS]) THEN + ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + + (*** Final accumulation simulation and 16-digit focusing ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [453;454;457;458;460;461;463;464;465;466;469;471;472;473;475;477;479;481;483;484;485;486;487] + (451--494) THEN + + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + DISCARD_STATE_TAC "s493" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`1024`; `&0:real`] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC] THEN CONJ_TAC THENL + [MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + + (*** The core rearrangement we are using ***) + + SUBGOAL_THEN + `&a * &b:real = + (&1 + &2 pow 256) * (&q0 + &2 pow 256 * &q2 + &2 pow 512 * &q3) + + &2 pow 256 * + (&(bignum_of_wordlist [x_4; x_5; x_6; x_7]) - + &(bignum_of_wordlist [x_0; x_1; x_2; x_3])) * + (&(bignum_of_wordlist [y_0; y_1; y_2; y_3]) - + &(bignum_of_wordlist [y_4; y_5; y_6; y_7]))` + SUBST1_TAC THENL + [MAP_EVERY UNDISCH_TAC + [`2 EXP 256 * q1 + q0 = + bignum_of_wordlist[x_0; x_1; x_2; x_3] * + bignum_of_wordlist[y_0; y_1; y_2; y_3]`; + `2 EXP 256 * q3 + q2 = + bignum_of_wordlist[x_4; x_5; x_6; x_7] * + bignum_of_wordlist[y_4; y_5; y_6; y_7] + + q1`] THEN + MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + CONV_TAC REAL_RING; + ASM_REWRITE_TAC[]] THEN + + MAP_EVERY EXPAND_TAC ["q0"; "q2"; "q3"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + 
+ REWRITE_TAC[WORD_XOR_MASK] THEN COND_CASES_TAC THEN + ASM_REWRITE_TAC[REAL_VAL_WORD_NOT; BITVAL_CLAUSES; DIMINDEX_64] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES] THEN DISCH_TAC THEN + + (*** A bit of manual logic for the carry connections in negative case ***) + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THENL + [SUBGOAL_THEN + `&(bitval carry_s465):real = &(bitval carry_s466)` + SUBST1_TAC THENL [ALL_TAC; REAL_INTEGER_TAC] THEN + POP_ASSUM MP_TAC THEN BOOL_CASES_TAC `carry_s465:bool` THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + REWRITE_TAC[REAL_RAT_REDUCE_CONV `(&2 pow 64 - &1) * &1 + &0`] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC; + ALL_TAC] THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o + filter (is_ratconst o rand o concl) o DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; +(* Subroutine-simulation tactic built from LOCAL_MUL_8_16_NEON_CORRECT; the theorem is instantiated from X0, X1, X2, the 8-digit bignums at X1 and X2, pc and X30. *) +let LOCAL_MUL_8_16_NEON_TAC = + ARM_SUBROUTINE_SIM_TAC + (bignum_kmul_32_64_neon_mc,BIGNUM_KMUL_32_64_NEON_EXEC, + 0x0,bignum_kmul_32_64_neon_mc,LOCAL_MUL_8_16_NEON_CORRECT) + [`read X0 s`; `read X1 s`; `read X2 s`; + `bignum_from_memory (read X1 s,8) s`; + `bignum_from_memory (read X2 s,8) s`; + `pc:num`; `read X30 s`];; +(* Specification of the 16x16-digit multiply code running from pc + 0x780 to pc + 0xb08: z receives the 32-digit product a * b, and t is additional writable workspace. *) +let LOCAL_KMUL_16_32_NEON_CORRECT = prove + (`!z x y a b t pc. + nonoverlapping (z,8 * 32) (t,8 * 32) /\ + ALLPAIRS nonoverlapping + [(z,8 * 32); (t,8 * 32)] + [(word pc,4816); (x,8 * 16); (y,8 * 16)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_kmul_32_64_neon_mc /\ + read PC s = word(pc + 0x780) /\ + C_ARGUMENTS [z; x; y; t] s /\ + bignum_from_memory (x,16) s = a /\ + bignum_from_memory (y,16) s = b) + (\s. 
read PC s = word (pc + 0xb08) /\ + bignum_from_memory (z,32) s = a * b) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; + X11; X12; X13; X14; X15; X16; X17; X19; X20; X21; + X22; X23; X24; X25; X26; X27; X28; X29; X30] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 32); + memory :> bytes(t,8 * 32)] ,, + MAYCHANGE SOME_FLAGS)`, + MAP_EVERY X_GEN_TAC + [`z:int64`; `x:int64`; `y:int64`; `a:num`; `b:num`; `t:int64`;`pc:num`] THEN + REWRITE_TAC[ALLPAIRS; ALL; PAIRWISE] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_LDIGITIZE_TAC "x_" `bignum_from_memory (x,16) s0` THEN + BIGNUM_LDIGITIZE_TAC "y_" `bignum_from_memory (y,16) s0` THEN + + (*** First nested 8x8 multiply block ***) + + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC (1--5) THEN + LOCAL_MUL_8_16_NEON_TAC 6 THEN + BIGNUM_LDIGITIZE_TAC "l_" `read (memory :> bytes (z,8 * 16)) s6` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x:num = y * z`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(BINOP_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** Sign-difference computation for x ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [9;10;13;14;17;18;21;22] (7--23) THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_UNMASK_64]) THEN + SUBGOAL_THEN + `bignum_of_wordlist [x_0;x_1;x_2;x_3;x_4;x_5;x_6;x_7] < + bignum_of_wordlist [x_8;x_9;x_10;x_11;x_12;x_13;x_14;x_15] <=> + carry_s22` + ASSUME_TAC THENL + [CONV_TAC SYM_CONV THEN + MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `512` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + ARM_ACCSTEPS_TAC 
BIGNUM_KMUL_32_64_NEON_EXEC + [26;28;31;33;36;38;41;43] (24--44) THEN + SUBGOAL_THEN + `&(bignum_from_memory(t,8) s44):real = + abs(&(bignum_of_wordlist [x_0;x_1;x_2;x_3;x_4;x_5;x_6;x_7]) - + &(bignum_of_wordlist [x_8;x_9;x_10;x_11;x_12;x_13;x_14;x_15]))` + MP_TAC THENL + [MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 8`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN CONJ_TAC THEN + MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &x < &y then &y - &x else &x - &y`] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_UNMASK_64; WORD_XOR_MASK] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV(RAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM)] THEN + + (*** Second nested 8x8 multiply ***) + + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC (45--48) THEN + LOCAL_MUL_8_16_NEON_TAC 49 THEN + BIGNUM_LDIGITIZE_TAC "h_" + `read (memory :> bytes (word_add z (word 128),8 * 16)) s49` THEN + FIRST_X_ASSUM + 
(MP_TAC o check (can (term_match [] `x:num = y * z`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(BINOP_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** Sign-difference computation for y ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [52;53;56;57;60;61;64;65] (50--66) THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_UNMASK_64]) THEN + SUBGOAL_THEN + `bignum_of_wordlist [y_8;y_9;y_10;y_11;y_12;y_13;y_14;y_15] < + bignum_of_wordlist [y_0;y_1;y_2;y_3;y_4;y_5;y_6;y_7] <=> + carry_s65` + ASSUME_TAC THENL + [CONV_TAC SYM_CONV THEN + MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `512` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [69;71;74;76;79;81;84;86] (67--88) THEN + SUBGOAL_THEN + `&(bignum_from_memory(word_add t (word 64),8) s88):real = + abs(&(bignum_of_wordlist [y_0;y_1;y_2;y_3;y_4;y_5;y_6;y_7]) - + &(bignum_of_wordlist [y_8;y_9;y_10;y_11;y_12;y_13;y_14;y_15]))` + MP_TAC THENL + [MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 8`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN CONJ_TAC THEN + MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &y < &x then &x - &y else &y - &x`] THEN + 
REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_UNMASK_64; WORD_XOR_MASK] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV(RAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM)] THEN + + (*** Collected sign ***) + + RULE_ASSUM_TAC(REWRITE_RULE[WORD_XOR_MASKS]) THEN + ABBREV_TAC `sgn <=> ~(carry_s22 <=> carry_s65)` THEN + + (*** Computation of H' = H + L_top ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [91;92;96;97;101;102;106;107;110;111;114;115;118;119;122;123] + (89--124) THEN + SUBGOAL_THEN + `bignum_from_memory(word_add z (word 128),16) s124 = + bignum_of_wordlist + [h_0;h_1;h_2;h_3;h_4;h_5;h_6;h_7;h_8;h_9;h_10;h_11;h_12;h_13;h_14;h_15] + + bignum_of_wordlist[l_8;l_9;l_10;l_11;l_12;l_13;l_14;l_15]` + MP_TAC THENL + [REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 16`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN FIRST_X_ASSUM(fun th -> + GEN_REWRITE_TAC (LAND_CONV o LAND_CONV) [SYM th]) THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + ASM_REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN 
+ ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV)) THEN + ASM_REWRITE_TAC[] THEN DISCH_TAC] THEN + + (*** Third and final nested multiply ***) + + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC (125--128) THEN + LOCAL_MUL_8_16_NEON_TAC 129 THEN + + BIGNUM_LDIGITIZE_TAC "m_" + `read (memory :> bytes (word_add t (word 128),8 * 16)) s129` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x:num = y * z`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(BINOP_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** All remaining accumulation of sub-results ***) + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [132; 133; 136; 137; 140; 141; 144; 145; 148; 149; 152; 153; 156; + 157; 160; 161; 166; 168; 172; 174; 178; 180; 184; 186; 190; 192; + 196; 198; 202; 204; 208; 210; 212; 213; 215; 216; 219; 220; 223; + 224; 227; 228] + (130--229) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 32`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + SUBGOAL_THEN + `(&a:real) * &b = + (&(bignum_of_wordlist[l_0; l_1; l_2; l_3; l_4; l_5; l_6; l_7]) + + &2 pow 512 * + &(bignum_of_wordlist + [sum_s91; sum_s92; sum_s96; sum_s97; sum_s101; sum_s102; sum_s106; + sum_s107; sum_s110; sum_s111; sum_s114; sum_s115; sum_s118; sum_s119; + sum_s122; sum_s123])) * + (&2 pow 512 + 
&1) + + &2 pow 512 * + --(&1) pow bitval sgn * + &(bignum_of_wordlist + [m_0; m_1; m_2; m_3; m_4; m_5; m_6; m_7; m_8; m_9; m_10; m_11; m_12; + m_13; m_14; m_15])` + SUBST1_TAC THENL + [ASM_REWRITE_TAC[] THEN REWRITE_TAC[REAL_OF_NUM_CLAUSES; ARITH_RULE + `l + e * (h + m):num = (l + e * m) + e * h`] THEN + REWRITE_TAC[GSYM(BIGNUM_OF_WORDLIST_SPLIT_RULE(8,8))] THEN + REPEAT(FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (ARITH_RULE + `w * z:num = y ==> y = w * z`))) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + REPEAT(FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (MESON[] + `abs x:real = y ==> y = abs x`))) THEN + ONCE_REWRITE_TAC[MESON[REAL_ABS_NEG] + `abs x * abs y:real = abs x * abs(--y)`] THEN + REWRITE_TAC[REAL_NEG_SUB; REAL_ARITH + `abs(x - x'):real = if x < x' then x' - x else x - x'`] THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT] THEN + MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[BIGNUM_OF_WORDLIST_SPLIT_RULE(8,8)] THEN + EXPAND_TAC "sgn" THEN POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REAL_ARITH_TAC; + ALL_TAC] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_LEXPAND_CONV) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_XOR_MASK] THEN POP_ASSUM_LIST(K ALL_TAC) THEN + BOOL_CASES_TAC `sgn:bool` THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + REWRITE_TAC[COND_SWAP; GSYM WORD_BITVAL; VAL_WORD_BITVAL] THEN STRIP_TAC THEN + ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o + filter (free_in `carry_s212:bool` o concl)) + THENL + [ASM_CASES_TAC `carry_s212:bool` THEN 
+ ASM_REWRITE_TAC[BITVAL_CLAUSES]; + ALL_TAC] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; +(* Subroutine form of LOCAL_KMUL_16_32_NEON_CORRECT: entry at pc + 0x774, return to the address in X30, with the 48 bytes below the stack pointer added to the modified memory; derived by ARM_ADD_RETURN_STACK_TAC with register list X19-X23, X30 and size 48. *) +let LOCAL_KMUL_16_32_NEON_SUBR_CORRECT = prove + (`!z x y a b t pc stackpointer returnaddress. + aligned 16 stackpointer /\ + PAIRWISE nonoverlapping + [(z,8 * 32); (t,8 * 32); (word_sub stackpointer (word 48),48)] /\ + ALLPAIRS nonoverlapping + [(z,8 * 32); (t,8 * 32); (word_sub stackpointer (word 48),48)] + [(word pc,4816); (x,8 * 16); (y,8 * 16)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word(pc + 0)) + bignum_kmul_32_64_neon_mc /\ + read PC s = word(pc + 0x774) /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y; t] s /\ + bignum_from_memory (x,16) s = a /\ + bignum_from_memory (y,16) s = b) + (\s. read PC s = returnaddress /\ + bignum_from_memory (z,32) s = a * b) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; + X11; X12; X13; X14; X15; X16; X17; + X24; X25; X26; X27; X28; X29] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 32); + memory :> bytes(t,8 * 32); + memory :> bytes(word_sub stackpointer (word 48),48)] ,, + MAYCHANGE SOME_FLAGS)`, + REWRITE_TAC[ADD_CLAUSES] THEN + ARM_ADD_RETURN_STACK_TAC + BIGNUM_KMUL_32_64_NEON_EXEC LOCAL_KMUL_16_32_NEON_CORRECT + `[X19;X20;X21;X22;X23;X30]` 48);; +(* Subroutine-simulation tactic built from LOCAL_KMUL_16_32_NEON_SUBR_CORRECT; the theorem is instantiated from X0, X1, X2, the 16-digit operands at X1 and X2, X3, pc, SP and X30. *) +let LOCAL_KMUL_16_32_NEON_TAC = + ARM_SUBROUTINE_SIM_TAC + (bignum_kmul_32_64_neon_mc,BIGNUM_KMUL_32_64_NEON_EXEC, + 0x0,bignum_kmul_32_64_neon_mc,LOCAL_KMUL_16_32_NEON_SUBR_CORRECT) + [`read X0 s`; `read X1 s`; `read X2 s`; + `read (memory :> bytes (read X1 s,8 * 16)) s`; + `read (memory :> bytes (read X2 s,8 * 16)) s`; + `read X3 s`; `pc:num`; `read SP s`; `read X30 s`];; + +(* ------------------------------------------------------------------------- *) +(* Now the main proof. 
*) +(* ------------------------------------------------------------------------- *) +(* Top-level subroutine correctness of the 32x32->64 multiply: entered at word pc with a 16-aligned stack pointer, arguments z, x, y, t and the 32-digit inputs a and b stored at x and y, the code returns to returnaddress with the 64-digit number at z equal to a * b, changing only PC, X0-X17, Q0-Q5, the flags, z (8 * 64 bytes), t (8 * 96 bytes) and the 144 bytes below the stack pointer. The proof first isolates the core code between pc + 0x18 and pc + 0x758, then follows it: two local 16x16 multiplies, the two absolute differences with their signs, a third local multiply, and the final recombination. *) +let BIGNUM_KMUL_32_64_NEON_SUBROUTINE_CORRECT = prove( + `!z x y a b t pc stackpointer returnaddress. + aligned 16 stackpointer /\ + PAIRWISE nonoverlapping + [(z,8 * 64); (t,8 * 96); (word_sub stackpointer (word 144),144)] /\ + ALLPAIRS nonoverlapping + [(z,8 * 64); (t,8 * 96); (word_sub stackpointer (word 144),144)] + [(word pc,4816); (x,8 * 32); (y,8 * 32)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_kmul_32_64_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y; t] s /\ + bignum_from_memory (x,32) s = a /\ + bignum_from_memory (y,32) s = b) + (\s. read PC s = returnaddress /\ + bignum_from_memory (z,64) s = a * b) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; + X11; X12; X13; X14; X15; X16; X17] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 64); + memory :> bytes(t,8 * 96); + memory :> bytes(word_sub stackpointer (word 144),144)] ,, + MAYCHANGE SOME_FLAGS)`, + MAP_EVERY X_GEN_TAC + [`z:int64`; `x:int64`; `y:int64`; + `a:num`; `b:num`; `t:int64`; `pc:num`] THEN + WORD_FORALL_OFFSET_TAC 144 THEN + MAP_EVERY X_GEN_TAC [`stackpointer:int64`; `returnaddress:int64`] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN + REWRITE_TAC[ALL; PAIRWISE; ALLPAIRS; NONOVERLAPPING_CLAUSES] THEN + STRIP_TAC THEN + + (*** Start and end boilerplate for save and restore of registers ***) +(* The subgoal below specifies the core code from pc + 0x18 to pc + 0x758. In the THENL that follows, the second branch derives the full statement from it by stepping instructions 1-6 before and 8-14 after, with the core taken as the single big step s7; the first branch carries on into the rest of the proof. *) + SUBGOAL_THEN + `ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_kmul_32_64_neon_mc /\ + read PC s = word(pc + 0x18) /\ + read SP s = word_add stackpointer (word 48) /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y; t] s /\ + bignum_from_memory (x,32) s = a /\ + bignum_from_memory (y,32) s = b) + (\s.
read PC s = word(pc + 0x758) /\ + bignum_from_memory (z,64) s = a * b) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; + X13; X14; X15; X16; X17; X19; X20; X21; X22; X23; + X24; X25; X26; X27; X28; X29; X30] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 64); memory :> bytes(t,8 * 96); + memory :> bytes(stackpointer,48)] ,, + MAYCHANGE SOME_FLAGS)` + MP_TAC THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THENL + [ENSURES_EXISTING_PRESERVED_TAC `SP`; + DISCH_THEN(fun th -> + ENSURES_PRESERVED_TAC "x19_init" `X19` THEN + ENSURES_PRESERVED_TAC "x20_init" `X20` THEN + ENSURES_PRESERVED_TAC "x21_init" `X21` THEN + ENSURES_PRESERVED_TAC "x22_init" `X22` THEN + ENSURES_PRESERVED_TAC "x23_init" `X23` THEN + ENSURES_PRESERVED_TAC "x24_init" `X24` THEN + ENSURES_PRESERVED_TAC "x25_init" `X25` THEN + ENSURES_PRESERVED_TAC "x26_init" `X26` THEN + ENSURES_PRESERVED_TAC "x27_init" `X27` THEN + ENSURES_PRESERVED_TAC "x28_init" `X28` THEN + ENSURES_PRESERVED_TAC "x29_init" `X29` THEN + ENSURES_EXISTING_PRESERVED_TAC `X30` THEN + ENSURES_EXISTING_PRESERVED_TAC `SP` THEN + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC (1--6) THEN + MP_TAC th) THEN + ARM_BIGSTEP_TAC BIGNUM_KMUL_32_64_NEON_EXEC "s7" THEN + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC (8--14) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[]] THEN + + (*** Initialization and splitting of the inputs ***) + + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + BIGNUM_TERMRANGE_TAC `32` `a:num` THEN + BIGNUM_TERMRANGE_TAC `32` `b:num` THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + MP_TAC(CONJ + (ISPECL [`x:int64`; `16`; `16`] BIGNUM_FROM_MEMORY_SPLIT) + (ISPECL [`y:int64`; `16`; `16`] BIGNUM_FROM_MEMORY_SPLIT)) THEN + CONV_TAC(LAND_CONV(ONCE_DEPTH_CONV(NUM_ADD_CONV ORELSEC NUM_MULT_CONV))) THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN +
ENSURES_INIT_TAC "s0" THEN + MAP_EVERY ABBREV_TAC + [`ahi = read (memory :> bytes (word_add x (word 128),8 * 16)) s0`; + `alo = read (memory :> bytes (x,8 * 16)) s0`; + `bhi = read (memory :> bytes (word_add y (word 128),8 * 16)) s0`; + `blo = read (memory :> bytes (y,8 * 16)) s0`] THEN + + (*** First nested multiply: low part ***) + + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC (1--5) THEN + LOCAL_KMUL_16_32_NEON_TAC 6 THEN + + (*** Second nested multiply: high part ***) + + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC (7--11) THEN + LOCAL_KMUL_16_32_NEON_TAC 12 THEN + + (*** Sign-difference computation for x, then discard x stuff ***) + + BIGNUM_LDIGITIZE_TAC "xl_" `read (memory :> bytes (x,8 * 16)) s12` THEN + BIGNUM_LDIGITIZE_TAC "xh_" + `read (memory :> bytes (word_add x (word 128),8 * 16)) s12` THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [15; 16; 19; 20; 23; 24; 27; 28; 31; 32; 35; 36; 39; 40; 43; 44] + (13--46) THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; WORD_SUB_LZERO]) THEN + SUBGOAL_THEN + `2 EXP 64 <= val(word_neg (word (bitval carry_s44)):int64) + + val(word_neg (word (bitval carry_s44)):int64) <=> + carry_s44` + SUBST_ALL_TAC THENL + [POP_ASSUM_LIST(K ALL_TAC) THEN BOOL_CASES_TAC `carry_s44:bool` THEN + REWRITE_TAC[BITVAL_CLAUSES] THEN CONV_TAC WORD_REDUCE_CONV THEN + CONV_TAC NUM_REDUCE_CONV; + ALL_TAC] THEN + SUBGOAL_THEN `carry_s44 <=> ahi < alo` (ASSUME_TAC o SYM) THENL + [MAP_EVERY EXPAND_TAC ["ahi"; "alo"] THEN + MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `1024` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [48; 50; 53; 55; 58; 60; 63; 65; 68; 70; 73; 75; 78; 80; 83; 85] + (47--86) THEN + SUBGOAL_THEN + `&(read (memory :> bytes (t,8 * 16)) s86):real = abs(&alo - &ahi)` + ASSUME_TAC THENL + [REWRITE_TAC[GSYM
BIGNUM_FROM_MEMORY_BYTES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 16`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN + MAP_EVERY EXPAND_TAC ["ahi"; "alo"] THEN + CONJ_TAC THEN MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &y < &x then &x - &y else &y - &x`] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + MAP_EVERY EXPAND_TAC ["ahi"; "alo"] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_XOR_MASK] THEN + ASM_CASES_TAC `carry_s44:bool` THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + REWRITE_TAC[REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `x:int64` o concl))) THEN + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN + + (*** Sign-difference computation for y, then discard y stuff ***) + + BIGNUM_LDIGITIZE_TAC "yl_" `read (memory :> bytes (y,8 * 16)) s86` THEN + BIGNUM_LDIGITIZE_TAC "yh_" + `read (memory :> bytes (word_add y (word 128),8 * 16)) s86` THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [89; 90; 93; 94; 97; 98; 101; 102; 105; 106; 109; 110; 113; 114; 117; 118] + (87--120) THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; WORD_SUB_LZERO]) THEN + SUBGOAL_THEN + `2 EXP 64 <= val(word_neg (word (bitval carry_s118)):int64) + + val(word_neg (word (bitval carry_s118)):int64) <=> + carry_s118` + SUBST_ALL_TAC
THENL + [POP_ASSUM_LIST(K ALL_TAC) THEN BOOL_CASES_TAC `carry_s118:bool` THEN + REWRITE_TAC[BITVAL_CLAUSES] THEN CONV_TAC WORD_REDUCE_CONV THEN + CONV_TAC NUM_REDUCE_CONV; + ALL_TAC] THEN + SUBGOAL_THEN `carry_s118 <=> blo < bhi` (ASSUME_TAC o SYM) THENL + [MAP_EVERY EXPAND_TAC ["bhi"; "blo"] THEN + MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `1024` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [122;124;127;129;132;134;137;139;142;144;147;149;152;154;157;159] + (121--160) THEN + SUBGOAL_THEN + `&(read (memory :> bytes (word_add t (word 128),8 * 16)) s160):real = + abs(&bhi - &blo)` + ASSUME_TAC THENL + [REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 16`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN + MAP_EVERY EXPAND_TAC ["bhi"; "blo"] THEN + CONJ_TAC THEN MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &y < &x then &x - &y else &y - &x`] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + MAP_EVERY EXPAND_TAC ["bhi"; "blo"] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_XOR_MASK] THEN + ASM_CASES_TAC `carry_s118:bool` THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + REWRITE_TAC[REAL_VAL_WORD_NOT; DIMINDEX_64] THEN +
DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `y:int64` o concl))) THEN + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN + + (*** The combined sign ***) + + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC [161] THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_XOR_MASKS]) THEN + ABBREV_TAC `sgn <=> ~(carry_s118 <=> carry_s44)` THEN + + (*** Split L into L_top and L_bot and form H' = H + L_top ***) + + MP_TAC(ISPECL [`z:int64`; `16`; `16`; `s161:armstate`] + BIGNUM_FROM_MEMORY_SPLIT) THEN + CONV_TAC(LAND_CONV(ONCE_DEPTH_CONV(NUM_ADD_CONV ORELSEC NUM_MULT_CONV))) THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN DISCH_THEN SUBST_ALL_TAC THEN + MAP_EVERY ABBREV_TAC + [`ltop = read (memory :> bytes (word_add z (word 128),8 * 16)) s161`; + `lbot = read (memory :> bytes (z,8 * 16)) s161`; + `h = read (memory :> bytes (word_add z (word 256),8 * 32)) s161`] THEN + + BIGNUM_LDIGITIZE_TAC "ltop_" + `read (memory :> bytes (word_add z (word 128),8 * 16)) s161` THEN + BIGNUM_LDIGITIZE_TAC "h_" + `read (memory :> bytes (word_add z (word 256),8 * 32)) s161` THEN + + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC + [164; 165; 169; 170; 174; 175; 179; 180; 184; 185; 189; 190; 194; 195; 199; + 200; 203; 204; 207; 208; 211; 212; 215; 216; 219; 220; 223; 224; 227; 228; + 231; 232] + (162--233) THEN + + SUBGOAL_THEN `bignum_from_memory(word_add z (word 256),32) s233 = h + ltop` + MP_TAC THENL + [REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 32`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [ASM_REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + MAP_EVERY EXPAND_TAC ["ahi"; "bhi"; "ltop"] THEN + MATCH_MP_TAC(ARITH_RULE + `x <= (2 EXP (64 * 16) - 1) * (2 EXP (64 * 16) - 1) /\ + y + (2 EXP 1024 - 1) EXP 2 < e + ==> x + y <
e`) THEN + CONJ_TAC THENL + [MATCH_MP_TAC LE_MULT2 THEN CONJ_TAC THEN + MATCH_MP_TAC(ARITH_RULE `x < e ==> x <= e - 1`) THEN + MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + MAP_EVERY EXPAND_TAC ["h"; "ltop"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN + + (*** Throw away h and digitizations, use h in place of h' now ***) + + DISCARD_MATCHING_ASSUMPTIONS [`read (memory :> bytes64 x) s = y`] THEN + UNDISCH_THEN `h = ahi * bhi` SUBST1_TAC THEN + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `h:num` o concl))) THEN + ABBREV_TAC `h = ahi * bhi + ltop` THEN DISCH_TAC THEN + + (*** Third and final nested multiplication: absolute differences ***) + + ABBREV_TAC `adiff = read (memory :> bytes (t,8 * 16)) s233` THEN + ABBREV_TAC + `bdiff = read (memory :> bytes (word_add t (word 128),8 * 16)) s233` THEN + ARM_STEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC (234--238) THEN + LOCAL_KMUL_16_32_NEON_TAC 239 THEN + + (*** All remaining accumulation of sub-results ***) + + BIGNUM_LDIGITIZE_TAC "l_" `read (memory :> bytes (z,8 * 16)) s239` THEN + BIGNUM_LDIGITIZE_TAC "m_" + `read (memory :> bytes (word_add t (word 256),8 * 32)) s239` THEN + BIGNUM_LDIGITIZE_TAC "h_" + `read (memory :> bytes (word_add z (word 256),8 * 32)) s239` THEN + ARM_ACCSTEPS_TAC BIGNUM_KMUL_32_64_NEON_EXEC +
[242;243;247;248;252;253;257;258;262;263;267;268;272;273;277;278;282; + 283;287;288;292;293;297;298;302;303;307;308;312;313;317;318;325;327; + 332;334;339;341;346;348;353;355;360;362;367;369;374;376;381;383;388; + 390;395;397;402;404;409;411;416;418;423;425;430;432;434;435;437;438; + 441;442;445;446;449;450;453;454;457;458;461;462;465;466] + (240--467) THEN + + (*** The Karatsuba rearrangement ***) +(* After the two bound side-conditions, a * b is rewritten as (lbot + 2^1024 * h) * (2^1024 + 1) + 2^1024 * (-1)^(bitval sgn) * (adiff * bdiff); the finale then checks this form against the accumulated digit and carry equations. *) + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 64`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + REWRITE_TAC[EXP_ADD; ARITH_RULE `64 * 64 = 64 * 32 + 64 * 32`] THEN + ASM_SIMP_TAC[LT_MULT2]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + SUBGOAL_THEN + `(&a:real) * &b = + (&lbot + &2 pow 1024 * &h) * (&2 pow 1024 + &1) + + &2 pow 1024 * --(&1) pow bitval sgn * &(adiff * bdiff)` + SUBST1_TAC THENL + [ASM_REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; REAL_ARITH + `abs(x - y:real) = if y < x then x - y else y - x`] THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT] THEN + MAP_EVERY UNDISCH_TAC + [`2 EXP 1024 * ltop + lbot = alo * blo`; + `ahi * bhi + ltop:num = h`] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + EXPAND_TAC "sgn" THEN POP_ASSUM_LIST(K ALL_TAC) THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + CONV_TAC REAL_RING; + ALL_TAC] THEN + + (*** The finale ***) + + REPEAT(FIRST_X_ASSUM(SUBST1_TAC o SYM o + check(can (term_match [] `bignum_of_wordlist l = a`) o concl))) THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_LEXPAND_CONV) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN +
ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_XOR_MASK] THEN POP_ASSUM_LIST(K ALL_TAC) THEN + BOOL_CASES_TAC `sgn:bool` THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + REWRITE_TAC[COND_SWAP; GSYM WORD_BITVAL; VAL_WORD_BITVAL] THEN STRIP_TAC THEN + ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o + filter (free_in `carry_s434:bool` o concl)) + THENL + [ASM_CASES_TAC `carry_s434:bool` THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]; + ALL_TAC] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; diff --git a/arm/proofs/bignum_ksqr_16_32_neon.ml b/arm/proofs/bignum_ksqr_16_32_neon.ml new file mode 100644 index 00000000..a27a6310 --- /dev/null +++ b/arm/proofs/bignum_ksqr_16_32_neon.ml @@ -0,0 +1,990 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC + *) + +(* ========================================================================= *) +(* 16x16 -> 32 squaring, using Karatsuba reduction.
*) +(* ========================================================================= *) + +(**** print_literal_from_elf "arm/fastmul/bignum_ksqr_16_32_neon.o";; + ****) + +let bignum_ksqr_16_32_neon_mc = define_assert_from_elf + "bignum_ksqr_16_32_neon_mc" "arm/fastmul/bignum_ksqr_16_32_neon.o" +[ + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf5bf5; (* arm_STP X21 X22 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf63f7; (* arm_STP X23 X24 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf7bf9; (* arm_STP X25 X30 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xaa0003f7; (* arm_MOV X23 X0 *) + 0xaa0103f8; (* arm_MOV X24 X1 *) + 0xaa0203f9; (* arm_MOV X25 X2 *) + 0x940000a9; (* arm_BL (word 676) *) + 0xa9402f0a; (* arm_LDP X10 X11 X24 (Immediate_Offset (iword (&0))) *) + 0xa9442708; (* arm_LDP X8 X9 X24 (Immediate_Offset (iword (&64))) *) + 0xeb08014a; (* arm_SUBS X10 X10 X8 *) + 0xfa09016b; (* arm_SBCS X11 X11 X9 *) + 0xa941370c; (* arm_LDP X12 X13 X24 (Immediate_Offset (iword (&16))) *) + 0xa9452708; (* arm_LDP X8 X9 X24 (Immediate_Offset (iword (&80))) *) + 0xfa08018c; (* arm_SBCS X12 X12 X8 *) + 0xfa0901ad; (* arm_SBCS X13 X13 X9 *) + 0xa9423f0e; (* arm_LDP X14 X15 X24 (Immediate_Offset (iword (&32))) *) + 0xa9462708; (* arm_LDP X8 X9 X24 (Immediate_Offset (iword (&96))) *) + 0xfa0801ce; (* arm_SBCS X14 X14 X8 *) + 0xfa0901ef; (* arm_SBCS X15 X15 X9 *) + 0xa9434710; (* arm_LDP X16 X17 X24 (Immediate_Offset (iword (&48))) *) + 0xa9472708; (* arm_LDP X8 X9 X24 (Immediate_Offset (iword (&112))) *) + 0xfa080210; (* arm_SBCS X16 X16 X8 *) + 0xfa090231; (* arm_SBCS X17 X17 X9 *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xab13027f; (* arm_CMN X19 X19 *) + 0xca13014a; (* arm_EOR X10 X10 X19 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xca13016b; (* arm_EOR X11 X11 X19 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa9002f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&0))) *) + 0xca13018c; (* arm_EOR X12 X12 X19 
*) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1301ad; (* arm_EOR X13 X13 X19 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa901372c; (* arm_STP X12 X13 X25 (Immediate_Offset (iword (&16))) *) + 0xca1301ce; (* arm_EOR X14 X14 X19 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1301ef; (* arm_EOR X15 X15 X19 *) + 0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xa9023f2e; (* arm_STP X14 X15 X25 (Immediate_Offset (iword (&32))) *) + 0xca130210; (* arm_EOR X16 X16 X19 *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xca130231; (* arm_EOR X17 X17 X19 *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0xa9034730; (* arm_STP X16 X17 X25 (Immediate_Offset (iword (&48))) *) + 0x910202e0; (* arm_ADD X0 X23 (rvalue (word 128)) *) + 0x91010301; (* arm_ADD X1 X24 (rvalue (word 64)) *) + 0x94000080; (* arm_BL (word 512) *) + 0xa9482eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&128))) *) + 0xa94436ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&64))) *) + 0xab0c014a; (* arm_ADDS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa9082eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&128))) *) + 0xa9492eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&144))) *) + 0xa94536ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&80))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa9092eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&144))) *) + 0xa94a2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&160))) *) + 0xa94636ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&96))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa90a2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&160))) *) + 0xa94b2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&176))) *) + 0xa94736ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&112))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa90b2eea; (* arm_STP X10 X11 X23 
(Immediate_Offset (iword (&176))) *) + 0xa94c2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&192))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90c2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&192))) *) + 0xa94d2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&208))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90d2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&208))) *) + 0xa94e2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&224))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90e2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&224))) *) + 0xa94f2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&240))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90f2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&240))) *) + 0x91010320; (* arm_ADD X0 X25 (rvalue (word 64)) *) + 0xaa1903e1; (* arm_MOV X1 X25 *) + 0x94000059; (* arm_BL (word 356) *) + 0xa94006e0; (* arm_LDP X0 X1 X23 (Immediate_Offset (iword (&0))) *) + 0xa94846f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&128))) *) + 0xab100000; (* arm_ADDS X0 X0 X16 *) + 0xba110021; (* arm_ADCS X1 X1 X17 *) + 0xa9410ee2; (* arm_LDP X2 X3 X23 (Immediate_Offset (iword (&16))) *) + 0xa94946f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&144))) *) + 0xba100042; (* arm_ADCS X2 X2 X16 *) + 0xba110063; (* arm_ADCS X3 X3 X17 *) + 0xa94216e4; (* arm_LDP X4 X5 X23 (Immediate_Offset (iword (&32))) *) + 0xa94a46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&160))) *) + 0xba100084; (* arm_ADCS X4 X4 X16 *) + 0xba1100a5; (* arm_ADCS X5 X5 X17 *) + 0xa9431ee6; (* arm_LDP X6 X7 X23 (Immediate_Offset (iword (&48))) *) + 0xa94b46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&176))) *) + 0xba1000c6; (* arm_ADCS X6 X6 X16 *) + 0xba1100e7; (* arm_ADCS X7 X7 X17 *) + 0xa94826e8; (* arm_LDP 
X8 X9 X23 (Immediate_Offset (iword (&128))) *) + 0xa94c46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&192))) *) + 0xba100108; (* arm_ADCS X8 X8 X16 *) + 0xba110129; (* arm_ADCS X9 X9 X17 *) + 0xa9492eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&144))) *) + 0xa94d46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&208))) *) + 0xba10014a; (* arm_ADCS X10 X10 X16 *) + 0xba11016b; (* arm_ADCS X11 X11 X17 *) + 0xa94a36ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&160))) *) + 0xa94e46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&224))) *) + 0xba10018c; (* arm_ADCS X12 X12 X16 *) + 0xba1101ad; (* arm_ADCS X13 X13 X17 *) + 0xa94b3eee; (* arm_LDP X14 X15 X23 (Immediate_Offset (iword (&176))) *) + 0xa94f46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&240))) *) + 0xba1001ce; (* arm_ADCS X14 X14 X16 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0x9a9f37f8; (* arm_CSET X24 Condition_CS *) + 0xa9444730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&64))) *) + 0xeb100000; (* arm_SUBS X0 X0 X16 *) + 0xfa110021; (* arm_SBCS X1 X1 X17 *) + 0xa90406e0; (* arm_STP X0 X1 X23 (Immediate_Offset (iword (&64))) *) + 0xa9454730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&80))) *) + 0xfa100042; (* arm_SBCS X2 X2 X16 *) + 0xfa110063; (* arm_SBCS X3 X3 X17 *) + 0xa9050ee2; (* arm_STP X2 X3 X23 (Immediate_Offset (iword (&80))) *) + 0xa9464730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&96))) *) + 0xfa100084; (* arm_SBCS X4 X4 X16 *) + 0xfa1100a5; (* arm_SBCS X5 X5 X17 *) + 0xa90616e4; (* arm_STP X4 X5 X23 (Immediate_Offset (iword (&96))) *) + 0xa9474730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&112))) *) + 0xfa1000c6; (* arm_SBCS X6 X6 X16 *) + 0xfa1100e7; (* arm_SBCS X7 X7 X17 *) + 0xa9071ee6; (* arm_STP X6 X7 X23 (Immediate_Offset (iword (&112))) *) + 0xa9484730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&128))) *) + 0xfa100108; (* arm_SBCS X8 X8 X16 *) + 0xfa110129; (* arm_SBCS X9 X9 X17 *) + 0xa90826e8; (* 
arm_STP X8 X9 X23 (Immediate_Offset (iword (&128))) *) + 0xa9494730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&144))) *) + 0xfa10014a; (* arm_SBCS X10 X10 X16 *) + 0xfa11016b; (* arm_SBCS X11 X11 X17 *) + 0xa9092eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&144))) *) + 0xa94a4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&160))) *) + 0xfa10018c; (* arm_SBCS X12 X12 X16 *) + 0xfa1101ad; (* arm_SBCS X13 X13 X17 *) + 0xa90a36ec; (* arm_STP X12 X13 X23 (Immediate_Offset (iword (&160))) *) + 0xa94b4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&176))) *) + 0xfa1001ce; (* arm_SBCS X14 X14 X16 *) + 0xfa1101ef; (* arm_SBCS X15 X15 X17 *) + 0xa90b3eee; (* arm_STP X14 X15 X23 (Immediate_Offset (iword (&176))) *) + 0xfa1f0318; (* arm_SBCS X24 X24 XZR *) + 0xda9f23f9; (* arm_CSETM X25 Condition_CC *) + 0xa94c2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&192))) *) + 0xab18014a; (* arm_ADDS X10 X10 X24 *) + 0xba19016b; (* arm_ADCS X11 X11 X25 *) + 0xa90c2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&192))) *) + 0xa94d2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&208))) *) + 0xba19014a; (* arm_ADCS X10 X10 X25 *) + 0xba19016b; (* arm_ADCS X11 X11 X25 *) + 0xa90d2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&208))) *) + 0xa94e2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&224))) *) + 0xba19014a; (* arm_ADCS X10 X10 X25 *) + 0xba19016b; (* arm_ADCS X11 X11 X25 *) + 0xa90e2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&224))) *) + 0xa94f2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&240))) *) + 0xba19014a; (* arm_ADCS X10 X10 X25 *) + 0xba19016b; (* arm_ADCS X11 X11 X25 *) + 0xa90f2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&240))) *) + 0xa8c17bf9; (* arm_LDP X25 X30 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c163f7; (* arm_LDP X23 X24 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c15bf5; (* arm_LDP X21 X22 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* 
arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0; (* arm_RET X30 *) + 0xa9400c22; (* arm_LDP X2 X3 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc00034; (* arm_LDR Q20 X1 (Immediate_Offset (word 0)) *) + 0xa9411424; (* arm_LDP X4 X5 X1 (Immediate_Offset (iword (&16))) *) + 0x3dc00435; (* arm_LDR Q21 X1 (Immediate_Offset (word 16)) *) + 0xa9421c26; (* arm_LDP X6 X7 X1 (Immediate_Offset (iword (&32))) *) + 0x3dc00836; (* arm_LDR Q22 X1 (Immediate_Offset (word 32)) *) + 0xa9432428; (* arm_LDP X8 X9 X1 (Immediate_Offset (iword (&48))) *) + 0x3dc00c37; (* arm_LDR Q23 X1 (Immediate_Offset (word 48)) *) + 0x6f00e5fe; (* arm_MOVI Q30 (word 4294967295) *) + 0x9b047c51; (* arm_MUL X17 X2 X4 *) + 0x9b057c6e; (* arm_MUL X14 X3 X5 *) + 0x6e144281; (* arm_EXT Q1 Q20 Q20 64 *) + 0x9bc47c54; (* arm_UMULH X20 X2 X4 *) + 0x0f208682; (* arm_SHRN Q2 Q20 32 32 *) + 0xeb030055; (* arm_SUBS X21 X2 X3 *) + 0x0e813a80; (* arm_ZIP1 Q0 Q20 Q1 32 64 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x2ea2c045; (* arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0xda9f23eb; (* arm_CSETM X11 Condition_CC *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xeb0400ac; (* arm_SUBS X12 X5 X4 *) + 0x2ea0c003; (* arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0xda8c258c; (* arm_CNEG X12 X12 Condition_CC *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9b0c7ead; (* arm_MUL X13 X21 X12 *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0x9bcc7eac; (* arm_UMULH X12 X21 X12 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xda8b216b; (* arm_CINV X11 X11 Condition_CC *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xca0b01ad; (* arm_EOR X13 X13 X11 *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0xca0b018c; (* arm_EOR X12 X12 X11 *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xab140233; (* arm_ADDS X19 X17 X20 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0x9a1f0294; (* arm_ADC X20 X20 XZR *) + 0x6e1542a1; (* arm_EXT Q1 Q21 Q21 64 *) + 0x9bc57c75; (* arm_UMULH X21 X3 X5 *) + 0x0f2086a2; 
(* arm_SHRN Q2 Q21 32 32 *) + 0xab0e0273; (* arm_ADDS X19 X19 X14 *) + 0x0e813aa0; (* arm_ZIP1 Q0 Q21 Q1 32 64 *) + 0xba150294; (* arm_ADCS X20 X20 X21 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xab0e0294; (* arm_ADDS X20 X20 X14 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xb100057f; (* arm_CMN X11 (rvalue (word 1)) *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0c0294; (* arm_ADCS X20 X20 X12 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0x9a0b02b5; (* arm_ADC X21 X21 X11 *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0xab110231; (* arm_ADDS X17 X17 X17 *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xba130273; (* arm_ADCS X19 X19 X19 *) + 0x2ea2c045; (* arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0xba140294; (* arm_ADCS X20 X20 X20 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xba1502b5; (* arm_ADCS X21 X21 X21 *) + 0x2ea0c003; (* arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0x9a1f03ea; (* arm_ADC X10 XZR XZR *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9b037c4f; (* arm_MUL X15 X2 X3 *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0x9bc37c50; (* arm_UMULH X16 X2 X3 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa9002c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&0))) *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xab0d0231; (* arm_ADDS X17 X17 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0xba1f0294; (* arm_ADCS X20 X20 XZR *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0xba1f02b5; (* arm_ADCS X21 X21 XZR *) + 
0x6e1642c1; (* arm_EXT Q1 Q22 Q22 64 *) + 0x9a1f014a; (* arm_ADC X10 X10 XZR *) + 0x0f2086c2; (* arm_SHRN Q2 Q22 32 32 *) + 0xa9014c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&16))) *) + 0x0e813ac0; (* arm_ZIP1 Q0 Q22 Q1 32 64 *) + 0x9b057c8f; (* arm_MUL X15 X4 X5 *) + 0x2ea2c045; (* arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0x9bc57c90; (* arm_UMULH X16 X4 X5 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x2ea0c003; (* arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0xa9022c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&32))) *) + 0x6e1742e1; (* arm_EXT Q1 Q23 Q23 64 *) + 0xba0a01ad; (* arm_ADCS X13 X13 X10 *) + 0x0f2086e2; (* arm_SHRN Q2 Q23 32 32 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x0e813ae0; (* arm_ZIP1 Q0 Q23 Q1 32 64 *) + 0xa903380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&48))) *) + 0x9b087cd1; (* arm_MUL X17 X6 X8 *) + 0x2ea2c050; (* arm_UMULL_VEC Q16 Q2 Q2 32 *) + 0x9b097cee; (* arm_MUL X14 X7 X9 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0x9bc87cd4; (* arm_UMULH X20 X6 X8 *) + 0x2ea0c012; (* arm_UMULL_VEC Q18 Q0 Q0 32 *) + 0xeb0700d5; (* arm_SUBS X21 X6 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0xda9f23eb; (* arm_CSETM X11 Condition_CC *) + 0xeb08012c; (* arm_SUBS X12 X9 X8 *) + 0xda8c258c; (* arm_CNEG X12 X12 Condition_CC *) + 0x6f601641; 
(* arm_USRA_VEC Q1 Q18 32 64 128 *) + 0x9b0c7ead; (* arm_MUL X13 X21 X12 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0x9bcc7eac; (* arm_UMULH X12 X21 X12 *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xda8b216b; (* arm_CINV X11 X11 Condition_CC *) + 0xca0b01ad; (* arm_EOR X13 X13 X11 *) + 0xca0b018c; (* arm_EOR X12 X12 X11 *) + 0x6f601490; (* arm_USRA_VEC Q16 Q4 32 64 128 *) + 0xab140233; (* arm_ADDS X19 X17 X20 *) + 0x9a1f0294; (* arm_ADC X20 X20 XZR *) + 0x6f605492; (* arm_SLI_VEC Q18 Q4 32 64 *) + 0x9bc97cf5; (* arm_UMULH X21 X7 X9 *) + 0xab0e0273; (* arm_ADDS X19 X19 X14 *) + 0xba150294; (* arm_ADCS X20 X20 X21 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xab0e0294; (* arm_ADDS X20 X20 X14 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xb100057f; (* arm_CMN X11 (rvalue (word 1)) *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0c0294; (* arm_ADCS X20 X20 X12 *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0x9a0b02b5; (* arm_ADC X21 X21 X11 *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xab110231; (* arm_ADDS X17 X17 X17 *) + 0xba130273; (* arm_ADCS X19 X19 X19 *) + 0x6f601430; (* arm_USRA_VEC Q16 Q1 32 64 128 *) + 0xba140294; (* arm_ADCS X20 X20 X20 *) + 0xba1502b5; (* arm_ADCS X21 X21 X21 *) + 0x9a1f03ea; (* arm_ADC X10 XZR XZR *) + 0x4e975ab1; (* arm_UZP2 Q17 Q21 Q23 32 *) + 0x9b077ccf; (* arm_MUL X15 X6 X7 *) + 0x0ea12ae4; (* arm_XTN Q4 Q23 32 *) + 0x9bc77cd0; (* arm_UMULH X16 X6 X7 *) + 0x4e083e16; (* arm_UMOV X22 Q16 0 8 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x0ea12aa5; (* arm_XTN Q5 Q21 32 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x4ea00aa1; (* arm_REV64_VEC Q1 Q21 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa9042c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&64))) *) + 0xab0d0231; (* arm_ADDS X17 X17 X13 *) + 
0x4e183e4d; (* arm_UMOV X13 Q18 1 8 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0x4e183e0e; (* arm_UMOV X14 Q16 1 8 *) + 0xba1f0294; (* arm_ADCS X20 X20 XZR *) + 0x4e083e4c; (* arm_UMOV X12 Q18 0 8 *) + 0xba1f02b5; (* arm_ADCS X21 X21 XZR *) + 0x9a1f014a; (* arm_ADC X10 X10 XZR *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0xa9054c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&80))) *) + 0x2eb1c087; (* arm_UMULL_VEC Q7 Q4 Q17 32 *) + 0x9b097d0f; (* arm_MUL X15 X8 X9 *) + 0x4e975af0; (* arm_UZP2 Q16 Q23 Q23 32 *) + 0x9bc97d10; (* arm_UMULH X16 X8 X9 *) + 0x4eb79c20; (* arm_MUL_VEC Q0 Q1 Q23 32 *) + 0xab0f02cb; (* arm_ADDS X11 X22 X15 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x2eb1c201; (* arm_UMULL_VEC Q1 Q16 Q17 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0x4e3e1ce2; (* arm_AND_VEC Q2 Q7 Q30 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x6f6014e1; (* arm_USRA_VEC Q1 Q7 32 64 128 *) + 0x2ea58080; (* arm_UMLAL_VEC Q0 Q4 Q5 32 *) + 0x4e183c10; (* arm_UMOV X16 Q0 1 8 *) + 0x4e083c0f; (* arm_UMOV X15 Q0 0 8 *) + 0x6f601441; (* arm_USRA_VEC Q1 Q2 32 64 128 *) + 0x4e083c34; (* arm_UMOV X20 Q1 0 8 *) + 0x4e183c35; (* arm_UMOV X21 Q1 1 8 *) + 0xa9062c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&96))) *) + 0xba0a01ad; (* arm_ADCS X13 X13 X10 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa907380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&112))) *) + 0x9b067c4a; (* arm_MUL X10 X2 X6 *) + 0x9b077c6e; (* arm_MUL X14 X3 X7 *) + 0x9bc67c51; (* arm_UMULH X17 X2 X6 *) + 0xab1101ce; (* arm_ADDS X14 X14 X17 *) + 0x9bc77c71; (* arm_UMULH X17 X3 X7 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0xba140210; 
(* arm_ADCS X16 X16 X20 *) + 0x9a1f02b1; (* arm_ADC X17 X21 XZR *) + 0xab0a01cb; (* arm_ADDS X11 X14 X10 *) + 0xba0e01ee; (* arm_ADCS X14 X15 X14 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0x9a1103f1; (* arm_ADC X17 XZR X17 *) + 0xab0a01cc; (* arm_ADDS X12 X14 X10 *) + 0xba0b01ed; (* arm_ADCS X13 X15 X11 *) + 0xba0e020e; (* arm_ADCS X14 X16 X14 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba1003f0; (* arm_ADCS X16 XZR X16 *) + 0x9a1103f1; (* arm_ADC X17 XZR X17 *) + 0xeb050096; (* arm_SUBS X22 X4 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb080134; (* arm_SUBS X20 X9 X8 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb030056; (* arm_SUBS X22 X2 X3 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb0600f4; (* arm_SUBS X20 X7 X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba14018c; (* arm_ADCS X12 X12 X20 *) + 0xba1301ad; (* arm_ADCS X13 X13 X19 *) + 0xba1301ce; (* arm_ADCS X14 X14 X19 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb050076; (* arm_SUBS X22 X3 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 
0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb070134; (* arm_SUBS X20 X9 X7 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb040056; (* arm_SUBS X22 X2 X4 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb060114; (* arm_SUBS X20 X8 X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba15018c; (* arm_ADCS X12 X12 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ad; (* arm_ADCS X13 X13 X20 *) + 0xba1301ce; (* arm_ADCS X14 X14 X19 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb050056; (* arm_SUBS X22 X2 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb060134; (* arm_SUBS X20 X9 X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb040076; (* 
arm_SUBS X22 X3 X4 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb070114; (* arm_SUBS X20 X8 X7 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xab0a014a; (* arm_ADDS X10 X10 X10 *) + 0xba0b016b; (* arm_ADCS X11 X11 X11 *) + 0xba0c018c; (* arm_ADCS X12 X12 X12 *) + 0xba0d01ad; (* arm_ADCS X13 X13 X13 *) + 0xba0e01ce; (* arm_ADCS X14 X14 X14 *) + 0xba0f01ef; (* arm_ADCS X15 X15 X15 *) + 0xba100210; (* arm_ADCS X16 X16 X16 *) + 0xba110231; (* arm_ADCS X17 X17 X17 *) + 0x9a1f03f3; (* arm_ADC X19 XZR XZR *) + 0xa9420c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&32))) *) + 0xab02014a; (* arm_ADDS X10 X10 X2 *) + 0xba03016b; (* arm_ADCS X11 X11 X3 *) + 0xa9022c0a; (* arm_STP X10 X11 X0 (Immediate_Offset (iword (&32))) *) + 0xa9430c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&48))) *) + 0xba02018c; (* arm_ADCS X12 X12 X2 *) + 0xba0301ad; (* arm_ADCS X13 X13 X3 *) + 0xa903340c; (* arm_STP X12 X13 X0 (Immediate_Offset (iword (&48))) *) + 0xa9440c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&64))) *) + 0xba0201ce; (* arm_ADCS X14 X14 X2 *) + 0xba0301ef; (* arm_ADCS X15 X15 X3 *) + 0xa9043c0e; (* arm_STP X14 X15 X0 (Immediate_Offset (iword (&64))) *) + 0xa9450c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&80))) *) + 0xba020210; (* arm_ADCS X16 X16 X2 *) + 0xba030231; (* arm_ADCS X17 X17 X3 *) + 0xa9054410; (* arm_STP X16 X17 X0 (Immediate_Offset (iword (&80))) *) + 0xa9460c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&96))) *) 
+ 0xba130042; (* arm_ADCS X2 X2 X19 *) + 0xba1f0063; (* arm_ADCS X3 X3 XZR *) + 0xa9060c02; (* arm_STP X2 X3 X0 (Immediate_Offset (iword (&96))) *) + 0xa9470c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&112))) *) + 0xba1f0042; (* arm_ADCS X2 X2 XZR *) + 0x9a1f0063; (* arm_ADC X3 X3 XZR *) + 0xa9070c02; (* arm_STP X2 X3 X0 (Immediate_Offset (iword (&112))) *) + 0xd65f03c0 (* arm_RET X30 *) +];; +(* Result of applying ARM_MK_EXEC_RULE to the machine code above; it is handed to the simulation tactics (ARM_STEPS_TAC, ARM_ACCSTEPS_TAC, ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC, ARM_SUBROUTINE_SIM_TAC, ARM_ADD_RETURN_STACK_TAC) in the proofs below. *) +let BIGNUM_KSQR_16_32_NEON_EXEC = ARM_MK_EXEC_RULE bignum_ksqr_16_32_neon_mc;; + +(* ------------------------------------------------------------------------- *) +(* First of all the correctness lemma for the embedded bignum_sqr_8_16 *) +(* ------------------------------------------------------------------------- *) +(* lemma1: the nested conditional that selects word 0 or word 18446744073709551615 (that is, 2^64 - 1) according to x1 <= x0, and complements the result with word_not when y0 <= y1 fails, equals word_neg(word(bitval(y0 <= y1 <=> x0 < x1))). Used as a rewrite rule in ADK_48_TAC. *) +let lemma1 = prove + (`!(x0:num) x1 (y0:num) y1. + (if y0 <= y1 + then if x1 <= x0 then word 0 else word 18446744073709551615 + else word_not + (if x1 <= x0 then word 0 else word 18446744073709551615)):int64 = + word_neg(word(bitval(y0 <= y1 <=> x0 < x1)))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + CONV_TAC WORD_REDUCE_CONV);; +(* lemma2: over the reals, the product of the values of the two conditionally negated 64-bit word differences (word_sub x0 x1 and word_sub y1 y0) equals --(&1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) times (&(val x0) - &(val x1)) times (&(val y1) - &(val y0)). Used as a rewrite rule in ADK_48_TAC. *) +let lemma2 = prove + (`!(x0:int64) (x1:int64) (y0:int64) (y1:int64). 
+ &(val(if val x1 <= val x0 then word_sub x0 x1 + else word_neg (word_sub x0 x1))) * + &(val(if val y0 <= val y1 then word_sub y1 y0 + else word_neg (word_sub y1 y0))):real = + --(&1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) * + (&(val x0) - &(val x1)) * (&(val y1) - &(val y0))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE; WORD_NEG_SUB] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o MATCH_MP (ARITH_RULE + `~(m:num <= n) ==> n <= m /\ ~(m <= n)`))) THEN + ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN + REAL_ARITH_TAC);; +(* ADK_48_TAC: common closing tactic for the three nested 4x4 subgoals of BIGNUM_KSQR_16_32_NEON_LEMMA (two squarings and one multiplication). It applies EQUAL_FROM_CONGRUENT_REAL with witnesses 512 and &0, settles the two bound side conditions with BOUNDER_TAC and the integrality one with REAL_INTEGER_TAC, then rewrites the accumulator equations with lemma1, lemma2 and WORD_XOR_MASK, case-splits on the remaining conditionals, and finishes by substituting the DESUM_RULE and DECARRY_RULE forms of those equations. *) +let ADK_48_TAC = + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`512`; `&0:real`] THEN + REPLICATE_TAC 2 (CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC]) THEN + CONJ_TAC THENL [REAL_INTEGER_TAC; ALL_TAC] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[lemma1; lemma2] THEN REWRITE_TAC[WORD_XOR_MASK] THEN + REPEAT(COND_CASES_TAC THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT]) THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES; DIMINDEX_64] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC;; + +needs "arm/proofs/neon_helper.ml";; +(* Correctness of the embedded 8-digit squaring subroutine, entered at pc + 0x2c0 with arguments z and x and the return address in X30: if a is the 8-digit bignum at x, execution reaches the return address with the 16-digit bignum at z equal to a EXP 2. The z buffer (8 * 16 bytes) must not overlap the code (2164 bytes at pc) or the input (8 * 8 bytes at x). *) +let BIGNUM_KSQR_16_32_NEON_LEMMA = prove + (`!z x a pc returnaddress. + ALL (nonoverlapping (z,8 * 16)) + [(word pc,2164); (x,8 * 8)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word(pc + 0x0)) bignum_ksqr_16_32_neon_mc /\ + read PC s = word(pc + 0x2c0) /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,8) s = a) + (\s. 
read PC s = returnaddress /\ + bignum_from_memory (z,16) s = a EXP 2) + (MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; + X13; X14; X15; X16; X17; X19; X20; X21; X22] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5; Q6; Q7; Q16; Q17; Q18; Q19; Q20; + Q21; Q22; Q23; Q30] ,, + MAYCHANGE [memory :> bytes(z,8 * 16)] ,, + MAYCHANGE SOME_FLAGS)`, + REWRITE_TAC[ADD_CLAUSES] THEN + + MAP_EVERY X_GEN_TAC [`z:int64`; `x:int64`; `a:num`; `pc:num`; `returnaddress:int64`] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; ALL; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_DIGITIZE_TAC "x_" `bignum_from_memory (x,8) s0` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 x) s0` `x_1:(64)word` `x_0:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 16:(64)word))) s0` + `x_3:(64)word` `x_2:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 32:(64)word))) s0` + `x_5:(64)word` `x_4:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 48:(64)word))) s0` + `x_7:(64)word` `x_6:(64)word` THEN + + (*** First nested mini-ADK 4x4 squaring block ***) + + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC + [49;51;53;55;79;81;83;85] + [WORD_SQR64_HI; WORD_SQR64_LO] + [10;11;25;35;37;41;43;44;45;46;48;49;50;52;53;54;56;58;60;62; + 64;68;70;72;74;76;77;80;81;82;84;85;86;88;92;96;98;100;102;104; + 106;108;110;114;116] + (1--118) THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_0;x_1;x_2;x_3] EXP 2 = + bignum_of_wordlist [mullo_s53; sum_s74; sum_s80; sum_s82; + sum_s108; sum_s110; sum_s114; sum_s116]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Second nested mini-ADK 4x4 squaring 
block ***) + + + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC + [148;152;154;156;167;178;180;182] + [WORD_SQR64_HI; WORD_SQR64_LO] + [119;121;132;140;141;144;145;146;147;149;151;153;155;157;158;160;161; + 162;154;152;164;168;169;171;172;174;175;177;179;181;183;184;182;178; + 188;192;193;195;196;198;199;201;202] + (119--207) THEN + RULE_ASSUM_TAC (REWRITE_RULE + [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_LO]) THEN + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC + [208;209] [WORD_SQR64_HI; WORD_SQR64_LO] [208;209] + (208--210) THEN + RULE_ASSUM_TAC (REWRITE_RULE + [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_HI]) THEN + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC + [211;212] [WORD_SQR64_HI; WORD_SQR64_LO] [211;212;214;215] + (211--216) THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_4;x_5;x_6;x_7] EXP 2 = + bignum_of_wordlist [mullo_s154;sum_s172;sum_s177;sum_s179; + sum_s201;sum_s202;sum_s214;sum_s215]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + let is_acc_thm_for_next acc_thm = + List.exists (contains_str (string_of_term (concl acc_thm))) + ["208";"209";"211";"212"] in + let filter_acc_thms_for_next acc_thms = + List.filter is_acc_thm_for_next acc_thms in + let wpat = `word a = b` in + ACCUMULATOR_POP_ASSUM_LIST( + fun acc_thms -> + let acc_thms = filter_acc_thms_for_next acc_thms in + List.iter (fun t -> Printf.printf "assuming: %s\n" t) + (List.map string_of_thm acc_thms); + MAP_EVERY ASSUME_TAC acc_thms) THEN + DISCARD_ASSUMPTIONS_TAC + (fun th -> can (term_match [] wpat) (concl th) && + not (is_acc_thm_for_next th))] THEN + + (*** Nested ADK 4x4 multiplication block ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC + [217;218;220;222;223;224;225;226;227;228;229;230;231;232;233; + 234;235;241;246;248;249;255;260;262;263;264;265;266;267;273;278; + 
280;281;282;288;293;295;296;297;298;299;305;310;312;313;314;315; + 321;326;328;329;330;331] + (217--331) THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [x_4;x_5;x_6;x_7] = + bignum_of_wordlist + [mullo_s217; sum_s260; sum_s293; sum_s326; + sum_s328; sum_s329; sum_s330; sum_s331]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Final accumulation simulation and 16-digit focusing ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC (332--364) (332--365) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + DISCARD_STATE_TAC "s365" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`1024`; `&0:real`] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC] THEN CONJ_TAC THENL + [EXPAND_TAC "a" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + + (*** The core rearrangement we are using ***) + + SUBGOAL_THEN + `(&a:real) pow 2 = + &(bignum_of_wordlist [x_0;x_1;x_2;x_3] EXP 2) + + &2 pow 512 * &(bignum_of_wordlist [x_4;x_5;x_6;x_7] EXP 2) + + &2 pow 257 * &(bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [x_4;x_5;x_6;x_7])` + SUBST1_TAC THENL + [EXPAND_TAC "a" THEN + REWRITE_TAC[bignum_of_wordlist; REAL_OF_NUM_CLAUSES] THEN ARITH_TAC; + ASM_REWRITE_TAC[]] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; + +let BIGNUM_KSQR_16_32_NEON_LEMMA_TAC = + ARM_SUBROUTINE_SIM_TAC + 
(bignum_ksqr_16_32_neon_mc,BIGNUM_KSQR_16_32_NEON_EXEC, + 0x0,bignum_ksqr_16_32_neon_mc,BIGNUM_KSQR_16_32_NEON_LEMMA) + [`read X0 s`; `read X1 s`; + `bignum_from_memory (read X1 s,8) s`; + `pc:num`; `read X30 s`];; +(* The tactic just defined, BIGNUM_KSQR_16_32_NEON_LEMMA_TAC, is ARM_SUBROUTINE_SIM_TAC specialised to BIGNUM_KSQR_16_32_NEON_LEMMA, with the lemma variables z, x, a, pc, returnaddress instantiated to X0, X1, the 8-digit bignum at X1, pc and X30 as read in state s. The main proof applies it at steps 5, 47 and 87. *) +(* ------------------------------------------------------------------------- *) +(* Now the main proof. *) +(* ------------------------------------------------------------------------- *) +(* Correctness of the main body between pc + 0x10 and pc + 0x2ac: with arguments z, x, t and a the 16-digit bignum at x, execution reaches pc + 0x2ac with the 32-digit bignum at z equal to a EXP 2. Beyond what MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI allows, it may modify X19-X25, X30 and the z (8 * 32 bytes) and t (8 * 24 bytes) buffers; those two buffers must not overlap each other, the code (2164 bytes at pc) or the input (8 * 16 bytes at x). Entry at pc and the return are covered by BIGNUM_KSQR_16_32_NEON_SUBROUTINE_CORRECT. *) +let BIGNUM_KSQR_16_32_NEON_CORRECT = prove + (`!z x a t pc. + nonoverlapping (z,8 * 32) (t,8 * 24) /\ + ALLPAIRS nonoverlapping + [(z,8 * 32); (t,8 * 24)] + [(word pc,2164); (x,8 * 16)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_ksqr_16_32_neon_mc /\ + read PC s = word(pc + 0x10) /\ + C_ARGUMENTS [z; x; t] s /\ + bignum_from_memory (x,16) s = a) + (\s. read PC s = word (pc + 0x2ac) /\ + bignum_from_memory (z,32) s = a EXP 2) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X30] ,, + MAYCHANGE [memory :> bytes(z,8 * 32); + memory :> bytes(t,8 * 24)])`, + MAP_EVERY X_GEN_TAC + [`z:int64`; `x:int64`; `a:num`; `t:int64`;`pc:num`] THEN + REWRITE_TAC[ALLPAIRS; ALL; PAIRWISE] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; NONOVERLAPPING_CLAUSES; + MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_LDIGITIZE_TAC "x_" `bignum_from_memory (x,16) s0` THEN + + (*** First nested 8x8 squaring block ***) + + ARM_STEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC (1--4) THEN + BIGNUM_KSQR_16_32_NEON_LEMMA_TAC 5 THEN + BIGNUM_LDIGITIZE_TAC "l_" `read (memory :> bytes (z,8 * 16)) s5` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x = y EXP 2`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** Absolute difference computation ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC + [8;9;12;13;16;17;20;21; 
25;27;30;32;35;37;40;42] (6--43) THEN + SUBGOAL_THEN + `&(bignum_from_memory(t,8) s43):real = + abs(&(bignum_of_wordlist [x_0;x_1;x_2;x_3;x_4;x_5;x_6;x_7]) - + &(bignum_of_wordlist [x_8;x_9;x_10;x_11;x_12;x_13;x_14;x_15]))` + MP_TAC THENL + [MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 8`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN CONJ_TAC THEN + MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &x < &y then &y - &x else &x - &y`] THEN + SUBGOAL_THEN + `carry_s21 <=> + bignum_of_wordlist [x_0;x_1;x_2;x_3;x_4;x_5;x_6;x_7] < + bignum_of_wordlist [x_8;x_9;x_10;x_11;x_12;x_13;x_14;x_15]` + (SUBST_ALL_TAC o SYM) THENL + [MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `512` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_UNMASK_64; WORD_XOR_MASK] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + 
DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC]; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV(RAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM)] THEN + + (*** Second nested squaring ***) + + ARM_STEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC (44--46) THEN + BIGNUM_KSQR_16_32_NEON_LEMMA_TAC 47 THEN + BIGNUM_LDIGITIZE_TAC "h_" + `read (memory :> bytes (word_add z (word 128),8 * 16)) s47` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x = y EXP 2`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** Computation of H' = H + L_top ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC + [50;51;55;56;60;61;65;66;69;70;73;74;77;78;81;82] (48--83) THEN + SUBGOAL_THEN + `bignum_from_memory(word_add z (word 128),16) s83 = + bignum_of_wordlist + [h_0;h_1;h_2;h_3;h_4;h_5;h_6;h_7;h_8;h_9;h_10;h_11;h_12;h_13;h_14;h_15] + + bignum_of_wordlist[l_8;l_9;l_10;l_11;l_12;l_13;l_14;l_15]` + MP_TAC THENL + [REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 16`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN FIRST_X_ASSUM(fun th -> + GEN_REWRITE_TAC (LAND_CONV o LAND_CONV) [SYM th]) THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + ASM_REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV 
BIGNUM_LEXPAND_CONV)) THEN + ASM_REWRITE_TAC[] THEN DISCH_TAC] THEN + + (*** Third and final nested squaring ***) + + ARM_STEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC (84--86) THEN + BIGNUM_KSQR_16_32_NEON_LEMMA_TAC 87 THEN + BIGNUM_LDIGITIZE_TAC "m_" + `read (memory :> bytes (word_add t (word 64),8 * 16)) s87` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x = y EXP 2`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** All remaining accumulation of sub-results ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_16_32_NEON_EXEC + [90;91;94;95;98;99;102;103;106;107;110;111;114;115;118;119; + 122;123;126;127;130;131;134;135;138;139;142;143;146;147;150;151; 153; + 156;157;160;161;164;165;168;169] + (88--170) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 32`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN EXPAND_TAC "a" THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + SUBGOAL_THEN + `(&a:real) pow 2 = + (&(bignum_of_wordlist[l_0; l_1; l_2; l_3; l_4; l_5; l_6; l_7]) + + &2 pow 512 * + &(bignum_of_wordlist + [sum_s50; sum_s51; sum_s55; sum_s56; sum_s60; sum_s61; sum_s65; + sum_s66; sum_s69; sum_s70; sum_s73; sum_s74; sum_s77; sum_s78; + sum_s81; sum_s82])) * + (&2 pow 512 + &1) - + &2 pow 512 * + &(bignum_of_wordlist + [m_0; m_1; m_2; m_3; m_4; m_5; m_6; m_7; m_8; m_9; m_10; m_11; m_12; + m_13; m_14; m_15])` + SUBST1_TAC THENL + [ASM_REWRITE_TAC[] THEN REWRITE_TAC[REAL_OF_NUM_CLAUSES; ARITH_RULE + `l + e * (h + m):num = (l + e * m) + e * h`] THEN + 
REWRITE_TAC[GSYM(BIGNUM_OF_WORDLIST_SPLIT_RULE(8,8))] THEN + REPEAT(FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (ARITH_RULE + `x EXP 2 = y ==> y = x EXP 2`))) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (MESON[] + `abs x:real = y ==> y = abs x`)) THEN + REWRITE_TAC[REAL_POW2_ABS] THEN + EXPAND_TAC "a" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + REAL_ARITH_TAC; + ALL_TAC] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_LEXPAND_CONV) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_UNMASK_64; REAL_VAL_WORD_MASK; DIMINDEX_64; + COND_SWAP; GSYM WORD_BITVAL; VAL_WORD_BITVAL] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; +(* Full subroutine correctness, from entry at pc to the return address held in X30: given a stackpointer satisfying aligned 16 and a 64-byte region just below it that does not overlap z, t, the code or the input, the 32-digit bignum at z ends up equal to a EXP 2, changing only ABI-permitted registers and flags plus the z, t and 64-byte stack regions. Obtained from BIGNUM_KSQR_16_32_NEON_CORRECT via ARM_ADD_RETURN_STACK_TAC with the register list X19-X25, X30 and the constant 64 (presumably the registers saved on the stack and the frame size; confirm against the tactic definition). *) +let BIGNUM_KSQR_16_32_NEON_SUBROUTINE_CORRECT = prove + (`!z x a t pc stackpointer returnaddress. + aligned 16 stackpointer /\ + PAIRWISE nonoverlapping + [(z,8 * 32); (t,8 * 24); (word_sub stackpointer (word 64),64)] /\ + ALLPAIRS nonoverlapping + [(z,8 * 32); (t,8 * 24); (word_sub stackpointer (word 64),64)] + [(word pc,2164); (x,8 * 16)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_ksqr_16_32_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; t] s /\ + bignum_from_memory (x,16) s = a) + (\s. 
read PC s = returnaddress /\ + bignum_from_memory (z,32) s = a EXP 2) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(z,8 * 32); + memory :> bytes(t,8 * 24); + memory :> bytes(word_sub stackpointer (word 64),64)])`, + ARM_ADD_RETURN_STACK_TAC + BIGNUM_KSQR_16_32_NEON_EXEC BIGNUM_KSQR_16_32_NEON_CORRECT + `[X19;X20;X21;X22;X23;X24;X25;X30]` 64);; diff --git a/arm/proofs/bignum_ksqr_32_64_neon.ml b/arm/proofs/bignum_ksqr_32_64_neon.ml new file mode 100644 index 00000000..6901b462 --- /dev/null +++ b/arm/proofs/bignum_ksqr_32_64_neon.ml @@ -0,0 +1,1640 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC + *) + +(* ========================================================================= *) +(* Karatsuba (multi-level) 32x32->64 squaring. *) +(* ========================================================================= *) + +(**** print_literal_from_elf "arm/fastmul/bignum_ksqr_32_64_neon.o";; + ****) + +let bignum_ksqr_32_64_neon_mc = + define_assert_from_elf "bignum_ksqr_32_64_neon_mc" "arm/fastmul/bignum_ksqr_32_64_neon.o" +[ + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf7bf5; (* arm_STP X21 X30 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xaa0003f3; (* arm_MOV X19 X0 *) + 0xaa0103f4; (* arm_MOV X20 X1 *) + 0xaa0203f5; (* arm_MOV X21 X2 *) + 0x94000161; (* arm_BL (word 1412) *) + 0x91040260; (* arm_ADD X0 X19 (rvalue (word 256)) *) + 0x91020281; (* arm_ADD X1 X20 (rvalue (word 128)) *) + 0xaa1503e2; (* arm_MOV X2 X21 *) + 0x9400015d; (* arm_BL (word 1396) *) + 0xa9500660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9480e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&128))) *) + 0xab020000; (* arm_ADDS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9100660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9510660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 
0xa9490e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&144))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9110660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9520660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa94a0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&160))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9120660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9530660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa94b0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&176))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9130660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9540660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa94c0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&192))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9140660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9550660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa94d0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&208))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9150660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9560660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa94e0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&224))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9160660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa9570660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa94f0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&240))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9170660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa9580660; (* arm_LDP X0 
X1 X19 (Immediate_Offset (iword (&384))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa9180660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&384))) *) + 0xa9590660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&400))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa9190660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&400))) *) + 0xa95a0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&416))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91a0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&416))) *) + 0xa95b0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&432))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91b0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&432))) *) + 0xa95c0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&448))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91c0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&448))) *) + 0xa95d0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&464))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91d0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&464))) *) + 0xa95e0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&480))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa91e0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&480))) *) + 0xa95f0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&496))) *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0x9a1f0021; (* arm_ADC X1 X1 XZR *) + 0xa91f0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&496))) *) + 0xa9480680; (* arm_LDP X0 X1 X20 (Immediate_Offset (iword (&128))) *) + 0xa9404690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&0))) *) + 0xeb100000; (* arm_SUBS X0 X0 X16 *) + 0xfa110021; (* arm_SBCS X1 X1 X17 *) + 0xa9490e82; (* arm_LDP X2 X3 
X20 (Immediate_Offset (iword (&144))) *) + 0xa9414690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&16))) *) + 0xfa100042; (* arm_SBCS X2 X2 X16 *) + 0xfa110063; (* arm_SBCS X3 X3 X17 *) + 0xa94a1684; (* arm_LDP X4 X5 X20 (Immediate_Offset (iword (&160))) *) + 0xa9424690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&32))) *) + 0xfa100084; (* arm_SBCS X4 X4 X16 *) + 0xfa1100a5; (* arm_SBCS X5 X5 X17 *) + 0xa94b1e86; (* arm_LDP X6 X7 X20 (Immediate_Offset (iword (&176))) *) + 0xa9434690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&48))) *) + 0xfa1000c6; (* arm_SBCS X6 X6 X16 *) + 0xfa1100e7; (* arm_SBCS X7 X7 X17 *) + 0xa94c2688; (* arm_LDP X8 X9 X20 (Immediate_Offset (iword (&192))) *) + 0xa9444690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&64))) *) + 0xfa100108; (* arm_SBCS X8 X8 X16 *) + 0xfa110129; (* arm_SBCS X9 X9 X17 *) + 0xa94d2e8a; (* arm_LDP X10 X11 X20 (Immediate_Offset (iword (&208))) *) + 0xa9454690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&80))) *) + 0xfa10014a; (* arm_SBCS X10 X10 X16 *) + 0xfa11016b; (* arm_SBCS X11 X11 X17 *) + 0xa94e368c; (* arm_LDP X12 X13 X20 (Immediate_Offset (iword (&224))) *) + 0xa9464690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&96))) *) + 0xfa10018c; (* arm_SBCS X12 X12 X16 *) + 0xfa1101ad; (* arm_SBCS X13 X13 X17 *) + 0xa94f3e8e; (* arm_LDP X14 X15 X20 (Immediate_Offset (iword (&240))) *) + 0xa9474690; (* arm_LDP X16 X17 X20 (Immediate_Offset (iword (&112))) *) + 0xfa1001ce; (* arm_SBCS X14 X14 X16 *) + 0xfa1101ef; (* arm_SBCS X15 X15 X17 *) + 0xda1f03f0; (* arm_NGC X16 XZR *) + 0xab10021f; (* arm_CMN X16 X16 *) + 0xca100000; (* arm_EOR X0 X0 X16 *) + 0xba1f0000; (* arm_ADCS X0 X0 XZR *) + 0xca100021; (* arm_EOR X1 X1 X16 *) + 0xba1f0021; (* arm_ADCS X1 X1 XZR *) + 0xa90006a0; (* arm_STP X0 X1 X21 (Immediate_Offset (iword (&0))) *) + 0xca100042; (* arm_EOR X2 X2 X16 *) + 0xba1f0042; (* arm_ADCS X2 X2 XZR *) + 0xca100063; (* arm_EOR X3 X3 X16 *) + 0xba1f0063; (* arm_ADCS X3 X3 XZR *) 
+ 0xa9010ea2; (* arm_STP X2 X3 X21 (Immediate_Offset (iword (&16))) *) + 0xca100084; (* arm_EOR X4 X4 X16 *) + 0xba1f0084; (* arm_ADCS X4 X4 XZR *) + 0xca1000a5; (* arm_EOR X5 X5 X16 *) + 0xba1f00a5; (* arm_ADCS X5 X5 XZR *) + 0xa90216a4; (* arm_STP X4 X5 X21 (Immediate_Offset (iword (&32))) *) + 0xca1000c6; (* arm_EOR X6 X6 X16 *) + 0xba1f00c6; (* arm_ADCS X6 X6 XZR *) + 0xca1000e7; (* arm_EOR X7 X7 X16 *) + 0xba1f00e7; (* arm_ADCS X7 X7 XZR *) + 0xa9031ea6; (* arm_STP X6 X7 X21 (Immediate_Offset (iword (&48))) *) + 0xca100108; (* arm_EOR X8 X8 X16 *) + 0xba1f0108; (* arm_ADCS X8 X8 XZR *) + 0xca100129; (* arm_EOR X9 X9 X16 *) + 0xba1f0129; (* arm_ADCS X9 X9 XZR *) + 0xa90426a8; (* arm_STP X8 X9 X21 (Immediate_Offset (iword (&64))) *) + 0xca10014a; (* arm_EOR X10 X10 X16 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xca10016b; (* arm_EOR X11 X11 X16 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa9052eaa; (* arm_STP X10 X11 X21 (Immediate_Offset (iword (&80))) *) + 0xca10018c; (* arm_EOR X12 X12 X16 *) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1001ad; (* arm_EOR X13 X13 X16 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa90636ac; (* arm_STP X12 X13 X21 (Immediate_Offset (iword (&96))) *) + 0xca1001ce; (* arm_EOR X14 X14 X16 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1001ef; (* arm_EOR X15 X15 X16 *) + 0x9a1f01ef; (* arm_ADC X15 X15 XZR *) + 0xa9073eae; (* arm_STP X14 X15 X21 (Immediate_Offset (iword (&112))) *) + 0x910202a0; (* arm_ADD X0 X21 (rvalue (word 128)) *) + 0xaa1503e1; (* arm_MOV X1 X21 *) + 0x910602a2; (* arm_ADD X2 X21 (rvalue (word 384)) *) + 0x940000c7; (* arm_BL (word 796) *) + 0xa9500660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9400e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&0))) *) + 0xab020000; (* arm_ADDS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9080660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&128))) *) + 0xa9510660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 
0xa9410e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&16))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9090660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&144))) *) + 0xa9520660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9420e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&32))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90a0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&160))) *) + 0xa9530660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9430e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&48))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90b0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&176))) *) + 0xa9540660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9440e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&64))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90c0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&192))) *) + 0xa9550660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9450e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&80))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90d0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&208))) *) + 0xa9560660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa9460e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&96))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90e0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&224))) *) + 0xa9570660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa9470e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&112))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa90f0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&240))) *) + 0xa9500660; (* arm_LDP X0 X1 X19 
(Immediate_Offset (iword (&256))) *) + 0xa9580e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&384))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9100660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9510660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9590e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&400))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9110660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9520660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa95a0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&416))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9120660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9530660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa95b0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&432))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9130660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9540660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa95c0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&448))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9140660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9550660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa95d0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&464))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9150660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9560660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa95e0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&480))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9160660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword 
(&352))) *) + 0xa9570660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa95f0e62; (* arm_LDP X2 X3 X19 (Immediate_Offset (iword (&496))) *) + 0xba020000; (* arm_ADCS X0 X0 X2 *) + 0xba030021; (* arm_ADCS X1 X1 X3 *) + 0xa9170660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0x9a9f37f4; (* arm_CSET X20 Condition_CS *) + 0xa9480660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&128))) *) + 0xa9480ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&128))) *) + 0xeb020000; (* arm_SUBS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9080660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&128))) *) + 0xa9490660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&144))) *) + 0xa9490ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&144))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9090660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&144))) *) + 0xa94a0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&160))) *) + 0xa94a0ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&160))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa90a0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&160))) *) + 0xa94b0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&176))) *) + 0xa94b0ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&176))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa90b0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&176))) *) + 0xa94c0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&192))) *) + 0xa94c0ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&192))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa90c0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&192))) *) + 0xa94d0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&208))) *) + 0xa94d0ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&208))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 
0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa90d0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&208))) *) + 0xa94e0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&224))) *) + 0xa94e0ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&224))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa90e0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&224))) *) + 0xa94f0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&240))) *) + 0xa94f0ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&240))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa90f0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&240))) *) + 0xa9500660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9500ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&256))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9100660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&256))) *) + 0xa9510660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9510ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&272))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9110660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&272))) *) + 0xa9520660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9520ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&288))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9120660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&288))) *) + 0xa9530660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9530ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&304))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9130660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&304))) *) + 0xa9540660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9540ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword 
(&320))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9140660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&320))) *) + 0xa9550660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9550ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&336))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9150660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&336))) *) + 0xa9560660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa9560ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&352))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9160660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&352))) *) + 0xa9570660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xa9570ea2; (* arm_LDP X2 X3 X21 (Immediate_Offset (iword (&368))) *) + 0xfa020000; (* arm_SBCS X0 X0 X2 *) + 0xfa030021; (* arm_SBCS X1 X1 X3 *) + 0xa9170660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&368))) *) + 0xfa1f0294; (* arm_SBCS X20 X20 XZR *) + 0xda9f23f0; (* arm_CSETM X16 Condition_CC *) + 0xa9580660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&384))) *) + 0xab140000; (* arm_ADDS X0 X0 X20 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa9180660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&384))) *) + 0xa9590660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&400))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa9190660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&400))) *) + 0xa95a0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&416))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91a0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&416))) *) + 0xa95b0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&432))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91b0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword 
(&432))) *) + 0xa95c0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&448))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91c0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&448))) *) + 0xa95d0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&464))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91d0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&464))) *) + 0xa95e0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&480))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0xba100021; (* arm_ADCS X1 X1 X16 *) + 0xa91e0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&480))) *) + 0xa95f0660; (* arm_LDP X0 X1 X19 (Immediate_Offset (iword (&496))) *) + 0xba100000; (* arm_ADCS X0 X0 X16 *) + 0x9a100021; (* arm_ADC X1 X1 X16 *) + 0xa91f0660; (* arm_STP X0 X1 X19 (Immediate_Offset (iword (&496))) *) + 0xa8c17bf5; (* arm_LDP X21 X30 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0; (* arm_RET X30 *) + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf5bf5; (* arm_STP X21 X22 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf63f7; (* arm_STP X23 X24 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf7bf9; (* arm_STP X25 X30 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xaa0003f7; (* arm_MOV X23 X0 *) + 0xaa0103f8; (* arm_MOV X24 X1 *) + 0xaa0203f9; (* arm_MOV X25 X2 *) + 0x940000a9; (* arm_BL (word 676) *) + 0xa9402f0a; (* arm_LDP X10 X11 X24 (Immediate_Offset (iword (&0))) *) + 0xa9442708; (* arm_LDP X8 X9 X24 (Immediate_Offset (iword (&64))) *) + 0xeb08014a; (* arm_SUBS X10 X10 X8 *) + 0xfa09016b; (* arm_SBCS X11 X11 X9 *) + 0xa941370c; (* arm_LDP X12 X13 X24 (Immediate_Offset (iword (&16))) *) + 0xa9452708; (* arm_LDP X8 X9 X24 (Immediate_Offset (iword (&80))) *) + 0xfa08018c; (* arm_SBCS X12 X12 X8 *) + 0xfa0901ad; (* arm_SBCS X13 X13 X9 *) + 0xa9423f0e; 
(* arm_LDP X14 X15 X24 (Immediate_Offset (iword (&32))) *) + 0xa9462708; (* arm_LDP X8 X9 X24 (Immediate_Offset (iword (&96))) *) + 0xfa0801ce; (* arm_SBCS X14 X14 X8 *) + 0xfa0901ef; (* arm_SBCS X15 X15 X9 *) + 0xa9434710; (* arm_LDP X16 X17 X24 (Immediate_Offset (iword (&48))) *) + 0xa9472708; (* arm_LDP X8 X9 X24 (Immediate_Offset (iword (&112))) *) + 0xfa080210; (* arm_SBCS X16 X16 X8 *) + 0xfa090231; (* arm_SBCS X17 X17 X9 *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xab13027f; (* arm_CMN X19 X19 *) + 0xca13014a; (* arm_EOR X10 X10 X19 *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xca13016b; (* arm_EOR X11 X11 X19 *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa9002f2a; (* arm_STP X10 X11 X25 (Immediate_Offset (iword (&0))) *) + 0xca13018c; (* arm_EOR X12 X12 X19 *) + 0xba1f018c; (* arm_ADCS X12 X12 XZR *) + 0xca1301ad; (* arm_EOR X13 X13 X19 *) + 0xba1f01ad; (* arm_ADCS X13 X13 XZR *) + 0xa901372c; (* arm_STP X12 X13 X25 (Immediate_Offset (iword (&16))) *) + 0xca1301ce; (* arm_EOR X14 X14 X19 *) + 0xba1f01ce; (* arm_ADCS X14 X14 XZR *) + 0xca1301ef; (* arm_EOR X15 X15 X19 *) + 0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xa9023f2e; (* arm_STP X14 X15 X25 (Immediate_Offset (iword (&32))) *) + 0xca130210; (* arm_EOR X16 X16 X19 *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xca130231; (* arm_EOR X17 X17 X19 *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0xa9034730; (* arm_STP X16 X17 X25 (Immediate_Offset (iword (&48))) *) + 0x910202e0; (* arm_ADD X0 X23 (rvalue (word 128)) *) + 0x91010301; (* arm_ADD X1 X24 (rvalue (word 64)) *) + 0x94000080; (* arm_BL (word 512) *) + 0xa9482eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&128))) *) + 0xa94436ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&64))) *) + 0xab0c014a; (* arm_ADDS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa9082eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&128))) *) + 0xa9492eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&144))) *) + 0xa94536ec; (* 
arm_LDP X12 X13 X23 (Immediate_Offset (iword (&80))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa9092eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&144))) *) + 0xa94a2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&160))) *) + 0xa94636ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&96))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa90a2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&160))) *) + 0xa94b2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&176))) *) + 0xa94736ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&112))) *) + 0xba0c014a; (* arm_ADCS X10 X10 X12 *) + 0xba0d016b; (* arm_ADCS X11 X11 X13 *) + 0xa90b2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&176))) *) + 0xa94c2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&192))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90c2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&192))) *) + 0xa94d2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&208))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90d2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&208))) *) + 0xa94e2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&224))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90e2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&224))) *) + 0xa94f2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&240))) *) + 0xba1f014a; (* arm_ADCS X10 X10 XZR *) + 0xba1f016b; (* arm_ADCS X11 X11 XZR *) + 0xa90f2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&240))) *) + 0x91010320; (* arm_ADD X0 X25 (rvalue (word 64)) *) + 0xaa1903e1; (* arm_MOV X1 X25 *) + 0x94000059; (* arm_BL (word 356) *) + 0xa94006e0; (* arm_LDP X0 X1 X23 (Immediate_Offset (iword (&0))) *) + 0xa94846f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword 
(&128))) *) + 0xab100000; (* arm_ADDS X0 X0 X16 *) + 0xba110021; (* arm_ADCS X1 X1 X17 *) + 0xa9410ee2; (* arm_LDP X2 X3 X23 (Immediate_Offset (iword (&16))) *) + 0xa94946f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&144))) *) + 0xba100042; (* arm_ADCS X2 X2 X16 *) + 0xba110063; (* arm_ADCS X3 X3 X17 *) + 0xa94216e4; (* arm_LDP X4 X5 X23 (Immediate_Offset (iword (&32))) *) + 0xa94a46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&160))) *) + 0xba100084; (* arm_ADCS X4 X4 X16 *) + 0xba1100a5; (* arm_ADCS X5 X5 X17 *) + 0xa9431ee6; (* arm_LDP X6 X7 X23 (Immediate_Offset (iword (&48))) *) + 0xa94b46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&176))) *) + 0xba1000c6; (* arm_ADCS X6 X6 X16 *) + 0xba1100e7; (* arm_ADCS X7 X7 X17 *) + 0xa94826e8; (* arm_LDP X8 X9 X23 (Immediate_Offset (iword (&128))) *) + 0xa94c46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&192))) *) + 0xba100108; (* arm_ADCS X8 X8 X16 *) + 0xba110129; (* arm_ADCS X9 X9 X17 *) + 0xa9492eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&144))) *) + 0xa94d46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&208))) *) + 0xba10014a; (* arm_ADCS X10 X10 X16 *) + 0xba11016b; (* arm_ADCS X11 X11 X17 *) + 0xa94a36ec; (* arm_LDP X12 X13 X23 (Immediate_Offset (iword (&160))) *) + 0xa94e46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&224))) *) + 0xba10018c; (* arm_ADCS X12 X12 X16 *) + 0xba1101ad; (* arm_ADCS X13 X13 X17 *) + 0xa94b3eee; (* arm_LDP X14 X15 X23 (Immediate_Offset (iword (&176))) *) + 0xa94f46f0; (* arm_LDP X16 X17 X23 (Immediate_Offset (iword (&240))) *) + 0xba1001ce; (* arm_ADCS X14 X14 X16 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0x9a9f37f8; (* arm_CSET X24 Condition_CS *) + 0xa9444730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&64))) *) + 0xeb100000; (* arm_SUBS X0 X0 X16 *) + 0xfa110021; (* arm_SBCS X1 X1 X17 *) + 0xa90406e0; (* arm_STP X0 X1 X23 (Immediate_Offset (iword (&64))) *) + 0xa9454730; (* arm_LDP X16 X17 X25 (Immediate_Offset 
(iword (&80))) *) + 0xfa100042; (* arm_SBCS X2 X2 X16 *) + 0xfa110063; (* arm_SBCS X3 X3 X17 *) + 0xa9050ee2; (* arm_STP X2 X3 X23 (Immediate_Offset (iword (&80))) *) + 0xa9464730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&96))) *) + 0xfa100084; (* arm_SBCS X4 X4 X16 *) + 0xfa1100a5; (* arm_SBCS X5 X5 X17 *) + 0xa90616e4; (* arm_STP X4 X5 X23 (Immediate_Offset (iword (&96))) *) + 0xa9474730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&112))) *) + 0xfa1000c6; (* arm_SBCS X6 X6 X16 *) + 0xfa1100e7; (* arm_SBCS X7 X7 X17 *) + 0xa9071ee6; (* arm_STP X6 X7 X23 (Immediate_Offset (iword (&112))) *) + 0xa9484730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&128))) *) + 0xfa100108; (* arm_SBCS X8 X8 X16 *) + 0xfa110129; (* arm_SBCS X9 X9 X17 *) + 0xa90826e8; (* arm_STP X8 X9 X23 (Immediate_Offset (iword (&128))) *) + 0xa9494730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&144))) *) + 0xfa10014a; (* arm_SBCS X10 X10 X16 *) + 0xfa11016b; (* arm_SBCS X11 X11 X17 *) + 0xa9092eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&144))) *) + 0xa94a4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&160))) *) + 0xfa10018c; (* arm_SBCS X12 X12 X16 *) + 0xfa1101ad; (* arm_SBCS X13 X13 X17 *) + 0xa90a36ec; (* arm_STP X12 X13 X23 (Immediate_Offset (iword (&160))) *) + 0xa94b4730; (* arm_LDP X16 X17 X25 (Immediate_Offset (iword (&176))) *) + 0xfa1001ce; (* arm_SBCS X14 X14 X16 *) + 0xfa1101ef; (* arm_SBCS X15 X15 X17 *) + 0xa90b3eee; (* arm_STP X14 X15 X23 (Immediate_Offset (iword (&176))) *) + 0xfa1f0318; (* arm_SBCS X24 X24 XZR *) + 0xda9f23f9; (* arm_CSETM X25 Condition_CC *) + 0xa94c2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&192))) *) + 0xab18014a; (* arm_ADDS X10 X10 X24 *) + 0xba19016b; (* arm_ADCS X11 X11 X25 *) + 0xa90c2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&192))) *) + 0xa94d2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&208))) *) + 0xba19014a; (* arm_ADCS X10 X10 X25 *) + 0xba19016b; (* arm_ADCS X11 X11 X25 
*) + 0xa90d2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&208))) *) + 0xa94e2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&224))) *) + 0xba19014a; (* arm_ADCS X10 X10 X25 *) + 0xba19016b; (* arm_ADCS X11 X11 X25 *) + 0xa90e2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&224))) *) + 0xa94f2eea; (* arm_LDP X10 X11 X23 (Immediate_Offset (iword (&240))) *) + 0xba19014a; (* arm_ADCS X10 X10 X25 *) + 0xba19016b; (* arm_ADCS X11 X11 X25 *) + 0xa90f2eea; (* arm_STP X10 X11 X23 (Immediate_Offset (iword (&240))) *) + 0xa8c17bf9; (* arm_LDP X25 X30 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c163f7; (* arm_LDP X23 X24 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c15bf5; (* arm_LDP X21 X22 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0; (* arm_RET X30 *) + 0xa9400c22; (* arm_LDP X2 X3 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc00034; (* arm_LDR Q20 X1 (Immediate_Offset (word 0)) *) + 0xa9411424; (* arm_LDP X4 X5 X1 (Immediate_Offset (iword (&16))) *) + 0x3dc00435; (* arm_LDR Q21 X1 (Immediate_Offset (word 16)) *) + 0xa9421c26; (* arm_LDP X6 X7 X1 (Immediate_Offset (iword (&32))) *) + 0x3dc00836; (* arm_LDR Q22 X1 (Immediate_Offset (word 32)) *) + 0xa9432428; (* arm_LDP X8 X9 X1 (Immediate_Offset (iword (&48))) *) + 0x3dc00c37; (* arm_LDR Q23 X1 (Immediate_Offset (word 48)) *) + 0x6f00e5fe; (* arm_MOVI Q30 (word 4294967295) *) + 0x9b047c51; (* arm_MUL X17 X2 X4 *) + 0x9b057c6e; (* arm_MUL X14 X3 X5 *) + 0x6e144281; (* arm_EXT Q1 Q20 Q20 64 *) + 0x9bc47c54; (* arm_UMULH X20 X2 X4 *) + 0x0f208682; (* arm_SHRN Q2 Q20 32 32 *) + 0xeb030055; (* arm_SUBS X21 X2 X3 *) + 0x0e813a80; (* arm_ZIP1 Q0 Q20 Q1 32 64 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x2ea2c045; (* arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0xda9f23eb; (* arm_CSETM X11 Condition_CC *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xeb0400ac; (* arm_SUBS X12 X5 X4 *) + 0x2ea0c003; (* 
arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0xda8c258c; (* arm_CNEG X12 X12 Condition_CC *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9b0c7ead; (* arm_MUL X13 X21 X12 *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0x9bcc7eac; (* arm_UMULH X12 X21 X12 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xda8b216b; (* arm_CINV X11 X11 Condition_CC *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xca0b01ad; (* arm_EOR X13 X13 X11 *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0xca0b018c; (* arm_EOR X12 X12 X11 *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xab140233; (* arm_ADDS X19 X17 X20 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0x9a1f0294; (* arm_ADC X20 X20 XZR *) + 0x6e1542a1; (* arm_EXT Q1 Q21 Q21 64 *) + 0x9bc57c75; (* arm_UMULH X21 X3 X5 *) + 0x0f2086a2; (* arm_SHRN Q2 Q21 32 32 *) + 0xab0e0273; (* arm_ADDS X19 X19 X14 *) + 0x0e813aa0; (* arm_ZIP1 Q0 Q21 Q1 32 64 *) + 0xba150294; (* arm_ADCS X20 X20 X21 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xab0e0294; (* arm_ADDS X20 X20 X14 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xb100057f; (* arm_CMN X11 (rvalue (word 1)) *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0c0294; (* arm_ADCS X20 X20 X12 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0x9a0b02b5; (* arm_ADC X21 X21 X11 *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0xab110231; (* arm_ADDS X17 X17 X17 *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xba130273; (* arm_ADCS X19 X19 X19 *) + 0x2ea2c045; (* arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0xba140294; (* arm_ADCS X20 X20 X20 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xba1502b5; (* arm_ADCS X21 X21 X21 *) + 0x2ea0c003; (* arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0x9a1f03ea; (* arm_ADC X10 XZR XZR *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9b037c4f; (* arm_MUL X15 X2 X3 *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0x9bc37c50; (* arm_UMULH X16 X2 X3 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 
*) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa9002c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&0))) *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xab0d0231; (* arm_ADDS X17 X17 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0xba1f0294; (* arm_ADCS X20 X20 XZR *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0xba1f02b5; (* arm_ADCS X21 X21 XZR *) + 0x6e1642c1; (* arm_EXT Q1 Q22 Q22 64 *) + 0x9a1f014a; (* arm_ADC X10 X10 XZR *) + 0x0f2086c2; (* arm_SHRN Q2 Q22 32 32 *) + 0xa9014c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&16))) *) + 0x0e813ac0; (* arm_ZIP1 Q0 Q22 Q1 32 64 *) + 0x9b057c8f; (* arm_MUL X15 X4 X5 *) + 0x2ea2c045; (* arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0x9bc57c90; (* arm_UMULH X16 X4 X5 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x2ea0c003; (* arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0xa9022c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&32))) *) + 0x6e1742e1; (* arm_EXT 
Q1 Q23 Q23 64 *) + 0xba0a01ad; (* arm_ADCS X13 X13 X10 *) + 0x0f2086e2; (* arm_SHRN Q2 Q23 32 32 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x0e813ae0; (* arm_ZIP1 Q0 Q23 Q1 32 64 *) + 0xa903380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&48))) *) + 0x9b087cd1; (* arm_MUL X17 X6 X8 *) + 0x2ea2c050; (* arm_UMULL_VEC Q16 Q2 Q2 32 *) + 0x9b097cee; (* arm_MUL X14 X7 X9 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0x9bc87cd4; (* arm_UMULH X20 X6 X8 *) + 0x2ea0c012; (* arm_UMULL_VEC Q18 Q0 Q0 32 *) + 0xeb0700d5; (* arm_SUBS X21 X6 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0xda9f23eb; (* arm_CSETM X11 Condition_CC *) + 0xeb08012c; (* arm_SUBS X12 X9 X8 *) + 0xda8c258c; (* arm_CNEG X12 X12 Condition_CC *) + 0x6f601641; (* arm_USRA_VEC Q1 Q18 32 64 128 *) + 0x9b0c7ead; (* arm_MUL X13 X21 X12 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0x9bcc7eac; (* arm_UMULH X12 X21 X12 *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xda8b216b; (* arm_CINV X11 X11 Condition_CC *) + 0xca0b01ad; (* arm_EOR X13 X13 X11 *) + 0xca0b018c; (* arm_EOR X12 X12 X11 *) + 0x6f601490; (* arm_USRA_VEC Q16 Q4 32 64 128 *) + 0xab140233; (* arm_ADDS X19 X17 X20 *) + 0x9a1f0294; (* arm_ADC X20 X20 XZR *) + 0x6f605492; (* arm_SLI_VEC Q18 Q4 32 64 *) + 0x9bc97cf5; (* arm_UMULH X21 X7 X9 *) + 0xab0e0273; (* arm_ADDS X19 X19 X14 *) + 0xba150294; (* arm_ADCS X20 X20 X21 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xab0e0294; (* arm_ADDS X20 X20 X14 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xb100057f; (* arm_CMN X11 (rvalue (word 1)) *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0c0294; (* arm_ADCS X20 X20 X12 *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0x9a0b02b5; (* arm_ADC X21 X21 X11 *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xab110231; (* arm_ADDS X17 X17 X17 *) + 0xba130273; (* arm_ADCS X19 X19 X19 *) + 0x6f601430; (* arm_USRA_VEC 
Q16 Q1 32 64 128 *) + 0xba140294; (* arm_ADCS X20 X20 X20 *) + 0xba1502b5; (* arm_ADCS X21 X21 X21 *) + 0x9a1f03ea; (* arm_ADC X10 XZR XZR *) + 0x4e975ab1; (* arm_UZP2 Q17 Q21 Q23 32 *) + 0x9b077ccf; (* arm_MUL X15 X6 X7 *) + 0x0ea12ae4; (* arm_XTN Q4 Q23 32 *) + 0x9bc77cd0; (* arm_UMULH X16 X6 X7 *) + 0x4e083e16; (* arm_UMOV X22 Q16 0 8 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x0ea12aa5; (* arm_XTN Q5 Q21 32 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x4ea00aa1; (* arm_REV64_VEC Q1 Q21 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa9042c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&64))) *) + 0xab0d0231; (* arm_ADDS X17 X17 X13 *) + 0x4e183e4d; (* arm_UMOV X13 Q18 1 8 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0x4e183e0e; (* arm_UMOV X14 Q16 1 8 *) + 0xba1f0294; (* arm_ADCS X20 X20 XZR *) + 0x4e083e4c; (* arm_UMOV X12 Q18 0 8 *) + 0xba1f02b5; (* arm_ADCS X21 X21 XZR *) + 0x9a1f014a; (* arm_ADC X10 X10 XZR *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0xa9054c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&80))) *) + 0x2eb1c087; (* arm_UMULL_VEC Q7 Q4 Q17 32 *) + 0x9b097d0f; (* arm_MUL X15 X8 X9 *) + 0x4e975af0; (* arm_UZP2 Q16 Q23 Q23 32 *) + 0x9bc97d10; (* arm_UMULH X16 X8 X9 *) + 0x4eb79c20; (* arm_MUL_VEC Q0 Q1 Q23 32 *) + 0xab0f02cb; (* arm_ADDS X11 X22 X15 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x6f6014c7; (* arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x2eb1c201; (* arm_UMULL_VEC Q1 Q16 Q17 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0x4e3e1ce2; (* arm_AND_VEC Q2 Q7 Q30 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) 
+ 0x6f6014e1; (* arm_USRA_VEC Q1 Q7 32 64 128 *) + 0x2ea58080; (* arm_UMLAL_VEC Q0 Q4 Q5 32 *) + 0x4e183c10; (* arm_UMOV X16 Q0 1 8 *) + 0x4e083c0f; (* arm_UMOV X15 Q0 0 8 *) + 0x6f601441; (* arm_USRA_VEC Q1 Q2 32 64 128 *) + 0x4e083c34; (* arm_UMOV X20 Q1 0 8 *) + 0x4e183c35; (* arm_UMOV X21 Q1 1 8 *) + 0xa9062c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&96))) *) + 0xba0a01ad; (* arm_ADCS X13 X13 X10 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa907380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&112))) *) + 0x9b067c4a; (* arm_MUL X10 X2 X6 *) + 0x9b077c6e; (* arm_MUL X14 X3 X7 *) + 0x9bc67c51; (* arm_UMULH X17 X2 X6 *) + 0xab1101ce; (* arm_ADDS X14 X14 X17 *) + 0x9bc77c71; (* arm_UMULH X17 X3 X7 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0x9a1f02b1; (* arm_ADC X17 X21 XZR *) + 0xab0a01cb; (* arm_ADDS X11 X14 X10 *) + 0xba0e01ee; (* arm_ADCS X14 X15 X14 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0x9a1103f1; (* arm_ADC X17 XZR X17 *) + 0xab0a01cc; (* arm_ADDS X12 X14 X10 *) + 0xba0b01ed; (* arm_ADCS X13 X15 X11 *) + 0xba0e020e; (* arm_ADCS X14 X16 X14 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba1003f0; (* arm_ADCS X16 XZR X16 *) + 0x9a1103f1; (* arm_ADC X17 XZR X17 *) + 0xeb050096; (* arm_SUBS X22 X4 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb080134; (* arm_SUBS X20 X9 X8 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb030056; (* arm_SUBS X22 X2 X3 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 
0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb0600f4; (* arm_SUBS X20 X7 X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba14018c; (* arm_ADCS X12 X12 X20 *) + 0xba1301ad; (* arm_ADCS X13 X13 X19 *) + 0xba1301ce; (* arm_ADCS X14 X14 X19 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb050076; (* arm_SUBS X22 X3 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb070134; (* arm_SUBS X20 X9 X7 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb040056; (* arm_SUBS X22 X2 X4 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb060114; (* arm_SUBS X20 X8 X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba15018c; (* arm_ADCS X12 X12 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ad; (* arm_ADCS X13 X13 X20 *) + 0xba1301ce; (* arm_ADCS X14 X14 X19 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* 
arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb050056; (* arm_SUBS X22 X2 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb060134; (* arm_SUBS X20 X9 X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb040076; (* arm_SUBS X22 X3 X4 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb070114; (* arm_SUBS X20 X8 X7 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xab0a014a; (* arm_ADDS X10 X10 X10 *) + 0xba0b016b; (* arm_ADCS X11 X11 X11 *) + 0xba0c018c; (* arm_ADCS X12 X12 X12 *) + 0xba0d01ad; (* arm_ADCS X13 X13 X13 *) + 0xba0e01ce; (* arm_ADCS X14 X14 X14 *) + 0xba0f01ef; (* arm_ADCS X15 X15 X15 *) + 0xba100210; (* arm_ADCS X16 X16 X16 *) + 0xba110231; (* arm_ADCS X17 X17 X17 *) + 0x9a1f03f3; (* arm_ADC X19 XZR XZR *) + 0xa9420c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&32))) *) + 0xab02014a; (* arm_ADDS X10 X10 X2 *) + 0xba03016b; (* arm_ADCS X11 X11 X3 *) + 0xa9022c0a; (* arm_STP X10 X11 X0 
(Immediate_Offset (iword (&32))) *) + 0xa9430c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&48))) *) + 0xba02018c; (* arm_ADCS X12 X12 X2 *) + 0xba0301ad; (* arm_ADCS X13 X13 X3 *) + 0xa903340c; (* arm_STP X12 X13 X0 (Immediate_Offset (iword (&48))) *) + 0xa9440c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&64))) *) + 0xba0201ce; (* arm_ADCS X14 X14 X2 *) + 0xba0301ef; (* arm_ADCS X15 X15 X3 *) + 0xa9043c0e; (* arm_STP X14 X15 X0 (Immediate_Offset (iword (&64))) *) + 0xa9450c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&80))) *) + 0xba020210; (* arm_ADCS X16 X16 X2 *) + 0xba030231; (* arm_ADCS X17 X17 X3 *) + 0xa9054410; (* arm_STP X16 X17 X0 (Immediate_Offset (iword (&80))) *) + 0xa9460c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&96))) *) + 0xba130042; (* arm_ADCS X2 X2 X19 *) + 0xba1f0063; (* arm_ADCS X3 X3 XZR *) + 0xa9060c02; (* arm_STP X2 X3 X0 (Immediate_Offset (iword (&96))) *) + 0xa9470c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&112))) *) + 0xba1f0042; (* arm_ADCS X2 X2 XZR *) + 0x9a1f0063; (* arm_ADC X3 X3 XZR *) + 0xa9070c02; (* arm_STP X2 X3 X0 (Immediate_Offset (iword (&112))) *) + 0xd65f03c0 (* arm_RET X30 *) +];; + (* Execution rule for the machine code above, built by ARM_MK_EXEC_RULE and passed to the simulation tactics in the proofs that follow. *) +let BIGNUM_KSQR_32_64_NEON_EXEC = ARM_MK_EXEC_RULE bignum_ksqr_32_64_neon_mc;; + +(* ------------------------------------------------------------------------- *) +(* Proof for the inner-level 8->16 squaring. *) +(* ------------------------------------------------------------------------- *) + (* lemma1: the mask chosen by the nested conditionals on y0 <= y1 and x1 <= x0 (word 0 or word 18446744073709551615, complemented when y0 <= y1 is false) equals word_neg(word(bitval(y0 <= y1 <=> x0 < x1))). Used as a rewrite in ADK_48_TAC. *) +let lemma1 = prove + (`!(x0:num) x1 (y0:num) y1. + (if y0 <= y1 + then if x1 <= x0 then word 0 else word 18446744073709551615 + else word_not + (if x1 <= x0 then word 0 else word 18446744073709551615)):int64 = + word_neg(word(bitval(y0 <= y1 <=> x0 < x1)))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + CONV_TAC WORD_REDUCE_CONV);; + (* lemma2: over the reals, the product of the two conditionally negated word differences equals minus one to the power bitval(val y0 <= val y1 <=> val x0 < val x1), times (val x0 - val x1) * (val y1 - val y0). Used as a rewrite in ADK_48_TAC. *) +let lemma2 = prove + (`!(x0:int64) (x1:int64) (y0:int64) (y1:int64).
+ &(val(if val x1 <= val x0 then word_sub x0 x1 + else word_neg (word_sub x0 x1))) * + &(val(if val y0 <= val y1 then word_sub y1 y0 + else word_neg (word_sub y1 y0))):real = + --(&1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) * + (&(val x0) - &(val x1)) * (&(val y1) - &(val y0))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE; WORD_NEG_SUB] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o MATCH_MP (ARITH_RULE + `~(m:num <= n) ==> n <= m /\ ~(m <= n)`))) THEN + ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN + REAL_ARITH_TAC);; + (* ADK_48_TAC closes the three 4x4 block goals in the sublemma below: it applies EQUAL_FROM_CONGRUENT_REAL with witnesses 512 and &0, discharges the first two side conditions with BOUNDER_TAC and the third with REAL_INTEGER_TAC, brings the accumulator equations into the goal, rewrites with lemma1 and lemma2, case-splits the remaining conditionals, and finishes from the DESUM_RULE and DECARRY_RULE forms of those equations. *) +let ADK_48_TAC = + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`512`; `&0:real`] THEN + REPLICATE_TAC 2 (CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC]) THEN + CONJ_TAC THENL [REAL_INTEGER_TAC; ALL_TAC] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[lemma1; lemma2] THEN REWRITE_TAC[WORD_XOR_MASK] THEN + REPEAT(COND_CASES_TAC THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT]) THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES; DIMINDEX_64] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC;; + +needs "arm/proofs/neon_helper.ml";; + + (* Correctness of the inner 8-word squaring code: entered at pc + 0x858 with arguments z and x and the return address in X30, it returns with the 16 words at z equal to a EXP 2, where a is the 8-word bignum at x. *) +let BIGNUM_KSQR_32_64_NEON_SUBLEMMA = prove + (`!z x a pc returnaddress. + ALL (nonoverlapping (z,8 * 16)) + [(word pc,3596); (x,8 * 8)] + ==> ensures arm + (\s.
read PC s = returnaddress /\ + bignum_from_memory (z,16) s = a EXP 2) + (MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; + X13; X14; X15; X16; X17; X19; X20; X21; X22] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5; Q6; Q7; Q16; Q17; Q18; Q19; Q20; + Q21; Q22; Q23; Q30] ,, + MAYCHANGE [memory :> bytes(z,8 * 16)] ,, + MAYCHANGE SOME_FLAGS)`, + REWRITE_TAC[ADD_CLAUSES] THEN + + MAP_EVERY X_GEN_TAC [`z:int64`; `x:int64`; `a:num`; `pc:num`; `returnaddress:int64`] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; ALL; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_DIGITIZE_TAC "x_" `bignum_from_memory (x,8) s0` THEN + (* NOTE(review): presumably these record each 128-bit read from x as a join of two of the 64-bit digits x_0..x_7; the tactic is defined outside this chunk - confirm. *) BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 x) s0` `x_1:(64)word` `x_0:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 16:(64)word))) s0` + `x_3:(64)word` `x_2:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 32:(64)word))) s0` + `x_5:(64)word` `x_4:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 48:(64)word))) s0` + `x_7:(64)word` `x_6:(64)word` THEN + + (*** First nested mini-ADK 4x4 squaring block ***) + + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [49;51;53;55;79;81;83;85] + [WORD_SQR64_HI; WORD_SQR64_LO] + [10;11;25;35;37;41;43;44;45;46;48;49;50;52;53;54;56;58;60;62; + 64;68;70;72;74;76;77;80;81;82;84;85;86;88;92;96;98;100;102;104; + 106;108;110;114;116] + (1--118) THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_0;x_1;x_2;x_3] EXP 2 = + bignum_of_wordlist [mullo_s53; sum_s74; sum_s80; sum_s82; + sum_s108; sum_s110; sum_s114; sum_s116]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Second nested mini-ADK 4x4 squaring
block ***) + + (* NOTE(review): in the accumulation list below, steps 154, 152, 182 and 178 are not in ascending position; confirm the ordering is intended. *) + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [148;152;154;156;167;178;180;182] + [WORD_SQR64_HI; WORD_SQR64_LO] + [119;121;132;140;141;144;145;146;147;149;151;153;155;157;158;160;161; + 162;154;152;164;168;169;171;172;174;175;177;179;181;183;184;182;178; + 188;192;193;195;196;198;199;201;202] + (119--207) THEN + RULE_ASSUM_TAC (REWRITE_RULE + [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_LO]) THEN + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [208;209] [WORD_SQR64_HI; WORD_SQR64_LO] [208;209] + (208--210) THEN + RULE_ASSUM_TAC (REWRITE_RULE + [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_HI]) THEN + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [211;212] [WORD_SQR64_HI; WORD_SQR64_LO] [211;212;214;215] + (211--216) THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_4;x_5;x_6;x_7] EXP 2 = + bignum_of_wordlist [mullo_s154;sum_s172;sum_s177;sum_s179; + sum_s201;sum_s202;sum_s214;sum_s215]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + (* Unlike the other two blocks, not all accumulator theorems are dropped here: those whose printed conclusion passes the contains_str test against 208, 209, 211 or 212 are re-assumed for the multiplication block, and other assumptions of the form word a = b are discarded. NOTE(review): presumably this selects the equations from steps 208-212 by substring; confirm it cannot match unrelated numerals. *) let is_acc_thm_for_next acc_thm = + List.exists (contains_str (string_of_term (concl acc_thm))) + ["208";"209";"211";"212"] in + let filter_acc_thms_for_next acc_thms = + List.filter is_acc_thm_for_next acc_thms in + let wpat = `word a = b` in + ACCUMULATOR_POP_ASSUM_LIST( + fun acc_thms -> + let acc_thms = filter_acc_thms_for_next acc_thms in + (* NOTE(review): prints every retained theorem each time the proof runs; looks like leftover debugging output - confirm it should stay. *) List.iter (fun t -> Printf.printf "assuming: %s\n" t) + (List.map string_of_thm acc_thms); + MAP_EVERY ASSUME_TAC acc_thms) THEN + DISCARD_ASSUMPTIONS_TAC + (fun th -> can (term_match [] wpat) (concl th) && + not (is_acc_thm_for_next th))] THEN + + (*** Nested ADK 4x4 multiplication block ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [217;218;220;222;223;224;225;226;227;228;229;230;231;232;233; + 234;235;241;246;248;249;255;260;262;263;264;265;266;267;273;278; +
280;281;282;288;293;295;296;297;298;299;305;310;312;313;314;315; + 321;326;328;329;330;331] + (217--331) THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [x_4;x_5;x_6;x_7] = + bignum_of_wordlist + [mullo_s217; sum_s260; sum_s293; sum_s326; + sum_s328; sum_s329; sum_s330; sum_s331]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Final accumulation simulation and 16-digit focusing ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (332--364) (332--365) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + DISCARD_STATE_TAC "s365" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`1024`; `&0:real`] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC] THEN CONJ_TAC THENL + [EXPAND_TAC "a" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + + (*** The core rearrangement we are using: the square of a is the square of the low four digits, plus 2 pow 512 times the square of the high four digits, plus 2 pow 257 times their product ***) + + SUBGOAL_THEN + `(&a:real) pow 2 = + &(bignum_of_wordlist [x_0;x_1;x_2;x_3] EXP 2) + + &2 pow 512 * &(bignum_of_wordlist [x_4;x_5;x_6;x_7] EXP 2) + + &2 pow 257 * &(bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [x_4;x_5;x_6;x_7])` + SUBST1_TAC THENL + [EXPAND_TAC "a" THEN + REWRITE_TAC[bignum_of_wordlist; REAL_OF_NUM_CLAUSES] THEN ARITH_TAC; + ASM_REWRITE_TAC[]] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; + (* Simulates a call to the 8-word squaring code using BIGNUM_KSQR_32_64_NEON_SUBLEMMA, instantiated with z, x, a, pc and returnaddress taken as X0, X1, the 8-word bignum at X1, pc and X30 of the current state. *) +let BIGNUM_KSQR_32_64_NEON_SUBLEMMA_TAC = + ARM_SUBROUTINE_SIM_TAC +
(bignum_ksqr_32_64_neon_mc,BIGNUM_KSQR_32_64_NEON_EXEC, + 0x0,bignum_ksqr_32_64_neon_mc,BIGNUM_KSQR_32_64_NEON_SUBLEMMA) + [`read X0 s`; `read X1 s`; + `bignum_from_memory (read X1 s,8) s`; + `pc:num`; `read X30 s`];; + +(* ------------------------------------------------------------------------- *) +(* Proof now of the 16->32 squaring (like bignum_ksqr_16_32 proof). *) +(* ------------------------------------------------------------------------- *) + (* Core of the 16-word squaring: from pc + 0x5a8 with arguments z, x and t, execution reaches pc + 0x844 with the 32 words at z equal to a EXP 2, where a is the 16-word bignum at x. The 24 words at t are listed as modifiable; the precondition does not mention SP. *) +let BIGNUM_KSQR_32_64_NEON_LEMMA = prove + (`!z x a t pc. + nonoverlapping (z,8 * 32) (t,8 * 24) /\ + ALLPAIRS nonoverlapping + [(z,8 * 32); (t,8 * 24)] + [(word pc,3596); (x,8 * 16)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_ksqr_32_64_neon_mc /\ + read PC s = word(pc + 0x5a8) /\ + C_ARGUMENTS [z; x; t] s /\ + bignum_from_memory (x,16) s = a) + (\s. read PC s = word (pc + 0x844) /\ + bignum_from_memory (z,32) s = a EXP 2) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; + X11; X12; X13; X14; X15; X16; X17; X19; X20; X21; + X22; X23; X24; X25; X30] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5; Q6; Q7; Q16; Q17; Q18; Q19; Q20; + Q21; Q22; Q23; Q30] ,, + MAYCHANGE [memory :> bytes(z,8 * 32); + memory :> bytes(t,8 * 24)] ,, + MAYCHANGE SOME_FLAGS)`, + MAP_EVERY X_GEN_TAC + [`z:int64`; `x:int64`; `a:num`; `t:int64`;`pc:num`] THEN + REWRITE_TAC[ALLPAIRS; ALL; PAIRWISE] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_LDIGITIZE_TAC "x_" `bignum_from_memory (x,16) s0` THEN + + (*** First nested 8x8 squaring block ***) + + ARM_STEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (1--4) THEN + BIGNUM_KSQR_32_64_NEON_SUBLEMMA_TAC 5 THEN + BIGNUM_LDIGITIZE_TAC "l_" `read (memory :> bytes (z,8 * 16)) s5` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x = y EXP 2`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN
DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** Absolute difference computation: the 8 words at t in state s43 are shown to hold the absolute difference of the low and high 8-digit halves of x, with carry_s21 identified as the flag low < high ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [8;9;12;13;16;17;20;21; 25;27;30;32;35;37;40;42] (6--43) THEN + SUBGOAL_THEN + `&(bignum_from_memory(t,8) s43):real = + abs(&(bignum_of_wordlist [x_0;x_1;x_2;x_3;x_4;x_5;x_6;x_7]) - + &(bignum_of_wordlist [x_8;x_9;x_10;x_11;x_12;x_13;x_14;x_15]))` + MP_TAC THENL + [MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 8`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN CONJ_TAC THEN + MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &x < &y then &y - &x else &x - &y`] THEN + SUBGOAL_THEN + `carry_s21 <=> + bignum_of_wordlist [x_0;x_1;x_2;x_3;x_4;x_5;x_6;x_7] < + bignum_of_wordlist [x_8;x_9;x_10;x_11;x_12;x_13;x_14;x_15]` + (SUBST_ALL_TAC o SYM) THENL + [MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `512` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_UNMASK_64; WORD_XOR_MASK] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT; DIMINDEX_64]
THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC]; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV(RAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM)] THEN + + (*** Second nested squaring: after the call, the 16 words at z + 128 are digitized as h_0..h_15 ***) + + ARM_STEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (44--46) THEN + BIGNUM_KSQR_32_64_NEON_SUBLEMMA_TAC 47 THEN + BIGNUM_LDIGITIZE_TAC "h_" + `read (memory :> bytes (word_add z (word 128),8 * 16)) s47` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x = y EXP 2`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** Computation of H' = H + L_top ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [50;51;55;56;60;61;65;66;69;70;73;74;77;78;81;82] (48--83) THEN + SUBGOAL_THEN + `bignum_from_memory(word_add z (word 128),16) s83 = + bignum_of_wordlist + [h_0;h_1;h_2;h_3;h_4;h_5;h_6;h_7;h_8;h_9;h_10;h_11;h_12;h_13;h_14;h_15] + + bignum_of_wordlist[l_8;l_9;l_10;l_11;l_12;l_13;l_14;l_15]` + MP_TAC THENL + [REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 16`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN FIRST_X_ASSUM(fun th -> + GEN_REWRITE_TAC (LAND_CONV o LAND_CONV) [SYM th]) THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN + ASM_REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o
DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + CONV_TAC(LAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV)) THEN + ASM_REWRITE_TAC[] THEN DISCH_TAC] THEN + + (*** Third and final nested squaring ***) + + ARM_STEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (84--86) THEN + BIGNUM_KSQR_32_64_NEON_SUBLEMMA_TAC 87 THEN + BIGNUM_LDIGITIZE_TAC "m_" + `read (memory :> bytes (word_add t (word 64),8 * 16)) s87` THEN + FIRST_X_ASSUM + (MP_TAC o check (can (term_match [] `x = y EXP 2`) o concl)) THEN + CONV_TAC(LAND_CONV(RAND_CONV(LAND_CONV BIGNUM_LEXPAND_CONV))) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN(ASSUME_TAC o SYM) THEN + + (*** All remaining accumulation of sub-results ***) + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [90;91;94;95;98;99;102;103;106;107;110;111;114;115;118;119; + 122;123;126;127;130;131;134;135;138;139;142;143;146;147;150;151; 153; + 156;157;160;161;164;165;168;169] + (88--170) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 32`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN EXPAND_TAC "a" THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + SUBGOAL_THEN + `(&a:real) pow 2 = + (&(bignum_of_wordlist[l_0; l_1; l_2; l_3; l_4; l_5; l_6; l_7]) + + &2 pow 512 * + &(bignum_of_wordlist + [sum_s50; sum_s51; sum_s55; sum_s56; sum_s60; sum_s61; sum_s65; + sum_s66; sum_s69; sum_s70; sum_s73; sum_s74; sum_s77; sum_s78; + sum_s81; sum_s82])) * + (&2 pow 512 + &1) - + &2 pow 512 * + &(bignum_of_wordlist + [m_0; m_1; m_2; m_3; m_4; m_5; m_6; m_7; m_8; m_9; m_10; m_11; m_12; + m_13; m_14; m_15])`
+ SUBST1_TAC THENL + [ASM_REWRITE_TAC[] THEN REWRITE_TAC[REAL_OF_NUM_CLAUSES; ARITH_RULE + `l + e * (h + m):num = (l + e * m) + e * h`] THEN + REWRITE_TAC[GSYM(BIGNUM_OF_WORDLIST_SPLIT_RULE(8,8))] THEN + REPEAT(FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (ARITH_RULE + `x EXP 2 = y ==> y = x EXP 2`))) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (MESON[] + `abs x:real = y ==> y = abs x`)) THEN + REWRITE_TAC[REAL_POW2_ABS] THEN + EXPAND_TAC "a" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + REAL_ARITH_TAC; + ALL_TAC] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_LEXPAND_CONV) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_UNMASK_64; REAL_VAL_WORD_MASK; DIMINDEX_64; + COND_SWAP; GSYM WORD_BITVAL; VAL_WORD_BITVAL] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; + (* Subroutine form of the 16-word squaring: entered at pc + 0x598 with SP = stackpointer and X30 = returnaddress, it returns to returnaddress with the 32 words at z equal to a EXP 2. It is derived from BIGNUM_KSQR_32_64_NEON_LEMMA by ARM_ADD_RETURN_STACK_TAC with registers X19-X25 and X30 and a stack size of 64; the 64 bytes below stackpointer are listed as modifiable. *) +let BIGNUM_KSQR_32_64_NEON_SUBROUTINE_LEMMA = prove + (`!z x a t pc stackpointer returnaddress. + aligned 16 stackpointer /\ + PAIRWISE nonoverlapping + [(z,8 * 32); (t,8 * 24); (word_sub stackpointer (word 64),64)] /\ + ALLPAIRS nonoverlapping + [(z,8 * 32); (t,8 * 24); (word_sub stackpointer (word 64),64)] + [(word pc,3596); (x,8 * 16)] + ==> ensures arm + (\s. aligned_bytes_loaded s + (word (pc + 0x0)) bignum_ksqr_32_64_neon_mc /\ + read PC s = word(pc + 0x598) /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; t] s /\ + bignum_from_memory (x,16) s = a) + (\s.
read PC s = returnaddress /\ + bignum_from_memory (z,32) s = a EXP 2) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; + X11; X12; X13; X14; X15; X16; X17] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5; Q6; Q7; Q16; Q17; Q18; Q19; Q20; + Q21; Q22; Q23; Q30] ,, + MAYCHANGE [memory :> bytes(z,8 * 32); + memory :> bytes(t,8 * 24); + memory :> bytes(word_sub stackpointer (word 64),64)] ,, + MAYCHANGE SOME_FLAGS)`, + REWRITE_TAC[ADD_CLAUSES] THEN + (* NOTE(review): presumably this lifts BIGNUM_KSQR_32_64_NEON_LEMMA to the subroutine statement by accounting for the listed registers being saved and restored in 64 bytes of stack; the tactic is defined outside this chunk - confirm. *) ARM_ADD_RETURN_STACK_TAC + BIGNUM_KSQR_32_64_NEON_EXEC BIGNUM_KSQR_32_64_NEON_LEMMA + `[X19;X20;X21;X22;X23;X24;X25;X30]` 64);; + (* Simulates a call to the 16-word squaring subroutine using BIGNUM_KSQR_32_64_NEON_SUBROUTINE_LEMMA, instantiated with z, x, a, t, pc, stackpointer and returnaddress taken as X0, X1, the 16 words at X1, X2, pc, SP and X30 of the current state. *) +let BIGNUM_KSQR_32_64_NEON_LEMMA_TAC = + ARM_SUBROUTINE_SIM_TAC + (bignum_ksqr_32_64_neon_mc,BIGNUM_KSQR_32_64_NEON_EXEC, + 0x0,bignum_ksqr_32_64_neon_mc,BIGNUM_KSQR_32_64_NEON_SUBROUTINE_LEMMA) + [`read X0 s`; `read X1 s`; + `read (memory :> bytes (read X1 s,8 * 16)) s`; + `read X2 s`; `pc:num`; `read SP s`; `read X30 s`];; + +(* ------------------------------------------------------------------------- *) +(* Now the overall proof *) +(* ------------------------------------------------------------------------- *) + +let BIGNUM_KSQR_32_64_NEON_SUBROUTINE_CORRECT = prove + (`!z x t a pc stackpointer returnaddress. + aligned 16 stackpointer /\ + PAIRWISE nonoverlapping + [(z,8 * 64); (t,8 * 72); (word_sub stackpointer (word 96),96)] /\ + ALLPAIRS nonoverlapping + [(z,8 * 64); (t,8 * 72); (word_sub stackpointer (word 96),96)] + [(word pc,3596); (x,8 * 32)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_ksqr_32_64_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; t] s /\ + bignum_from_memory (x,32) s = a) + (\s.
read PC s = returnaddress /\ + bignum_from_memory (z,64) s = a EXP 2) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(z,8 * 64); + memory :> bytes(t,8 * 72); + memory :> bytes(word_sub stackpointer (word 96),96)])`, + MAP_EVERY X_GEN_TAC [`z:int64`; `x:int64`; `t:int64`; `a:num`; `pc:num`] THEN + WORD_FORALL_OFFSET_TAC 96 THEN + MAP_EVERY X_GEN_TAC [`stackpointer:int64`; `returnaddress:int64`] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; + MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] THEN + REWRITE_TAC[ALL; ALLPAIRS; PAIRWISE; NONOVERLAPPING_CLAUSES] THEN + STRIP_TAC THEN + + (*** Start and end boilerplate for save and restore of registers ***) + + SUBGOAL_THEN + `ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_ksqr_32_64_neon_mc /\ + read PC s = word(pc + 0x8) /\ + read SP s = word_add stackpointer (word 64) /\ + C_ARGUMENTS [z; x; t] s /\ + bignum_from_memory (x,32) s = a) + (\s. read PC s = word(pc + 0x58c) /\ + bignum_from_memory (z,64) s = a EXP 2) + (MAYCHANGE [PC; X0; X1; X2; X3; X4; X5; X6; X7; X8; X9; X10; + X11; X12; X13; X14; X15; X16; X17; X19; X20; X21; X30] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5; Q6; Q7; Q16; Q17; Q18; Q19; Q20; + Q21; Q22; Q23; Q30] ,, + MAYCHANGE [memory :> bytes(z,8 * 64); memory :> bytes(t,8 * 72); + memory :> bytes(stackpointer,64)] ,, + MAYCHANGE SOME_FLAGS)` + MP_TAC THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THENL + [ENSURES_EXISTING_PRESERVED_TAC `SP`; + DISCH_THEN(fun th -> + ENSURES_EXISTING_PRESERVED_TAC `SP` THEN + ENSURES_PRESERVED_TAC "x19_init" `X19` THEN + ENSURES_PRESERVED_TAC "x20_init" `X20` THEN + ENSURES_PRESERVED_TAC "x21_init" `X21` THEN + ENSURES_EXISTING_PRESERVED_TAC `X30` THEN + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (1--2) THEN + MP_TAC th) THEN + ARM_BIGSTEP_TAC BIGNUM_KSQR_32_64_NEON_EXEC "s3" THEN + ARM_STEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (4--6) THEN + 
ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[]] THEN + + (*** Initialization and splitting of the input ***) + + REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + BIGNUM_TERMRANGE_TAC `32` `a:num` THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + MP_TAC(ISPECL [`x:int64`; `16`; `16`] BIGNUM_FROM_MEMORY_SPLIT) THEN + CONV_TAC(LAND_CONV(ONCE_DEPTH_CONV(NUM_ADD_CONV ORELSEC NUM_MULT_CONV))) THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + ENSURES_INIT_TAC "s0" THEN + MAP_EVERY ABBREV_TAC + [`ahi = read (memory :> bytes (word_add x (word 128),8 * 16)) s0`; + `alo = read (memory :> bytes (x,8 * 16)) s0`] THEN + + (*** First nested squaring: low part ***) + + ARM_STEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (1--4) THEN + BIGNUM_KSQR_32_64_NEON_LEMMA_TAC 5 THEN + + (*** Second nested squaring: high part ***) + + ARM_STEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (6--9) THEN + BIGNUM_KSQR_32_64_NEON_LEMMA_TAC 10 THEN + + (*** Split L into L_top and L_bot and form H' = H + L_top ***) + + MP_TAC(ISPECL [`z:int64`; `16`; `16`; `s10:armstate`] + BIGNUM_FROM_MEMORY_SPLIT) THEN + CONV_TAC(LAND_CONV(ONCE_DEPTH_CONV(NUM_ADD_CONV ORELSEC NUM_MULT_CONV))) THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN DISCH_THEN SUBST_ALL_TAC THEN + MAP_EVERY ABBREV_TAC + [`ltop = read (memory :> bytes (word_add z (word 128),8 * 16)) s10`; + `lbot = read (memory :> bytes (z,8 * 16)) s10`; + `h = read (memory :> bytes (word_add z (word 256),8 * 32)) s10`] THEN + + BIGNUM_LDIGITIZE_TAC "ltop_" + `read (memory :> bytes (word_add z (word 128),8 * 16)) s10` THEN + BIGNUM_LDIGITIZE_TAC "h_" + `read (memory :> bytes (word_add z (word 256),8 * 32)) s10` THEN + + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [13; 14; 18; 19; 23; 24; 28; 29; 33; 34; 38; 39; 43; 44; 48; 49; + 52; 53; 56; 57; 60; 61; 64; 65; 68; 69; 72; 73; 76; 77; 80; 81] + (11--82) THEN + + SUBGOAL_THEN `bignum_from_memory(word_add z (word 256),32) s82 = h + ltop` + MP_TAC THENL + [REWRITE_TAC[GSYM 
REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 32`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [ASM_REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + MAP_EVERY EXPAND_TAC ["ahi"; "ltop"] THEN + MATCH_MP_TAC(ARITH_RULE + `x <= (2 EXP (64 * 16) - 1) EXP 2 /\ + y + (2 EXP 1024 - 1) EXP 2 < e + ==> x + y < e`) THEN + CONJ_TAC THENL + [MATCH_MP_TAC EXP_MONO_LE_IMP THEN + MATCH_MP_TAC(ARITH_RULE `x < e ==> x <= e - 1`) THEN + REWRITE_TAC[BIGNUM_FROM_MEMORY_BOUND; GSYM BIGNUM_FROM_MEMORY_BYTES]; + CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC NUM_REDUCE_CONV THEN BOUNDER_TAC[]]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + MAP_EVERY EXPAND_TAC ["h"; "ltop"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN + + (*** Throw away h and digitizations, use h in place of h' now ***) + + DISCARD_MATCHING_ASSUMPTIONS [`read (memory :> bytes64 x) s = y`] THEN + UNDISCH_THEN `h = ahi EXP 2` SUBST1_TAC THEN + DISCARD_MATCHING_ASSUMPTIONS [`bignum_of_wordlist l = n`] THEN + ABBREV_TAC `h = ahi EXP 2 + ltop` THEN DISCH_TAC THEN + + (*** Absolute difference computation ***) + + BIGNUM_LDIGITIZE_TAC "xl_" `read (memory :> bytes (x,8 * 16)) s82` THEN + BIGNUM_LDIGITIZE_TAC "xh_" + `read (memory :> bytes (word_add x (word 128),8 * 16)) s82` THEN + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [85; 86; 89; 90; 93; 94; 97; 98; 101; 102; 105; 106; 109; 110; 113; 114] + 
(83--116) THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; WORD_SUB_LZERO]) THEN + SUBGOAL_THEN + `2 EXP 64 <= val(word_neg (word (bitval carry_s114)):int64) + + val(word_neg (word (bitval carry_s114)):int64) <=> + carry_s114` + SUBST_ALL_TAC THENL + [POP_ASSUM_LIST(K ALL_TAC) THEN BOOL_CASES_TAC `carry_s114:bool` THEN + REWRITE_TAC[BITVAL_CLAUSES] THEN CONV_TAC WORD_REDUCE_CONV THEN + CONV_TAC NUM_REDUCE_CONV; + ALL_TAC] THEN + SUBGOAL_THEN `carry_s114 <=> ahi < alo` (ASSUME_TAC o SYM) THENL + [MAP_EVERY EXPAND_TAC ["ahi"; "alo"] THEN + MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `1024` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [118; 120; 123; 125; 128; 130; 133; 135; 138; 140; + 143; 145; 148; 150; 153; 155] + (117--156) THEN + SUBGOAL_THEN + `&(read (memory :> bytes (t,8 * 16)) s156):real = abs(&alo - &ahi)` + ASSUME_TAC THENL + [REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 16`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_ABS_POS] THEN MATCH_MP_TAC(REAL_ARITH + `&x < e /\ &y < e ==> abs(&x - &y):real < e`) THEN + REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN + MAP_EVERY EXPAND_TAC ["ahi"; "alo"] THEN + CONJ_TAC THEN MATCH_MP_TAC BIGNUM_OF_WORDLIST_BOUND THEN + REWRITE_TAC[LENGTH] THEN ARITH_TAC; + REWRITE_TAC[INTEGER_CLOSED]] THEN + ASM_REWRITE_TAC[REAL_OF_NUM_LT; REAL_ARITH + `abs(&x - &y):real = if &y < &x then &x - &y else &y - &x`] THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + MAP_EVERY EXPAND_TAC ["ahi"; "alo"] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + REWRITE_TAC[bignum_of_wordlist; GSYM 
REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + REWRITE_TAC[WORD_XOR_MASK] THEN + ASM_CASES_TAC `carry_s114:bool` THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + REWRITE_TAC[REAL_VAL_WORD_NOT; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN + + (*** Discard elementwise assignments and things to do with x ***) + + DISCARD_MATCHING_ASSUMPTIONS [`read (memory :> bytes64 x) s = y`] THEN + DISCARD_MATCHING_ASSUMPTIONS [`bignum_of_wordlist l = n`] THEN + REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (free_in `x:int64` o concl))) THEN + + (*** Third and final nested squaring ***) + + ABBREV_TAC `m = read (memory :> bytes (t,8 * 16)) s156` THEN + ARM_STEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC (157--160) THEN + BIGNUM_KSQR_32_64_NEON_LEMMA_TAC 161 THEN + + (*** All remaining accumulation of sub-results ***) + + BIGNUM_LDIGITIZE_TAC "l_" `read (memory :> bytes (z,8 * 16)) s161` THEN + BIGNUM_LDIGITIZE_TAC "m_" + `read (memory :> bytes (word_add t (word 128),8 * 32)) s161` THEN + BIGNUM_LDIGITIZE_TAC "h_" + `read (memory :> bytes (word_add z (word 256),8 * 32)) s161` THEN + ARM_ACCSTEPS_TAC BIGNUM_KSQR_32_64_NEON_EXEC + [164; 165; 169; 170; 174; 175; 179; 180; 184; 185; 189; 190; + 194; 195; 199; 200; 204; 205; 209; 210; 214; 215; 219; 220; + 224; 225; 229; 230; 234; 235; 239; 240; 245; 246; 250; 251; + 255; 256; 260; 261; 265; 266; 270; 271; 275; 276; 280; 281; + 285; 286; 290; 291; 295; 296; 300; 301; 305; 306; 310; 311; + 315; 316; 320; 321; 323; 326; 327; 330; 331; 334; 335; 338; + 339; 342; 343; 346; 347; 350; 351; 354; 355] + (162--356) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`64 * 64`; `&0:real`] THEN CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0] THEN + 
REWRITE_TAC[GSYM BIGNUM_FROM_MEMORY_BYTES; BIGNUM_FROM_MEMORY_BOUND]; + ALL_TAC] THEN + CONJ_TAC THENL + [REWRITE_TAC[REAL_OF_NUM_CLAUSES; LE_0; BIGNUM_FROM_MEMORY_BOUND] THEN + REWRITE_TAC[EXP_ADD; ARITH_RULE `64 * 64 = 64 * 32 + 64 * 32`] THEN + ASM_SIMP_TAC[EXP_2; LT_MULT2]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + UNDISCH_THEN `2 EXP 1024 * ahi + alo = a` (SUBST1_TAC o SYM) THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + ONCE_REWRITE_TAC[REAL_ARITH + `(e * h + l:real) pow 2 = + l pow 2 + e pow 2 * h pow 2 + + e * (h pow 2 + l pow 2 - (l - h) pow 2)`] THEN + ONCE_REWRITE_TAC[GSYM REAL_POW2_ABS] THEN + UNDISCH_THEN `&m:real = abs(&alo - &ahi)` (SUBST1_TAC o SYM) THEN + REWRITE_TAC[REAL_POW2_ABS] THEN REWRITE_TAC[REAL_OF_NUM_CLAUSES] THEN + REPEAT(FIRST_X_ASSUM(SUBST1_TAC o MATCH_MP (ARITH_RULE + `a = b EXP 2 ==> b EXP 2 = a`))) THEN + UNDISCH_TAC `ahi EXP 2 + ltop = h` THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN DISCH_THEN(SUBST1_TAC o MATCH_MP + (REAL_ARITH `a + b:real = c ==> a = c - b`)) THEN + CONV_TAC(ONCE_DEPTH_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + MAP_EVERY EXPAND_TAC ["lbot"; "h"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_UNMASK_64]) THEN + RULE_ASSUM_TAC(REWRITE_RULE[COND_SWAP; GSYM WORD_BITVAL]) THEN + CONV_TAC NUM_REDUCE_CONV THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; diff --git a/arm/proofs/bignum_mul_8_16_neon.ml b/arm/proofs/bignum_mul_8_16_neon.ml new file mode 100644 index 00000000..625cbfa3 --- /dev/null +++ b/arm/proofs/bignum_mul_8_16_neon.ml @@ -0,0 +1,1015 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC + *) + +(* ========================================================================= *) +(* 8x8 -> 16 multiply (pure Karatsuba and then ADK for the 4x4 bits). 
*) +(* ========================================================================= *) + +(**** print_literal_from_elf "arm/fastmul/bignum_mul_8_16_neon.o";; + ****) + +let bignum_mul_8_16_neon_mc = define_assert_from_elf "bignum_mul_8_16_neon_mc" "arm/fastmul/bignum_mul_8_16_neon.o" +[ + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf5bf5; (* arm_STP X21 X22 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf63f7; (* arm_STP X23 X24 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9401023; (* arm_LDP X3 X4 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc00020; (* arm_LDR Q0 X1 (Immediate_Offset (word 0)) *) + 0xa9402047; (* arm_LDP X7 X8 X2 (Immediate_Offset (iword (&0))) *) + 0x3dc00041; (* arm_LDR Q1 X2 (Immediate_Offset (word 0)) *) + 0xa9411825; (* arm_LDP X5 X6 X1 (Immediate_Offset (iword (&16))) *) + 0x3dc00422; (* arm_LDR Q2 X1 (Immediate_Offset (word 16)) *) + 0xa9412849; (* arm_LDP X9 X10 X2 (Immediate_Offset (iword (&16))) *) + 0x3dc00443; (* arm_LDR Q3 X2 (Immediate_Offset (word 16)) *) + 0x4e801824; (* arm_UZIP1 Q4 Q1 Q0 32 *) + 0x4ea00821; (* arm_REV64_VEC Q1 Q1 32 *) + 0x4e801805; (* arm_UZIP1 Q5 Q0 Q0 32 *) + 0x4ea09c20; (* arm_MUL_VEC Q0 Q1 Q0 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x2ea480a0; (* arm_UMLAL Q0 Q5 Q4 32 *) + 0x4e083c0b; (* arm_UMOV X11 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0x4e821860; (* arm_UZIP1 Q0 Q3 Q2 32 *) + 0x4ea00861; (* arm_REV64_VEC Q1 Q3 32 *) + 0x4e821843; (* arm_UZIP1 Q3 Q2 Q2 32 *) + 0x4ea29c21; (* arm_MUL_VEC Q1 Q1 Q2 32 *) + 0x6ea02821; (* arm_UADDLP Q1 Q1 32 *) + 0x4f605421; (* arm_SHL_VEC Q1 Q1 32 64 *) + 0x2ea08061; (* arm_UMLAL Q1 Q3 Q0 32 *) + 0x4e083c30; (* arm_UMOV X16 Q1 0 8 *) + 0x4e183c31; (* arm_UMOV X17 Q1 1 8 *) + 0x3dc00820; (* arm_LDR Q0 X1 (Immediate_Offset (word 32)) *) + 0x3dc00841; (* arm_LDR Q1 X2 (Immediate_Offset (word 32)) *) + 0x3dc00c22; (* arm_LDR Q2 X1 (Immediate_Offset (word 48)) *) + 
0x3dc00c43; (* arm_LDR Q3 X2 (Immediate_Offset (word 48)) *) + 0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x4e801824; (* arm_UZIP1 Q4 Q1 Q0 32 *) + 0x4ea00821; (* arm_REV64_VEC Q1 Q1 32 *) + 0x4e801805; (* arm_UZIP1 Q5 Q0 Q0 32 *) + 0x4ea09c20; (* arm_MUL_VEC Q0 Q1 Q0 32 *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x2ea480a0; (* arm_UMLAL Q0 Q5 Q4 32 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 
X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 
0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9421023; (* arm_LDP X3 X4 X1 (Immediate_Offset (iword (&32))) *) + 0xa900300b; (* arm_STP X11 X12 X0 (Immediate_Offset (iword (&0))) *) + 0xa9422047; (* arm_LDP X7 X8 X2 (Immediate_Offset (iword (&32))) *) + 0xa901380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&16))) *) + 0xa9431825; (* arm_LDP X5 X6 X1 (Immediate_Offset (iword (&48))) *) + 0xa902400f; (* arm_STP X15 X16 X0 (Immediate_Offset (iword (&32))) *) + 0xa9432849; (* arm_LDP X9 X10 X2 (Immediate_Offset (iword (&48))) *) + 0xa9034c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&48))) *) + 0x4e083c0b; (* arm_UMOV X11 Q0 0 8 *) + 0x4e183c0f; (* arm_UMOV X15 Q0 1 8 *) + 0x4e821860; (* arm_UZIP1 Q0 Q3 Q2 32 *) + 0x4ea00861; (* 
arm_REV64_VEC Q1 Q3 32 *) + 0x4e821843; (* arm_UZIP1 Q3 Q2 Q2 32 *) + 0x4ea29c21; (* arm_MUL_VEC Q1 Q1 Q2 32 *) + 0x6ea02821; (* arm_UADDLP Q1 Q1 32 *) + 0x4f605421; (* arm_SHL_VEC Q1 Q1 32 64 *) + 0x2ea08061; (* arm_UMLAL Q1 Q3 Q0 32 *) + 0x4e083c30; (* arm_UMOV X16 Q1 0 8 *) + 0x4e183c31; (* arm_UMOV X17 Q1 1 8 *) + 0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xa9425416; (* arm_LDP X22 X21 X0 (Immediate_Offset (iword (&32))) *) + 0xab16016b; (* arm_ADDS X11 X11 X22 *) + 0xba15018c; (* arm_ADCS X12 X12 X21 *) + 0xa9435416; (* arm_LDP X22 X21 X0 (Immediate_Offset (iword (&48))) *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1f01ef; (* arm_ADCS X15 X15 XZR *) + 0xba1f0210; (* arm_ADCS X16 X16 XZR *) + 0xba1f0231; (* arm_ADCS X17 X17 XZR *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 
X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 
1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9405436; (* arm_LDP X22 X21 X1 (Immediate_Offset (iword (&0))) *) + 0xeb160063; (* arm_SUBS X3 X3 X22 *) + 0xfa150084; (* arm_SBCS X4 X4 X21 *) + 0xa9415436; (* arm_LDP X22 X21 X1 (Immediate_Offset (iword (&16))) *) + 0xfa1600a5; (* arm_SBCS X5 X5 X22 *) + 0xfa1500c6; (* 
arm_SBCS X6 X6 X21 *) + 0xda9f23f8; (* arm_CSETM X24 Condition_CC *) + 0xa904300b; (* arm_STP X11 X12 X0 (Immediate_Offset (iword (&64))) *) + 0xa9405456; (* arm_LDP X22 X21 X2 (Immediate_Offset (iword (&0))) *) + 0xeb0702c7; (* arm_SUBS X7 X22 X7 *) + 0xfa0802a8; (* arm_SBCS X8 X21 X8 *) + 0xa9415456; (* arm_LDP X22 X21 X2 (Immediate_Offset (iword (&16))) *) + 0xfa0902c9; (* arm_SBCS X9 X22 X9 *) + 0xfa0a02aa; (* arm_SBCS X10 X21 X10 *) + 0xda9f23e1; (* arm_CSETM X1 Condition_CC *) + 0xa905380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&80))) *) + 0xca180063; (* arm_EOR X3 X3 X24 *) + 0xeb180063; (* arm_SUBS X3 X3 X24 *) + 0xca180084; (* arm_EOR X4 X4 X24 *) + 0xfa180084; (* arm_SBCS X4 X4 X24 *) + 0xca1800a5; (* arm_EOR X5 X5 X24 *) + 0xfa1800a5; (* arm_SBCS X5 X5 X24 *) + 0xca1800c6; (* arm_EOR X6 X6 X24 *) + 0xda1800c6; (* arm_SBC X6 X6 X24 *) + 0xa906400f; (* arm_STP X15 X16 X0 (Immediate_Offset (iword (&96))) *) + 0xca0100e7; (* arm_EOR X7 X7 X1 *) + 0xeb0100e7; (* arm_SUBS X7 X7 X1 *) + 0xca010108; (* arm_EOR X8 X8 X1 *) + 0xfa010108; (* arm_SBCS X8 X8 X1 *) + 0xca010129; (* arm_EOR X9 X9 X1 *) + 0xfa010129; (* arm_SBCS X9 X9 X1 *) + 0xca01014a; (* arm_EOR X10 X10 X1 *) + 0xda01014a; (* arm_SBC X10 X10 X1 *) + 0xa9074c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&112))) *) + 0xca180021; (* arm_EOR X1 X1 X24 *) + 0x9b077c6b; (* arm_MUL X11 X3 X7 *) + 0x9b087c8f; (* arm_MUL X15 X4 X8 *) + 0x9b097cb0; (* arm_MUL X16 X5 X9 *) + 0x9b0a7cd1; (* arm_MUL X17 X6 X10 *) + 0x9bc77c73; (* arm_UMULH X19 X3 X7 *) + 0xab1301ef; (* arm_ADDS X15 X15 X19 *) + 0x9bc87c93; (* arm_UMULH X19 X4 X8 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9bc97cb3; (* arm_UMULH X19 X5 X9 *) + 0xba130231; (* arm_ADCS X17 X17 X19 *) + 0x9bca7cd3; (* arm_UMULH X19 X6 X10 *) + 0x9a1f0273; (* arm_ADC X19 X19 XZR *) + 0xab0b01ec; (* arm_ADDS X12 X15 X11 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0xba110271; (* arm_ADCS X17 X19 X17 *) + 
0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xab0b01ed; (* arm_ADDS X13 X15 X11 *) + 0xba0c020e; (* arm_ADCS X14 X16 X12 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba100270; (* arm_ADCS X16 X19 X16 *) + 0xba1103f1; (* arm_ADCS X17 XZR X17 *) + 0x9a1303f3; (* arm_ADC X19 XZR X19 *) + 0xeb0600b8; (* arm_SUBS X24 X5 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb090155; (* arm_SUBS X21 X10 X9 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba160210; (* arm_ADCS X16 X16 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150231; (* arm_ADCS X17 X17 X21 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb040078; (* arm_SUBS X24 X3 X4 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070115; (* arm_SUBS X21 X8 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba16018c; (* arm_ADCS X12 X12 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060098; (* arm_SUBS X24 X4 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080155; (* arm_SUBS X21 X10 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV 
X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ef; (* arm_ADCS X15 X15 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba150210; (* arm_ADCS X16 X16 X21 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050078; (* arm_SUBS X24 X3 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070135; (* arm_SUBS X21 X9 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ad; (* arm_ADCS X13 X13 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb060078; (* arm_SUBS X24 X3 X6 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb070155; (* arm_SUBS X21 X10 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xeb050098; (* arm_SUBS X24 X4 X5 *) + 0xda982718; (* arm_CNEG X24 X24 Condition_CC *) + 0xda9f23f4; (* arm_CSETM X20 Condition_CC *) + 0xeb080135; (* arm_SUBS X21 X9 X8 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x9b157f16; (* arm_MUL X22 X24 
X21 *) + 0x9bd57f15; (* arm_UMULH X21 X24 X21 *) + 0xda942294; (* arm_CINV X20 X20 Condition_CC *) + 0xb100069f; (* arm_CMN X20 (rvalue (word 1)) *) + 0xca1402d6; (* arm_EOR X22 X22 X20 *) + 0xba1601ce; (* arm_ADCS X14 X14 X22 *) + 0xca1402b5; (* arm_EOR X21 X21 X20 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0xba140231; (* arm_ADCS X17 X17 X20 *) + 0x9a140273; (* arm_ADC X19 X19 X20 *) + 0xa9401003; (* arm_LDP X3 X4 X0 (Immediate_Offset (iword (&0))) *) + 0xa9442007; (* arm_LDP X7 X8 X0 (Immediate_Offset (iword (&64))) *) + 0xab070063; (* arm_ADDS X3 X3 X7 *) + 0xba080084; (* arm_ADCS X4 X4 X8 *) + 0xa9411805; (* arm_LDP X5 X6 X0 (Immediate_Offset (iword (&16))) *) + 0xa9452809; (* arm_LDP X9 X10 X0 (Immediate_Offset (iword (&80))) *) + 0xba0900a5; (* arm_ADCS X5 X5 X9 *) + 0xba0a00c6; (* arm_ADCS X6 X6 X10 *) + 0xa9465414; (* arm_LDP X20 X21 X0 (Immediate_Offset (iword (&96))) *) + 0xba1400e7; (* arm_ADCS X7 X7 X20 *) + 0xba150108; (* arm_ADCS X8 X8 X21 *) + 0xa9475c16; (* arm_LDP X22 X23 X0 (Immediate_Offset (iword (&112))) *) + 0xba160129; (* arm_ADCS X9 X9 X22 *) + 0xba17014a; (* arm_ADCS X10 X10 X23 *) + 0xba1f0038; (* arm_ADCS X24 X1 XZR *) + 0x9a1f0022; (* arm_ADC X2 X1 XZR *) + 0xb100043f; (* arm_CMN X1 (rvalue (word 1)) *) + 0xca01016b; (* arm_EOR X11 X11 X1 *) + 0xba030163; (* arm_ADCS X3 X11 X3 *) + 0xca01018c; (* arm_EOR X12 X12 X1 *) + 0xba040184; (* arm_ADCS X4 X12 X4 *) + 0xca0101ad; (* arm_EOR X13 X13 X1 *) + 0xba0501a5; (* arm_ADCS X5 X13 X5 *) + 0xca0101ce; (* arm_EOR X14 X14 X1 *) + 0xba0601c6; (* arm_ADCS X6 X14 X6 *) + 0xca0101ef; (* arm_EOR X15 X15 X1 *) + 0xba0701e7; (* arm_ADCS X7 X15 X7 *) + 0xca010210; (* arm_EOR X16 X16 X1 *) + 0xba080208; (* arm_ADCS X8 X16 X8 *) + 0xca010231; (* arm_EOR X17 X17 X1 *) + 0xba090229; (* arm_ADCS X9 X17 X9 *) + 0xca010273; (* arm_EOR X19 X19 X1 *) + 0xba0a026a; (* arm_ADCS X10 X19 X10 *) + 0xba180294; (* arm_ADCS X20 X20 X24 *) + 0xba0202b5; (* arm_ADCS X21 X21 X2 
*) + 0xba0202d6; (* arm_ADCS X22 X22 X2 *) + 0x9a0202f7; (* arm_ADC X23 X23 X2 *) + 0xa9021003; (* arm_STP X3 X4 X0 (Immediate_Offset (iword (&32))) *) + 0xa9031805; (* arm_STP X5 X6 X0 (Immediate_Offset (iword (&48))) *) + 0xa9042007; (* arm_STP X7 X8 X0 (Immediate_Offset (iword (&64))) *) + 0xa9052809; (* arm_STP X9 X10 X0 (Immediate_Offset (iword (&80))) *) + 0xa9065414; (* arm_STP X20 X21 X0 (Immediate_Offset (iword (&96))) *) + 0xa9075c16; (* arm_STP X22 X23 X0 (Immediate_Offset (iword (&112))) *) + 0xa8c163f7; (* arm_LDP X23 X24 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c15bf5; (* arm_LDP X21 X22 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0 (* arm_RET X30 *) +];; + +let BIGNUM_MUL_8_16_NEON_EXEC = ARM_MK_EXEC_RULE bignum_mul_8_16_neon_mc;; + +(* ------------------------------------------------------------------------- *) +(* Lemmas to halve the number of case splits, useful for efficiency. *) +(* ------------------------------------------------------------------------- *) +(* lemma1: the nested conditional choosing between word 0 and the all-ones word 18446744073709551615, inverted by word_not when y0 <= y1 fails, collapses to the single mask word_neg(word(bitval(y0 <= y1 <=> x0 < x1))). *) +let lemma1 = prove + (`!(x0:num) x1 (y0:num) y1. + (if y0 <= y1 + then if x1 <= x0 then word 0 else word 18446744073709551615 + else word_not + (if x1 <= x0 then word 0 else word 18446744073709551615)):int64 = + word_neg(word(bitval(y0 <= y1 <=> x0 < x1)))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + CONV_TAC WORD_REDUCE_CONV);; +(* lemma2: over the reals, the product of the two conditionally negated 64-bit word differences equals the signed product of (val x0 - val x1) and (val y1 - val y0), with sign factor -1 raised to bitval(val y0 <= val y1 <=> val x0 < val x1). *) +let lemma2 = prove + (`!(x0:int64) (x1:int64) (y0:int64) (y1:int64).
+ &(val(if val x1 <= val x0 then word_sub x0 x1 + else word_neg (word_sub x0 x1))) * + &(val(if val y0 <= val y1 then word_sub y1 y0 + else word_neg (word_sub y1 y0))):real = + --(&1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) * + (&(val x0) - &(val x1)) * (&(val y1) - &(val y0))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE; WORD_NEG_SUB] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o MATCH_MP (ARITH_RULE + `~(m:num <= n) ==> n <= m /\ ~(m <= n)`))) THEN + ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN + REAL_ARITH_TAC);; + +(* Taking a 32-bit field of either 64-bit half of a 128-bit word y is the same as taking the corresponding 32-bit field (bit offset 0, 32, 64 or 96) of y directly. *) +let WORD_128_SUBWORD_SUBWORD_32 = prove(`!y. + word_subword (word_subword (y:(128)word) (0,64):(64)word) (0,32):(32)word = + word_subword (y:(128)word) (0,32):(32)word /\ + word_subword (word_subword (y:(128)word) (64,64):(64)word) (0,32):(32)word = + word_subword (y:(128)word) (64,32):(32)word /\ + word_subword (word_subword (y:(128)word) (0,64):(64)word) (32,32):(32)word = + word_subword (y:(128)word) (32,32):(32)word /\ + word_subword (word_subword (y:(128)word) (64,64):(64)word) (32,32):(32)word = + word_subword (y:(128)word) (96,32):(32)word`, + CONV_TAC WORD_BLAST);; + +(* In the 64-bit word_join x y of two 32-bit words, the field at bit offset 0 is y and the field at bit offset 32 is x. *) +let WORD_SUBWORD_JOIN_64 = prove(`!(x:(32)word) (y:(32)word). + word_subword (word_join (x:(32)word) (y:(32)word): (64)word) (0,32) = y /\ + word_subword (word_join (x:(32)word) (y:(32)word): (64)word) (32,32) = x`, + CONV_TAC WORD_BLAST);; + +(* In the 128-bit word_join x y of two 64-bit words, the field at bit offset 0 is y and the field at bit offset 64 is x. *) +let WORD_SUBWORD_JOIN_128_64 = prove(`!(x:(64)word) (y:(64)word).
+ word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (0,64) = y /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (64,64) = x`, + CONV_TAC WORD_BLAST);; + +(* A 32-bit field of the 128-bit word_join x y of two 64-bit words comes from y for bit offsets 0 and 32, and from x (at offsets 0 and 32 within x) for bit offsets 64 and 96. *) +let WORD_SUBWORD_JOIN_128_32 = prove(`!(x:(64)word) (y:(64)word). + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (0,32):(32)word = + word_subword (y:(64)word) (0,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (32,32):(32)word = + word_subword (y:(64)word) (32,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (64,32):(32)word = + word_subword (x:(64)word) (0,32):(32)word /\ + word_subword (word_join (x:(64)word) (y:(64)word): (128)word) (96,32):(32)word = + word_subword (x:(64)word) (32,32):(32)word`, + CONV_TAC WORD_BLAST);; +(* lemma4: reducing b and c modulo 2 EXP 32 before they are added in at bit position 32 leaves bits 32..63 of the sum unchanged. The proof splits a into ahi and alo around 2 EXP 32 and regroups both sides into the quotient-remainder shape that DIV_UNIQ is applied to. *) +let lemma4 = prove(`!a b c. + ((a + 2 EXP 32 * (b MOD 2 EXP 32 + c MOD 2 EXP 32)) DIV 2 EXP 32) MOD 2 EXP 32 = + ((a + 2 EXP 32 * (b + c)) DIV 2 EXP 32) MOD 2 EXP 32`, + REPEAT STRIP_TAC THEN + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Ha_" ^ suffix) thm) + (zip (CONJUNCTS ((MP + (SPECL [`a:num`; `2 EXP 32:num`] DIVISION) (ARITH_RULE `~(2 EXP 32 = 0)`)))) + ["eq";"lt"]) THEN + ABBREV_TAC `ahi = a DIV 2 EXP 32` THEN + ABBREV_TAC `alo = a MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + REWRITE_TAC[ARITH_RULE + `(ahi * 2 EXP 32 + alo) + 2 EXP 32 * (b MOD 2 EXP 32 + c MOD 2 EXP 32) = + (ahi + b MOD 2 EXP 32 + c MOD 2 EXP 32) * 2 EXP 32 + alo`] THEN + REWRITE_TAC[ARITH_RULE + `(ahi * 2 EXP 32 + alo) + 2 EXP 32 * (b + c) = + (ahi + b + c) * 2 EXP 32 + alo`] THEN + IMP_REWRITE_TAC[DIV_UNIQ] THEN (* (A * 2^32 + B) / 2^32 => A *) + EXISTS_TAC `(ahi + b MOD 2 EXP 32 + c MOD 2 EXP 32)` THEN SIMP_TAC[] THEN + EXISTS_TAC `(ahi + b + c)` THEN SIMP_TAC[] THEN + CONV_TAC MOD_DOWN_CONV THEN SIMP_TAC[]);; + +let WORD_MUL_64_DECOMPOSED_32 = prove(`!(x:(64)word) (y:(64)word).
+ word_add + (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword y (0,32):(32)word):(64)word)) + (word_shl + (word_add + (word_zx (word_mul (word_subword y (32,32):(32)word) (word_subword x (0,32):(32)word))) + (word_zx (word_mul (word_subword y (0,32):(32)word) (word_subword x (32,32):(32)word)))) + 32) = + word_mul x y`, + REPEAT GEN_TAC THEN + (* word to num: step 1. x = y to val x = val y *) + REWRITE_TAC[GSYM VAL_EQ] THEN + (* step 2. remove all word_* *) + REWRITE_TAC [VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD_ZX_GEN; VAL_WORD_SUBWORD; + VAL_WORD; VAL_WORD_SHL] THEN + (* step 3. add x, y < 2^64 *) + ASSUME_TAC (ISPECL [`x:(64)word`] VAL_BOUND) THEN + ASSUME_TAC (ISPECL [`y:(64)word`] VAL_BOUND) THEN + RULE_ASSUM_TAC (REWRITE_RULE [DIMINDEX_64]) THEN + (* step 4. eliminate dimindex (:N) and simplify *) + REWRITE_TAC[DIMINDEX_32;DIMINDEX_64;DIMINDEX_128;DIV_1;MOD_MOD_REFL; + MOD_MOD_EXP_MIN;ARITH_RULE `2 EXP 0 = 1`; DIV_1] THEN + CONV_TAC(DEPTH_CONV NUM_MIN_CONV) THEN + CONV_TAC MOD_DOWN_CONV THEN + (* split val x into [xhi, xlo] (32-bit halves), and val y into [yhi, ylo] *) + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Hx" ^ suffix) thm) + (zip (CONJUNCTS ((MP (SPECL [`(val (x:(64)word)):num`; `2 EXP 32:num`] DIVISION) + (ARITH_RULE `~(2 EXP 32 = 0)`)))) ["eq";"lt"]) THEN + ABBREV_TAC `xhi = (val (x:(64)word)) DIV 2 EXP 32` THEN + ABBREV_TAC `xlo = (val (x:(64)word)) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Hy" ^ suffix) thm) + (zip (CONJUNCTS ((MP (SPECL [`(val (y:(64)word)):num`; `2 EXP 32:num`] DIVISION) + (ARITH_RULE `~(2 EXP 32 = 0)`)))) ["eq";"lt"]) THEN + ABBREV_TAC `yhi = (val (y:(64)word)) DIV 2 EXP 32` THEN + ABBREV_TAC `ylo = (val (y:(64)word)) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + (* lhs *) + REWRITE_TAC[LEFT_ADD_DISTRIB; RIGHT_ADD_DISTRIB] THEN + REWRITE_TAC[ + ARITH_RULE `y1hi * x1hi * 2 EXP 32 = 2 EXP 32 * y1hi * x1hi`; + ARITH_RULE `(y1hi * 2 EXP 32) * x1hi = 2 EXP 32 * y1hi * x1hi`] 
THEN + REWRITE_TAC[MOD_MULT_ADD] THEN + (* rhs *) + REWRITE_TAC[MULT_ASSOC; ARITH_RULE `2 EXP 32 * 2 EXP 32 = 2 EXP 64`] THEN + REWRITE_TAC[GSYM ADD_ASSOC; GSYM MULT_ASSOC] THEN + REWRITE_TAC[MOD_MULT_ADD] THEN + (* lhs = rhs *) + REWRITE_TAC[ARITH_RULE `2 EXP 64 = 2 EXP 32 * 2 EXP 32`] THEN + REWRITE_TAC[MOD_MULT_MOD] THEN + REWRITE_TAC[ARITH_RULE `2 EXP 32 * p + 2 EXP 32 * q = 2 EXP 32 * (p + q)`; MOD_MULT_ADD] THEN + REWRITE_TAC [lemma4] THEN + REWRITE_TAC [ARITH_RULE + `(xlo * ylo + 2 EXP 32 * (yhi * xlo + ylo * xhi)) DIV 2 EXP 32 = + (2 EXP 32 * xhi * ylo + 2 EXP 32 * xlo * yhi + xlo * ylo) DIV 2 EXP 32`]);; + +let simplify_128bit_words = + RULE_ASSUM_TAC (REWRITE_RULE [ + WORD_128_SUBWORD_SUBWORD_32; WORD_SUBWORD_JOIN_64; + WORD_SUBWORD_JOIN_128_64; WORD_SUBWORD_JOIN_128_32; + WORD_MUL_64_DECOMPOSED_32]);; + +let simplify_128bit_words_and_preproc_accumulate = + simplify_128bit_words THEN + (* Rewrite word_mul x y into the pattern that ACCUMULATE_ARITH_TAC can recognize. *) + RULE_ASSUM_TAC (REWRITE_RULE [WORD_RULE + `word_mul (a:(64)word) (b:(64)word) = + word (0 + val (a:(64)word) * val (b:(64)word))`]);; + +let WORD_ADD_ASSOC_CONSTS = prove( + `!(x:(N)word) n m. + (word_add (word_add x (word n)) (word m)) = (word_add x (word (n+m)))`, + CONV_TAC WORD_RULE);; + +let BYTES128_EQ_JOIN64_TAC lhs128 hi64 lo64 = + let hivar = mk_var (hi64, `:(64)word`) in + let lovar = mk_var (lo64, `:(64)word`) in + let hilo = mk_comb (mk_comb + (`word_join:(64)word->(64)word->(128)word`,hivar),lovar) in + SUBGOAL_THEN (mk_eq (lhs128, hilo)) ASSUME_TAC THENL [ + MAP_EVERY EXPAND_TAC [hi64; lo64] THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC; + ALL_TAC + ];; + +(* ------------------------------------------------------------------------- *) +(* Proof. 
*) +(* ------------------------------------------------------------------------- *) +(* ADK_48_TAC: shared closing tactic for the ADK product subgoals in the proof below. It applies EQUAL_FROM_CONGRUENT_REAL with witnesses `512` and `&0`, discharges two side goals with BOUNDER_TAC and one with REAL_INTEGER_TAC, then rewrites with the accumulator assumptions (after lemma1/lemma2 and case splits) via DESUM_RULE and DECARRY_RULE. NOTE(review): presumably a congruence modulo 2^512 - confirm against EQUAL_FROM_CONGRUENT_REAL. *) +let ADK_48_TAC = + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`512`; `&0:real`] THEN + REPLICATE_TAC 2 (CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC]) THEN + CONJ_TAC THENL [REAL_INTEGER_TAC; ALL_TAC] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[lemma1; lemma2] THEN REWRITE_TAC[WORD_XOR_MASK] THEN + REPEAT(COND_CASES_TAC THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT]) THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES; DIMINDEX_64] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC;; + +(* Caller-save Q registers: + +The first eight registers, v0-v7, are used to pass argument values into a +subroutine and to return result values from a function. They may also be +used to hold intermediate values within a routine (but, in general, only +between subroutine calls). +Registers v8-v15 must be preserved by a callee across subroutine calls; the +remaining registers (v0-v7, v16-v31) do not need to be preserved (or should be +preserved by the caller). Additionally, only the bottom 64 bits of each value +stored in v8-v15 need to be preserved; it is the responsibility of the caller +to preserve larger values. *) +let BIGNUM_MUL_8_16_NEON_CORRECT = prove( + `!z x y a b pc. + ALL (nonoverlapping (z,8 * 16)) + [(word pc,2000); (x,8 * 8); (y,8 * 8)] + ==> ensures arm + (\s. 
aligned_bytes_loaded s (word pc) bignum_mul_8_16_neon_mc /\ + read PC s = word(pc + 0xc) /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,8) s = a /\ + bignum_from_memory (y,8) s = b) + (\s. read PC s = word (pc + 1984) /\ + bignum_from_memory (z,16) s = a * b) + (MAYCHANGE [PC; X1; X2; X3; X4; X5; X6; X7; X8; + X9; X10; X11; X12; X13; X14; X15; X16; + X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 16)] ,, + MAYCHANGE SOME_FLAGS)`, + MAP_EVERY X_GEN_TAC + [`z:int64`; `x:int64`; `y:int64`; `a:num`; `b:num`; `pc:num`] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN + REWRITE_TAC[ALL; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_DIGITIZE_TAC "x_" `bignum_from_memory (x,8) s0` THEN + BIGNUM_DIGITIZE_TAC "y_" `bignum_from_memory (y,8) s0` THEN + (* Split 128-bit reads to word_join of 64-bit low and highs *) + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 x) s0` "x_1" "x_0" THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 16))) s0` + "x_3" "x_2" THEN + + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 y) s0` "y_1" "y_0" THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add y (word 16))) s0` + "y_3" "y_2" THEN + + (*** First ADK block multiplying the lower halves ***) + + (* Run the vectorized parts first *) + ARM_GEN_ACCSTEPS_TAC + (fun _ -> simplify_128bit_words_and_preproc_accumulate) + BIGNUM_MUL_8_16_NEON_EXEC [16;17;25;26] (1--26) THEN + + (* Second ADK block multiplying the upper halves with q1 added: + vector loads hoisted *) + + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 32))) s26` "x_5" "x_4" THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 48))) s26` "x_7" "x_6" THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add y (word 32))) s26` "y_5" "y_4" THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 
(word_add y (word 48))) s26` "y_7" "y_6" THEN + + (* 31--37: First ADK block: Run the remaining scalar parts (1) *) + (* 38--44: Second ADK block: multiply using vector instructions, but not move the + results to scalar registers *) + (* First ADK block: Run the remaining scalar parts *) + ARM_GEN_ACCSTEPS_TAC + (fun _ -> simplify_128bit_words_and_preproc_accumulate) + BIGNUM_MUL_8_16_NEON_EXEC + [32;34;36] (27--44) THEN + simplify_128bit_words THEN + ARM_ACCSTEPS_TAC BIGNUM_MUL_8_16_NEON_EXEC + [45;46;47;48;49;50;51;52;53;54;55;56;62;67;69;70;76;81;83;84;85;86;87;88;94; + 99;101;102;103;109;114;116;117;118;119;120;126;131;133;134;135;136;142;147; + 149;150;151;152] (45--152) THEN + + MAP_EVERY ABBREV_TAC + [`q0 = bignum_of_wordlist[mullo_s16;sum_s81;sum_s114;sum_s147]`; + `q1 = bignum_of_wordlist[sum_s149;sum_s150;sum_s151;sum_s152]`] THEN + SUBGOAL_THEN + `2 EXP 256 * q1 + q0 = + bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [y_0;y_1;y_2;y_3]` + ASSUME_TAC THENL + [MAP_EVERY EXPAND_TAC ["q0"; "q1"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Second ADK block multiplying the upper halves with q1 added ***) + + ARM_GEN_ACCSTEPS_TAC + (fun _ -> simplify_128bit_words_and_preproc_accumulate) + BIGNUM_MUL_8_16_NEON_EXEC [161;162;170;171] (153--171) THEN + ARM_ACCSTEPS_TAC BIGNUM_MUL_8_16_NEON_EXEC + [173;175;177;179;180;181;182;183;184;185;186;187;188;189;190;192;193;195; + 196;197;198;199;200;206;211;213;214;220;225;227;228;229;230;231;232;238; + 243;245;246;247;253;258;260;261;262;263;264;270;275;277;278;279;280;286; + 291;293;294;295;296] + (172--296) THEN + + MAP_EVERY ABBREV_TAC + [`q2 = bignum_of_wordlist[sum_s192; sum_s225; sum_s258; sum_s291]`; + `q3 = bignum_of_wordlist[sum_s293; sum_s294; sum_s295; sum_s296]`] THEN + SUBGOAL_THEN + `2 EXP 256 * q3 + q2 = + bignum_of_wordlist [x_4;x_5;x_6;x_7] * + 
bignum_of_wordlist [y_4;y_5;y_6;y_7] + q1` + ASSUME_TAC THENL + [MAP_EVERY EXPAND_TAC ["q1"; "q2"; "q3"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** The sign-magnitude difference computation ***) + + ARM_ACCSTEPS_TAC BIGNUM_MUL_8_16_NEON_EXEC + [298;299;301;302;306;307;309;310;314;316;318;320;323;325;327;329] + (297--330) THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_UNMASK_64]) THEN + + MAP_EVERY ABBREV_TAC + [`sgn <=> ~(carry_s310 <=> carry_s302)`; + `xd = bignum_of_wordlist[sum_s314;sum_s316;sum_s318;sum_s320]`; + `yd = bignum_of_wordlist[sum_s323;sum_s325;sum_s327;sum_s329]`] THEN + + SUBGOAL_THEN + `(&(bignum_of_wordlist[x_4;x_5;x_6;x_7]) - + &(bignum_of_wordlist[x_0;x_1;x_2;x_3])) * + (&(bignum_of_wordlist[y_0;y_1;y_2;y_3]) - + &(bignum_of_wordlist[y_4;y_5;y_6;y_7])):real = + --(&1) pow bitval sgn * &xd * &yd` + ASSUME_TAC THENL + [TRANS_TAC EQ_TRANS + `(--(&1) pow bitval carry_s302 * &xd) * + (--(&1) pow bitval carry_s310 * &yd):real` THEN + CONJ_TAC THENL + [ALL_TAC; + EXPAND_TAC "sgn" THEN REWRITE_TAC[BITVAL_NOT; BITVAL_IFF] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN REWRITE_TAC[bitval] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[]) THEN + CONV_TAC NUM_REDUCE_CONV THEN REAL_ARITH_TAC] THEN + SUBGOAL_THEN + `(carry_s302 <=> + bignum_of_wordlist[x_4;x_5;x_6;x_7] < + bignum_of_wordlist[x_0;x_1;x_2;x_3]) /\ + (carry_s310 <=> + bignum_of_wordlist[y_0;y_1;y_2;y_3] < + bignum_of_wordlist[y_4;y_5;y_6;y_7])` + (CONJUNCTS_THEN SUBST_ALL_TAC) + THENL + [CONJ_TAC THEN MATCH_MP_TAC FLAG_FROM_CARRY_LT THEN EXISTS_TAC `256` THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN + REWRITE_TAC[REAL_BITVAL_NOT; REAL_VAL_WORD_MASK; DIMINDEX_64] THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; + ALL_TAC] THEN + BINOP_TAC THEN 
REWRITE_TAC[bitval] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[real_pow; REAL_MUL_LID] THEN + REWRITE_TAC[REAL_ARITH `x - y:real = --(&1) pow 1 * z <=> y - x = z`] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`256`; `&0:real`] THEN + (CONJ_TAC THENL + [MATCH_MP_TAC(REAL_ARITH + `y:real <= x /\ (&0 <= x /\ x < e) /\ (&0 <= y /\ y < e) + ==> &0 <= x - y /\ x - y < e`) THEN + ASM_SIMP_TAC[REAL_OF_NUM_CLAUSES; LT_IMP_LE; + ARITH_RULE `~(a:num < b) ==> b <= a`] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONJ_TAC THEN BOUNDER_TAC[]; + ALL_TAC] THEN + MAP_EVERY EXPAND_TAC ["xd"; "yd"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; REWRITE_TAC[INTEGER_CLOSED]]) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + ASM_REWRITE_TAC[WORD_XOR_MASK] THEN + REWRITE_TAC[REAL_VAL_WORD_NOT; BITVAL_CLAUSES; DIMINDEX_64] THEN + DISCH_THEN(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN + + (*** Third ADK block multiplying the absolute differences ***) + + ARM_ACCSTEPS_TAC BIGNUM_MUL_8_16_NEON_EXEC + [332;333;334;335;337;339;341;343;344;345;346;347;348;349;350;351;352;353;354;360;365;367;368;374;379;381;382;383;384;385;386;392;397;399;400;401;407;412;414;415;416;417;418;424;429;431;432;433;434;440;445;447;448;449;450] + (331--450) THEN + + SUBGOAL_THEN + `&xd * &yd:real = + &(bignum_of_wordlist + [mullo_s332; sum_s379; sum_s412; sum_s445; + sum_s447; sum_s448; sum_s449; sum_s450])` + SUBST_ALL_TAC THENL + [MAP_EVERY EXPAND_TAC ["xd"; "yd"] THEN + REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Clean up the overall sign ***) + + FIRST_X_ASSUM(MP_TAC o GEN_REWRITE_RULE RAND_CONV [WORD_XOR_MASKS]) THEN + 
ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + + (*** Final accumulation simulation and 16-digit focusing ***) + + ARM_ACCSTEPS_TAC BIGNUM_MUL_8_16_NEON_EXEC + [453;454;457;458;460;461;463;464;465;466;469;471;472;473;475;477;479;481;483;484;485;486;487] + (451--493) THEN + + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + DISCARD_STATE_TAC "s493" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`1024`; `&0:real`] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC] THEN CONJ_TAC THENL + [MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + + (*** The core rearrangement we are using ***) + + SUBGOAL_THEN + `&a * &b:real = + (&1 + &2 pow 256) * (&q0 + &2 pow 256 * &q2 + &2 pow 512 * &q3) + + &2 pow 256 * + (&(bignum_of_wordlist [x_4; x_5; x_6; x_7]) - + &(bignum_of_wordlist [x_0; x_1; x_2; x_3])) * + (&(bignum_of_wordlist [y_0; y_1; y_2; y_3]) - + &(bignum_of_wordlist [y_4; y_5; y_6; y_7]))` + SUBST1_TAC THENL + [MAP_EVERY UNDISCH_TAC + [`2 EXP 256 * q1 + q0 = + bignum_of_wordlist[x_0; x_1; x_2; x_3] * + bignum_of_wordlist[y_0; y_1; y_2; y_3]`; + `2 EXP 256 * q3 + q2 = + bignum_of_wordlist[x_4; x_5; x_6; x_7] * + bignum_of_wordlist[y_4; y_5; y_6; y_7] + + q1`] THEN + MAP_EVERY EXPAND_TAC ["a"; "b"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + CONV_TAC REAL_RING; + ASM_REWRITE_TAC[]] THEN + + MAP_EVERY EXPAND_TAC ["q0"; "q2"; "q3"] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[WORD_XOR_MASK] THEN COND_CASES_TAC THEN + ASM_REWRITE_TAC[REAL_VAL_WORD_NOT; BITVAL_CLAUSES; DIMINDEX_64] THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES] THEN DISCH_TAC THEN + + 
(*** A bit of manual logic for the carry connections in negative case ***) + + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THENL + [SUBGOAL_THEN + `&(bitval carry_s465):real = &(bitval carry_s466)` + SUBST1_TAC THENL [ALL_TAC; REAL_INTEGER_TAC] THEN + POP_ASSUM MP_TAC THEN BOOL_CASES_TAC `carry_s465:bool` THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN + REWRITE_TAC[REAL_RAT_REDUCE_CONV `(&2 pow 64 - &1) * &1 + &0`] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC; + ALL_TAC] THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o + filter (is_ratconst o rand o concl) o DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; + +let BIGNUM_MUL_8_16_NEON_SUBROUTINE_CORRECT = prove + (`!z x y a b pc stackpointer returnaddress. + aligned 16 stackpointer /\ + nonoverlapping (z,8 * 16) (word_sub stackpointer (word 48),48) /\ + ALLPAIRS nonoverlapping + [(z,8 * 16); (word_sub stackpointer (word 48),48)] + [(word pc,2000); (x,8 * 8); (y,8 * 8)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_mul_8_16_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,8) s = a /\ + bignum_from_memory (y,8) s = b) + (\s. 
read PC s = returnaddress /\ + bignum_from_memory (z,16) s = a * b) + (MAYCHANGE [PC; X1; X2; X3; X4; X5; X6; X7; X8; + X9; X10; X11; X12; X13; X14; X15; X16; X17] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5],, + MAYCHANGE [memory :> bytes(z,8 * 16); + memory :> bytes(word_sub stackpointer (word 48),48)] ,, + MAYCHANGE SOME_FLAGS)`, + ARM_ADD_RETURN_STACK_TAC + BIGNUM_MUL_8_16_NEON_EXEC BIGNUM_MUL_8_16_NEON_CORRECT + `[X19;X20;X21;X22;X23;X24]` 48);; diff --git a/arm/proofs/bignum_sqr_8_16_neon.ml b/arm/proofs/bignum_sqr_8_16_neon.ml new file mode 100644 index 00000000..8c681278 --- /dev/null +++ b/arm/proofs/bignum_sqr_8_16_neon.ml @@ -0,0 +1,619 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC + *) + +(* ========================================================================= *) +(* 8x8 -> 16 squaring, using Karatsuba reduction and nested ADK. *) +(* ========================================================================= *) + +(**** print_literal_from_elf "arm/fastmul/bignum_sqr_8_16_neon.o";; + ****) + +let bignum_sqr_8_16_neon_mc = define_assert_from_elf "bignum_sqr_8_16_neon_mc" + "arm/fastmul/bignum_sqr_8_16_neon.o" +[ + 0xa9bf53f3; (* arm_STP X19 X20 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9bf5bf5; (* arm_STP X21 X22 SP (Preimmediate_Offset (iword (-- &16))) *) + 0xa9400c22; (* arm_LDP X2 X3 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc00034; (* arm_LDR Q20 X1 (Immediate_Offset (word 0)) *) + 0xa9411424; (* arm_LDP X4 X5 X1 (Immediate_Offset (iword (&16))) *) + 0x3dc00435; (* arm_LDR Q21 X1 (Immediate_Offset (word 16)) *) + 0xa9421c26; (* arm_LDP X6 X7 X1 (Immediate_Offset (iword (&32))) *) + 0x3dc00836; (* arm_LDR Q22 X1 (Immediate_Offset (word 32)) *) + 0xa9432428; (* arm_LDP X8 X9 X1 (Immediate_Offset (iword (&48))) *) + 0x3dc00c37; (* arm_LDR Q23 X1 (Immediate_Offset (word 48)) *) + 0x6f00e5fe; (* arm_MOVI Q30 (word 4294967295) *) + 0x9b047c51; (* arm_MUL X17 X2 X4 *) + 
0x9b057c6e; (* arm_MUL X14 X3 X5 *) + 0x6e144281; (* arm_EXT Q1 Q20 Q20 64 *) + 0x9bc47c54; (* arm_UMULH X20 X2 X4 *) + 0x0f208682; (* arm_SHRN Q2 Q20 32 32 *) + 0xeb030055; (* arm_SUBS X21 X2 X3 *) + 0x0e813a80; (* arm_ZIP1 Q0 Q20 Q1 32 64 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x2ea2c045; (* arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0xda9f23eb; (* arm_CSETM X11 Condition_CC *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xeb0400ac; (* arm_SUBS X12 X5 X4 *) + 0x2ea0c003; (* arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0xda8c258c; (* arm_CNEG X12 X12 Condition_CC *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9b0c7ead; (* arm_MUL X13 X21 X12 *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0x9bcc7eac; (* arm_UMULH X12 X21 X12 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xda8b216b; (* arm_CINV X11 X11 Condition_CC *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xca0b01ad; (* arm_EOR X13 X13 X11 *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0xca0b018c; (* arm_EOR X12 X12 X11 *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xab140233; (* arm_ADDS X19 X17 X20 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0x9a1f0294; (* arm_ADC X20 X20 XZR *) + 0x6e1542a1; (* arm_EXT Q1 Q21 Q21 64 *) + 0x9bc57c75; (* arm_UMULH X21 X3 X5 *) + 0x0f2086a2; (* arm_SHRN Q2 Q21 32 32 *) + 0xab0e0273; (* arm_ADDS X19 X19 X14 *) + 0x0e813aa0; (* arm_ZIP1 Q0 Q21 Q1 32 64 *) + 0xba150294; (* arm_ADCS X20 X20 X21 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xab0e0294; (* arm_ADDS X20 X20 X14 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xb100057f; (* arm_CMN X11 (rvalue (word 1)) *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0c0294; (* arm_ADCS X20 X20 X12 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0x9a0b02b5; (* arm_ADC X21 X21 X11 *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0xab110231; (* arm_ADDS X17 X17 X17 *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xba130273; (* arm_ADCS X19 X19 X19 *) + 0x2ea2c045; (* 
arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0xba140294; (* arm_ADCS X20 X20 X20 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xba1502b5; (* arm_ADCS X21 X21 X21 *) + 0x2ea0c003; (* arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0x9a1f03ea; (* arm_ADC X10 XZR XZR *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9b037c4f; (* arm_MUL X15 X2 X3 *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0x9bc37c50; (* arm_UMULH X16 X2 X3 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa9002c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&0))) *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xab0d0231; (* arm_ADDS X17 X17 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0xba1f0294; (* arm_ADCS X20 X20 XZR *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0xba1f02b5; (* arm_ADCS X21 X21 XZR *) + 0x6e1642c1; (* arm_EXT Q1 Q22 Q22 64 *) + 0x9a1f014a; (* arm_ADC X10 X10 XZR *) + 0x0f2086c2; (* arm_SHRN Q2 Q22 32 32 *) + 0xa9014c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&16))) *) + 0x0e813ac0; (* arm_ZIP1 Q0 Q22 Q1 32 64 *) + 0x9b057c8f; (* arm_MUL X15 X4 X5 *) + 0x2ea2c045; (* arm_UMULL_VEC Q5 Q2 Q2 32 *) + 0x9bc57c90; (* arm_UMULH X16 X4 X5 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x2ea0c003; (* arm_UMULL_VEC Q3 Q0 Q0 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f601461; (* arm_USRA_VEC Q1 Q3 32 64 128 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 
0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6f601485; (* arm_USRA_VEC Q5 Q4 32 64 128 *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0x6f605483; (* arm_SLI_VEC Q3 Q4 32 64 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0x6f601425; (* arm_USRA_VEC Q5 Q1 32 64 128 *) + 0xa9022c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&32))) *) + 0x6e1742e1; (* arm_EXT Q1 Q23 Q23 64 *) + 0xba0a01ad; (* arm_ADCS X13 X13 X10 *) + 0x0f2086e2; (* arm_SHRN Q2 Q23 32 32 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x0e813ae0; (* arm_ZIP1 Q0 Q23 Q1 32 64 *) + 0xa903380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&48))) *) + 0x9b087cd1; (* arm_MUL X17 X6 X8 *) + 0x2ea2c050; (* arm_UMULL_VEC Q16 Q2 Q2 32 *) + 0x9b097cee; (* arm_MUL X14 X7 X9 *) + 0x2ea0c046; (* arm_UMULL_VEC Q6 Q2 Q0 32 *) + 0x9bc87cd4; (* arm_UMULH X20 X6 X8 *) + 0x2ea0c012; (* arm_UMULL_VEC Q18 Q0 Q0 32 *) + 0xeb0700d5; (* arm_SUBS X21 X6 X7 *) + 0xda9526b5; (* arm_CNEG X21 X21 Condition_CC *) + 0x4ea61cc1; (* arm_MOV_VEC Q1 Q6 128 *) + 0xda9f23eb; (* arm_CSETM X11 Condition_CC *) + 0xeb08012c; (* arm_SUBS X12 X9 X8 *) + 0xda8c258c; (* arm_CNEG X12 X12 Condition_CC *) + 0x6f601641; (* arm_USRA_VEC Q1 Q18 32 64 128 *) + 0x9b0c7ead; (* arm_MUL X13 X21 X12 *) + 0x4e3e1c24; (* arm_AND_VEC Q4 Q1 Q30 128 *) + 0x9bcc7eac; (* arm_UMULH X12 X21 X12 *) + 0x4ee68484; (* arm_ADD_VEC Q4 Q4 Q6 64 128 *) + 0xda8b216b; (* arm_CINV X11 X11 Condition_CC *) + 0xca0b01ad; (* arm_EOR X13 X13 X11 *) + 0xca0b018c; (* arm_EOR X12 X12 X11 *) + 0x6f601490; (* arm_USRA_VEC Q16 Q4 32 64 128 *) + 0xab140233; (* arm_ADDS X19 X17 X20 *) + 0x9a1f0294; (* arm_ADC X20 X20 XZR *) + 0x6f605492; (* arm_SLI_VEC Q18 Q4 32 64 *) + 0x9bc97cf5; (* arm_UMULH X21 X7 X9 *) + 0xab0e0273; (* arm_ADDS X19 X19 X14 *) + 0xba150294; (* arm_ADCS X20 X20 X21 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xab0e0294; (* arm_ADDS X20 
X20 X14 *) + 0x4e183cae; (* arm_UMOV X14 Q5 1 8 *) + 0x9a1f02b5; (* arm_ADC X21 X21 XZR *) + 0xb100057f; (* arm_CMN X11 (rvalue (word 1)) *) + 0xba0d0273; (* arm_ADCS X19 X19 X13 *) + 0x4e183c6d; (* arm_UMOV X13 Q3 1 8 *) + 0xba0c0294; (* arm_ADCS X20 X20 X12 *) + 0x4e083c6c; (* arm_UMOV X12 Q3 0 8 *) + 0x9a0b02b5; (* arm_ADC X21 X21 X11 *) + 0x4e083cab; (* arm_UMOV X11 Q5 0 8 *) + 0xab110231; (* arm_ADDS X17 X17 X17 *) + 0xba130273; (* arm_ADCS X19 X19 X19 *) + 0x6f601430; (* arm_USRA_VEC Q16 Q1 32 64 128 *) + 0xba140294; (* arm_ADCS X20 X20 X20 *) + 0xba1502b5; (* arm_ADCS X21 X21 X21 *) + 0x9a1f03ea; (* arm_ADC X10 XZR XZR *) + 0x4e975ab1; (* arm_UZP2 Q17 Q21 Q23 32 *) + 0x9b077ccf; (* arm_MUL X15 X6 X7 *) + 0x0ea12ae4; (* arm_XTN Q4 Q23 32 *) + 0x9bc77cd0; (* arm_UMULH X16 X6 X7 *) + 0x4e083e16; (* arm_UMOV X22 Q16 0 8 *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x0ea12aa5; (* arm_XTN Q5 Q21 32 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x4ea00aa1; (* arm_REV64_VEC Q1 Q21 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa9042c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&64))) *) + 0xab0d0231; (* arm_ADDS X17 X17 X13 *) + 0x4e183e4d; (* arm_UMOV X13 Q18 1 8 *) + 0xba0e0273; (* arm_ADCS X19 X19 X14 *) + 0x4e183e0e; (* arm_UMOV X14 Q16 1 8 *) + 0xba1f0294; (* arm_ADCS X20 X20 XZR *) + 0x4e083e4c; (* arm_UMOV X12 Q18 0 8 *) + 0xba1f02b5; (* arm_ADCS X21 X21 XZR *) + 0x9a1f014a; (* arm_ADC X10 X10 XZR *) + 0x2ea5c086; (* arm_UMULL_VEC Q6 Q4 Q5 32 *) + 0xa9054c11; (* arm_STP X17 X19 X0 (Immediate_Offset (iword (&80))) *) + 0x2eb1c087; (* arm_UMULL_VEC Q7 Q4 Q17 32 *) + 0x9b097d0f; (* arm_MUL X15 X8 X9 *) + 0x4e975af0; (* arm_UZP2 Q16 Q23 Q23 32 *) + 0x9bc97d10; (* arm_UMULH X16 X8 X9 *) + 0x4eb79c20; (* arm_MUL_VEC Q0 Q1 Q23 32 *) + 0xab0f02cb; (* arm_ADDS X11 X22 X15 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x6f6014c7; (* 
arm_USRA_VEC Q7 Q6 32 64 128 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xab0f016b; (* arm_ADDS X11 X11 X15 *) + 0x2eb1c201; (* arm_UMULL_VEC Q1 Q16 Q17 32 *) + 0xba1001ad; (* arm_ADCS X13 X13 X16 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0x6ea02800; (* arm_UADDLP Q0 Q0 32 *) + 0xab14018c; (* arm_ADDS X12 X12 X20 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0x4e3e1ce2; (* arm_AND_VEC Q2 Q7 Q30 128 *) + 0x2ea58202; (* arm_UMLAL_VEC Q2 Q16 Q5 32 *) + 0x4f605400; (* arm_SHL_VEC Q0 Q0 32 64 *) + 0x6f6014e1; (* arm_USRA_VEC Q1 Q7 32 64 128 *) + 0x2ea58080; (* arm_UMLAL_VEC Q0 Q4 Q5 32 *) + 0x4e183c10; (* arm_UMOV X16 Q0 1 8 *) + 0x4e083c0f; (* arm_UMOV X15 Q0 0 8 *) + 0x6f601441; (* arm_USRA_VEC Q1 Q2 32 64 128 *) + 0x4e083c34; (* arm_UMOV X20 Q1 0 8 *) + 0x4e183c35; (* arm_UMOV X21 Q1 1 8 *) + 0xa9062c0c; (* arm_STP X12 X11 X0 (Immediate_Offset (iword (&96))) *) + 0xba0a01ad; (* arm_ADCS X13 X13 X10 *) + 0x9a1f01ce; (* arm_ADC X14 X14 XZR *) + 0xa907380d; (* arm_STP X13 X14 X0 (Immediate_Offset (iword (&112))) *) + 0x9b067c4a; (* arm_MUL X10 X2 X6 *) + 0x9b077c6e; (* arm_MUL X14 X3 X7 *) + 0x9bc67c51; (* arm_UMULH X17 X2 X6 *) + 0xab1101ce; (* arm_ADDS X14 X14 X17 *) + 0x9bc77c71; (* arm_UMULH X17 X3 X7 *) + 0xba1101ef; (* arm_ADCS X15 X15 X17 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0x9a1f02b1; (* arm_ADC X17 X21 XZR *) + 0xab0a01cb; (* arm_ADDS X11 X14 X10 *) + 0xba0e01ee; (* arm_ADCS X14 X15 X14 *) + 0xba0f020f; (* arm_ADCS X15 X16 X15 *) + 0xba100230; (* arm_ADCS X16 X17 X16 *) + 0x9a1103f1; (* arm_ADC X17 XZR X17 *) + 0xab0a01cc; (* arm_ADDS X12 X14 X10 *) + 0xba0b01ed; (* arm_ADCS X13 X15 X11 *) + 0xba0e020e; (* arm_ADCS X14 X16 X14 *) + 0xba0f022f; (* arm_ADCS X15 X17 X15 *) + 0xba1003f0; (* arm_ADCS X16 XZR X16 *) + 0x9a1103f1; (* arm_ADC X17 XZR X17 *) + 0xeb050096; (* arm_SUBS X22 X4 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb080134; (* arm_SUBS X20 X9 X8 *) + 0xda942694; (* arm_CNEG 
X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ef; (* arm_ADCS X15 X15 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba140210; (* arm_ADCS X16 X16 X20 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb030056; (* arm_SUBS X22 X2 X3 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb0600f4; (* arm_SUBS X20 X7 X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba15016b; (* arm_ADCS X11 X11 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba14018c; (* arm_ADCS X12 X12 X20 *) + 0xba1301ad; (* arm_ADCS X13 X13 X19 *) + 0xba1301ce; (* arm_ADCS X14 X14 X19 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb050076; (* arm_SUBS X22 X3 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb070134; (* arm_SUBS X20 X9 X7 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ce; (* arm_ADCS X14 X14 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ef; (* arm_ADCS X15 X15 X20 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb040056; (* arm_SUBS X22 X2 X4 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb060114; (* arm_SUBS X20 X8 
X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba15018c; (* arm_ADCS X12 X12 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ad; (* arm_ADCS X13 X13 X20 *) + 0xba1301ce; (* arm_ADCS X14 X14 X19 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb050056; (* arm_SUBS X22 X2 X5 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb060134; (* arm_SUBS X20 X9 X6 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xeb040076; (* arm_SUBS X22 X3 X4 *) + 0xda9626d6; (* arm_CNEG X22 X22 Condition_CC *) + 0xda9f23f3; (* arm_CSETM X19 Condition_CC *) + 0xeb070114; (* arm_SUBS X20 X8 X7 *) + 0xda942694; (* arm_CNEG X20 X20 Condition_CC *) + 0x9b147ed5; (* arm_MUL X21 X22 X20 *) + 0x9bd47ed4; (* arm_UMULH X20 X22 X20 *) + 0xda932273; (* arm_CINV X19 X19 Condition_CC *) + 0xb100067f; (* arm_CMN X19 (rvalue (word 1)) *) + 0xca1302b5; (* arm_EOR X21 X21 X19 *) + 0xba1501ad; (* arm_ADCS X13 X13 X21 *) + 0xca130294; (* arm_EOR X20 X20 X19 *) + 0xba1401ce; (* arm_ADCS X14 X14 X20 *) + 0xba1301ef; (* arm_ADCS X15 X15 X19 *) + 0xba130210; (* arm_ADCS X16 X16 X19 *) + 0x9a130231; (* arm_ADC X17 X17 X19 *) + 0xab0a014a; (* arm_ADDS X10 X10 X10 *) + 0xba0b016b; 
(* arm_ADCS X11 X11 X11 *) + 0xba0c018c; (* arm_ADCS X12 X12 X12 *) + 0xba0d01ad; (* arm_ADCS X13 X13 X13 *) + 0xba0e01ce; (* arm_ADCS X14 X14 X14 *) + 0xba0f01ef; (* arm_ADCS X15 X15 X15 *) + 0xba100210; (* arm_ADCS X16 X16 X16 *) + 0xba110231; (* arm_ADCS X17 X17 X17 *) + 0x9a1f03f3; (* arm_ADC X19 XZR XZR *) + 0xa9420c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&32))) *) + 0xab02014a; (* arm_ADDS X10 X10 X2 *) + 0xba03016b; (* arm_ADCS X11 X11 X3 *) + 0xa9022c0a; (* arm_STP X10 X11 X0 (Immediate_Offset (iword (&32))) *) + 0xa9430c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&48))) *) + 0xba02018c; (* arm_ADCS X12 X12 X2 *) + 0xba0301ad; (* arm_ADCS X13 X13 X3 *) + 0xa903340c; (* arm_STP X12 X13 X0 (Immediate_Offset (iword (&48))) *) + 0xa9440c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&64))) *) + 0xba0201ce; (* arm_ADCS X14 X14 X2 *) + 0xba0301ef; (* arm_ADCS X15 X15 X3 *) + 0xa9043c0e; (* arm_STP X14 X15 X0 (Immediate_Offset (iword (&64))) *) + 0xa9450c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&80))) *) + 0xba020210; (* arm_ADCS X16 X16 X2 *) + 0xba030231; (* arm_ADCS X17 X17 X3 *) + 0xa9054410; (* arm_STP X16 X17 X0 (Immediate_Offset (iword (&80))) *) + 0xa9460c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&96))) *) + 0xba130042; (* arm_ADCS X2 X2 X19 *) + 0xba1f0063; (* arm_ADCS X3 X3 XZR *) + 0xa9060c02; (* arm_STP X2 X3 X0 (Immediate_Offset (iword (&96))) *) + 0xa9470c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&112))) *) + 0xba1f0042; (* arm_ADCS X2 X2 XZR *) + 0x9a1f0063; (* arm_ADC X3 X3 XZR *) + 0xa9070c02; (* arm_STP X2 X3 X0 (Immediate_Offset (iword (&112))) *) + 0xa8c15bf5; (* arm_LDP X21 X22 SP (Postimmediate_Offset (iword (&16))) *) + 0xa8c153f3; (* arm_LDP X19 X20 SP (Postimmediate_Offset (iword (&16))) *) + 0xd65f03c0 (* arm_RET X30 *) +];; + +let BIGNUM_SQR_8_16_NEON_EXEC = ARM_MK_EXEC_RULE bignum_sqr_8_16_neon_mc;; + +needs "arm/proofs/neon_helper.ml";; + +(* 
------------------------------------------------------------------------- *) +(* Lemmas to halve the number of case splits, useful for efficiency. *) +(* ------------------------------------------------------------------------- *) + +let lemma1 = prove (* lemma1: the nested conditional on y0 <= y1 and x1 <= x0, which yields an int64 that is either word 0 or the all-ones word 18446744073709551615 (complemented with word_not in the outer else branch), equals the single expression word_neg(word(bitval(y0 <= y1 <=> x0 < x1))). *) + (`!(x0:num) x1 (y0:num) y1. + (if y0 <= y1 + then if x1 <= x0 then word 0 else word 18446744073709551615 + else word_not + (if x1 <= x0 then word 0 else word 18446744073709551615)):int64 = + word_neg(word(bitval(y0 <= y1 <=> x0 < x1)))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + CONV_TAC WORD_REDUCE_CONV);; + +let lemma2 = prove (* lemma2: over the reals, the product of the two conditionally negated word differences (x0 - x1, negated when val x0 < val x1, and y1 - y0, negated when val y1 < val y0) equals (val x0 - val x1) * (val y1 - val y0) multiplied by -1 raised to bitval(val y0 <= val y1 <=> val x0 < val x1), the val-level analogue of the condition in lemma1. *) + (`!(x0:int64) (x1:int64) (y0:int64) (y1:int64). + &(val(if val x1 <= val x0 then word_sub x0 x1 + else word_neg (word_sub x0 x1))) * + &(val(if val y0 <= val y1 then word_sub y1 y0 + else word_neg (word_sub y1 y0))):real = + --(&1) pow bitval(val y0 <= val y1 <=> val x0 < val x1) * + (&(val x0) - &(val x1)) * (&(val y1) - &(val y0))`, + REPEAT GEN_TAC THEN REWRITE_TAC[GSYM NOT_LE; WORD_NEG_SUB] THEN + REPEAT(COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES]) THEN + REPEAT(FIRST_X_ASSUM(ASSUME_TAC o MATCH_MP (ARITH_RULE + `~(m:num <= n) ==> n <= m /\ ~(m <= n)`))) THEN + ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN + REAL_ARITH_TAC);; + +(* ------------------------------------------------------------------------- *) +(* Correctness proof. 
*) +(* ------------------------------------------------------------------------- *) + +let ADK_48_TAC = (* Closing tactic shared by the three sub-product SUBGOAL_THEN steps of BIGNUM_SQR_8_16_NEON_CORRECT. It applies EQUAL_FROM_CONGRUENT_REAL with witnesses 512 and &0, discharges the first two side conditions with BOUNDER_TAC and the third with REAL_INTEGER_TAC, then moves the accumulator assumptions into the goal, simplifies them with lemma1, lemma2 and WORD_XOR_MASK plus case splits, and finishes by rewriting with the DESUM_RULE and DECARRY_RULE forms of those facts. *) + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`512`; `&0:real`] THEN + REPLICATE_TAC 2 (CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC]) THEN + CONJ_TAC THENL [REAL_INTEGER_TAC; ALL_TAC] THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ) THEN + POP_ASSUM_LIST(K ALL_TAC) THEN + REWRITE_TAC[lemma1; lemma2] THEN REWRITE_TAC[WORD_XOR_MASK] THEN + REPEAT(COND_CASES_TAC THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; REAL_VAL_WORD_NOT]) THEN + CONV_TAC WORD_REDUCE_CONV THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[BITVAL_CLAUSES; DIMINDEX_64] THEN + POP_ASSUM_LIST(K ALL_TAC) THEN DISCH_TAC THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o DESUM_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN + CONV_TAC(RAND_CONV REAL_POLY_CONV) THEN + FIRST_ASSUM(MP_TAC o end_itlist CONJ o filter (is_ratconst o rand o concl) o + DECARRY_RULE o CONJUNCTS) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC;; + + +(* Note that, unlike BIGNUM_SQR_8_16_CORRECT, BIGNUM_SQR_8_16_NEON_CORRECT + requires that z and x do not overlap. *) + +let BIGNUM_SQR_8_16_NEON_CORRECT = prove(`!z x a pc. + ALL (nonoverlapping (z,8 * 16)) + [(word pc,1476); (x,8 * 8)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_sqr_8_16_neon_mc /\ + read PC s = word(pc + 0x8) /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,8) s = a) + (\s. 
read PC s = word (pc + 1464) /\ + bignum_from_memory (z,16) s = a EXP 2) + (MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; + X13; X14; X15; X16; X17; X19; X20; X21; X22] ,, + MAYCHANGE [Q0; Q1; Q2; Q3; Q4; Q5; Q6; Q7; Q16; Q17; Q18; Q19; Q20; + Q21; Q22; Q23; Q30] ,, + MAYCHANGE [memory :> bytes(z,8 * 16)] ,, + MAYCHANGE SOME_FLAGS)`, + + MAP_EVERY X_GEN_TAC [`z:int64`; `x:int64`; `a:num`; `pc:num`] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; ALL; NONOVERLAPPING_CLAUSES] THEN + DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN + ENSURES_INIT_TAC "s0" THEN + BIGNUM_DIGITIZE_TAC "x_" `bignum_from_memory (x,8) s0` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 x) s0` `x_1:(64)word` `x_0:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 16:(64)word))) s0` + `x_3:(64)word` `x_2:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 32:(64)word))) s0` + `x_5:(64)word` `x_4:(64)word` THEN + BYTES128_EQ_JOIN64_TAC `read (memory :> bytes128 (word_add x (word 48:(64)word))) s0` + `x_7:(64)word` `x_6:(64)word` THEN + + (*** First nested mini-ADK 4x4 squaring block ***) + + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_SQR_8_16_NEON_EXEC + [49;51;53;55;79;81;83;85] + [WORD_SQR64_HI; WORD_SQR64_LO] + [10;11;25;35;37;41;43;44;45;46;48;49;50;52;53;54;56;58;60;62; + 64;68;70;72;74;76;77;80;81;82;84;85;86;88;92;96;98;100;102;104; + 106;108;110;114;116] + (1--118) THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_0;x_1;x_2;x_3] EXP 2 = + bignum_of_wordlist [mullo_s53; sum_s74; sum_s80; sum_s82; + sum_s108; sum_s110; sum_s114; sum_s116]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Second nested mini-ADK 4x4 squaring block ***) + + + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC 
BIGNUM_SQR_8_16_NEON_EXEC + [148;152;154;156;167;178;180;182] + [WORD_SQR64_HI; WORD_SQR64_LO] + [119;121;132;140;141;144;145;146;147;149;151;153;155;157;158;160;161; + 162;154;152;164;168;169;171;172;174;175;177;179;181;183;184;182;178; + 188;192;193;195;196;198;199;201;202] + (119--207) THEN + RULE_ASSUM_TAC (REWRITE_RULE + [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_LO]) THEN + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_SQR_8_16_NEON_EXEC + [208;209] [WORD_SQR64_HI; WORD_SQR64_LO] [208;209] + (208--210) THEN + RULE_ASSUM_TAC (REWRITE_RULE + [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_HI]) THEN + ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC BIGNUM_SQR_8_16_NEON_EXEC + [211;212] [WORD_SQR64_HI; WORD_SQR64_LO] [211;212;214;215] + (211--216) THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_4;x_5;x_6;x_7] EXP 2 = + bignum_of_wordlist [mullo_s154;sum_s172;sum_s177;sum_s179; + sum_s201;sum_s202;sum_s214;sum_s215]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + let is_acc_thm_for_next acc_thm = + List.exists (contains_str (string_of_term (concl acc_thm))) + ["208";"209";"211";"212"] in + let filter_acc_thms_for_next acc_thms = + List.filter is_acc_thm_for_next acc_thms in + let wpat = `word a = b` in + ACCUMULATOR_POP_ASSUM_LIST( + fun acc_thms -> + let acc_thms = filter_acc_thms_for_next acc_thms in + List.iter (fun t -> Printf.printf "assuming: %s\n" t) + (List.map string_of_thm acc_thms); + MAP_EVERY ASSUME_TAC acc_thms) THEN + DISCARD_ASSUMPTIONS_TAC + (fun th -> can (term_match [] wpat) (concl th) && + not (is_acc_thm_for_next th))] THEN + + (*** Nested ADK 4x4 multiplication block ***) + + ARM_ACCSTEPS_TAC BIGNUM_SQR_8_16_NEON_EXEC + [217;218;220;222;223;224;225;226;227;228;229;230;231;232;233; + 234;235;241;246;248;249;255;260;262;263;264;265;266;267;273;278; + 280;281;282;288;293;295;296;297;298;299;305;310;312;313;314;315; + 321;326;328;329;330;331] + (217--331) 
THEN + + SUBGOAL_THEN + `bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [x_4;x_5;x_6;x_7] = + bignum_of_wordlist + [mullo_s217; sum_s260; sum_s293; sum_s326; + sum_s328; sum_s329; sum_s330; sum_s331]` + ASSUME_TAC THENL + [REWRITE_TAC[bignum_of_wordlist; GSYM REAL_OF_NUM_CLAUSES] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ADK_48_TAC; + ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC) THEN + DISCARD_MATCHING_ASSUMPTIONS [`word a = b`]] THEN + + (*** Final accumulation simulation and 16-digit focusing ***) + + ARM_ACCSTEPS_TAC BIGNUM_SQR_8_16_NEON_EXEC (332--364) (332--364) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + CONV_TAC(LAND_CONV BIGNUM_EXPAND_CONV) THEN ASM_REWRITE_TAC[] THEN + DISCARD_STATE_TAC "s364" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN + MATCH_MP_TAC EQUAL_FROM_CONGRUENT_REAL THEN + MAP_EVERY EXISTS_TAC [`1024`; `&0:real`] THEN + CONJ_TAC THENL [BOUNDER_TAC[]; ALL_TAC] THEN CONJ_TAC THENL + [EXPAND_TAC "a" THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN BOUNDER_TAC[]; + REWRITE_TAC[INTEGER_CLOSED]] THEN + + (*** The core rearrangement we are using ***) + + SUBGOAL_THEN + `(&a:real) pow 2 = + &(bignum_of_wordlist [x_0;x_1;x_2;x_3] EXP 2) + + &2 pow 512 * &(bignum_of_wordlist [x_4;x_5;x_6;x_7] EXP 2) + + &2 pow 257 * &(bignum_of_wordlist [x_0;x_1;x_2;x_3] * + bignum_of_wordlist [x_4;x_5;x_6;x_7])` + SUBST1_TAC THENL + [EXPAND_TAC "a" THEN + REWRITE_TAC[bignum_of_wordlist; REAL_OF_NUM_CLAUSES] THEN ARITH_TAC; + ASM_REWRITE_TAC[]] THEN + REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES; bignum_of_wordlist] THEN + RULE_ASSUM_TAC(REWRITE_RULE[ADD_CLAUSES; VAL_WORD_BITVAL]) THEN + ACCUMULATOR_POP_ASSUM_LIST(MP_TAC o end_itlist CONJ o DESUM_RULE) THEN + DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; + +let BIGNUM_SQR_8_16_NEON_SUBROUTINE_CORRECT = prove + (`!z x a pc stackpointer returnaddress. 
+ aligned 16 stackpointer /\ + nonoverlapping (z,8 * 16) (word_sub stackpointer (word 32),32) /\ + ALLPAIRS nonoverlapping + [(z,8 * 16); (word_sub stackpointer (word 32),32)] + [(word pc, 1476); (x,8 * 8)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_sqr_8_16_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,8) s = a) + (\s. read PC s = returnaddress /\ + bignum_from_memory (z,16) s = a EXP 2) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(z,8 * 16); + memory :> bytes(word_sub stackpointer (word 32),32)])`, + ARM_ADD_RETURN_STACK_TAC + BIGNUM_SQR_8_16_NEON_EXEC BIGNUM_SQR_8_16_NEON_CORRECT + `[X19;X20;X21;X22]` 32);; diff --git a/arm/proofs/neon_helper.ml b/arm/proofs/neon_helper.ml new file mode 100644 index 00000000..c6e2f84a --- /dev/null +++ b/arm/proofs/neon_helper.ml @@ -0,0 +1,385 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC + *) + +(* ========================================================================= *) +(* Helper lemmas for verifying vectorized programs *) +(* ========================================================================= *) + +needs "common/misc.ml";; + +let SPLIT_WORD64_TO_HILO: tactic = (* Replaces x:(64)word in the goal by word_join of its upper and lower 32-bit subwords, abbreviates those halves as xh and xl, and adds the VAL_BOUND instance for each half (with dimindex rewritten by DIMINDEX_32) as an assumption. The variable name x is hard-coded in the quoted terms. *) + SUBST1_TAC (WORD_BLAST `(x:(64)word) = + word_join (word_subword x (32,32):(32)word) (word_subword x (0,32):(32)word)`) THEN + ABBREV_TAC `xh = word_subword (x:(64)word) (32,32):(32)word` THEN + ABBREV_TAC `xl = word_subword (x:(64)word) (0,32):(32)word` THEN + ASSUME_TAC (REWRITE_RULE [DIMINDEX_32] (ISPECL [`xh:(32)word`] VAL_BOUND)) THEN + ASSUME_TAC (REWRITE_RULE [DIMINDEX_32] (ISPECL [`xl:(32)word`] VAL_BOUND));; + +let WORD_SQR64_LO = prove(`! (x:(64)word). 
word_or + (word_shl + (word_add + (word_and (word 4294967295) + (word_add + (word_mul (word_ushr x 32) (word_zx (word_subword x (0,32):(32)word))) + (word_ushr + (word_mul (word_zx (word_subword x (0,32):(32)word)) + (word_zx (word_subword x (0,32):(32)word))) + 32))) + (word_mul (word_ushr x 32) (word_zx (word_subword x (0,32):(32)word)))) + 32) + (word_and + (word_mul (word_zx (word_subword x (0,32):(32)word)) + (word_zx (word_subword x (0,32):(32)word))) + (word 4294967295)) = word (0 + val x * val x)`, + REWRITE_TAC [WORD_RULE + `word (0 + val (a:(64)word) * val (b:(64)word)) = + word_mul (a:(64)word) (b:(64)word)`] THEN + REPEAT GEN_TAC THEN + SPLIT_WORD64_TO_HILO THEN + REWRITE_TAC[WORD_BITMANIP_SIMP_LEMMAS] THEN + REWRITE_TAC [GSYM VAL_EQ] THEN + let r = REWRITE_TAC [VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD_ZX_GEN; + VAL_WORD_SUBWORD; VAL_WORD; VAL_WORD_SHL; WORD_OF_BITS_32BITMASK; + VAL_WORD_AND_MASK; VAL_WORD_USHR; VAL_WORD_JOIN; WORD_OR_ADD_DISJ] in + (r THEN ONCE_REWRITE_TAC [WORD_RULE `word_and x y = word_and y x`] THEN r) + THEN + REWRITE_TAC[DIMINDEX_64; DIMINDEX_32; + ARITH_RULE `MIN 32 32 = 32 /\ MIN 32 64 = 32 /\ MIN 64 32 = 32`; + ARITH_RULE `2 EXP 0 = 1`; DIV_1; MOD_MOD_EXP_MIN] THEN + REWRITE_TAC[ARITH_RULE `2 EXP 64 = 2 EXP 32 * 2 EXP 32`] THEN + (* Remove redundant MODs *) + (* |- !m n. 
m < n ==> m MOD n = m *) + IMP_REWRITE_TAC [SPECL [`val (t:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [ARITH_RULE `x < 2 EXP 32 ==> x < 2 EXP 32 * 2 EXP 32`] THEN + IMP_REWRITE_TAC [SPECL + [`val (t1:(32)word) * val (t2:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [LT_MULT2] THEN + IMP_REWRITE_TAC [SPECL + [`2 EXP 32 * val (t1:(32)word) + val (t2:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [LT_MULT_ADD_MULT; ARITH_RULE `0 < 2 EXP 32`; LE_REFL] THEN + IMP_REWRITE_TAC [SPECL + [`x MOD 2 EXP 32 + val (t1:(32)word) * val (t2:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [LT_ADD_MULT_MULT; MOD_LT_EQ_LT; ARITH_RULE `0 < 2 EXP 32`; LE_LT] THEN + (* |- !m n p. m MOD (n * p) = n * (m DIV n) MOD p + m MOD n *) + REWRITE_TAC [MOD_MULT_MOD] THEN + (* |- !m n. (m * n) MOD m = 0 *) + REWRITE_TAC[MOD_MULT] THEN + IMP_REWRITE_TAC[DIV_MULT] THEN + REWRITE_TAC[ARITH_RULE `~(2 EXP 32 = 0)`; ADD_0] THEN + IMP_REWRITE_TAC[DIV_MULT_ADD; MOD_DIV_EQ_0; ARITH_RULE `~(2 EXP 32 = 0)`; ADD_0; MOD_MOD_REFL] THEN + REWRITE_TAC[MOD_MULT_ADD; MOD_MOD_REFL] THEN + (* Now rewrite RHS *) + REWRITE_TAC [ARITH_RULE `(x + y) * (z + w) = x * z + x * w + y * z + y * w`] THEN + REWRITE_TAC [ARITH_RULE `(2 EXP 32 * w) * z = 2 EXP 32 * (w * z)`] THEN + REWRITE_TAC [ARITH_RULE `val (k:(32)word) * (2 EXP 32 * z) = 2 EXP 32 * (val k * z)`] THEN + IMP_REWRITE_TAC [DIV_MULT_ADD; MOD_MULT_ADD; ARITH_RULE `~(2 EXP 32 = 0)`] THEN + REWRITE_TAC [ADD_MOD_MOD_REFL] THEN + AP_THM_TAC THEN AP_TERM_TAC THEN + AP_TERM_TAC THEN + AP_THM_TAC THEN AP_TERM_TAC THEN + ARITH_TAC);; + +let WORD_SQR64_HI = prove(`!(x:(64)word). 
word_add + (word_add (word_mul (word_ushr x 32) (word_ushr x 32)) + (word_ushr + (word_add + (word_and + (word 4294967295) + (word_add + (word_mul (word_ushr x 32) (word_zx (word_subword x (0,32):(32)word))) + (word_ushr + (word_mul (word_zx (word_subword x (0,32):(32)word)) + (word_zx (word_subword x (0,32):(32)word))) + 32))) + (word_mul (word_ushr x 32) (word_zx (word_subword x (0,32):(32)word)))) + 32)) + (word_ushr + (word_add + (word_mul (word_ushr x 32) (word_zx (word_subword x (0,32):(32)word))) + (word_ushr + (word_mul + (word_zx (word_subword x (0,32):(32)word)) + (word_zx (word_subword x (0,32):(32)word))) + 32)) + 32) = + word ((val x * val x) DIV 2 EXP 64)`, + GEN_TAC THEN + SPLIT_WORD64_TO_HILO THEN + REWRITE_TAC[WORD_BITMANIP_SIMP_LEMMAS] THEN + REWRITE_TAC [GSYM VAL_EQ] THEN + let r = REWRITE_TAC [VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD_ZX_GEN; VAL_WORD_SUBWORD; VAL_WORD; VAL_WORD_SHL; WORD_OF_BITS_32BITMASK; VAL_WORD_AND_MASK; VAL_WORD_USHR; VAL_WORD_JOIN; WORD_OR_ADD_DISJ] in (r THEN ONCE_REWRITE_TAC [WORD_RULE `word_and x y = word_and y x`] THEN r) THEN + REWRITE_TAC[DIMINDEX_64; DIMINDEX_32; ARITH_RULE `MIN 32 32 = 32 /\ MIN 32 64 = 32 /\ MIN 64 32 = 32`; ARITH_RULE `2 EXP 0 = 1`; DIV_1; MOD_MOD_EXP_MIN] THEN + REWRITE_TAC[ARITH_RULE `2 EXP 64 = 2 EXP 32 * 2 EXP 32`] THEN + IMP_REWRITE_TAC [SPECL [`val (t:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [ARITH_RULE `x < 2 EXP 32 ==> x < 2 EXP 32 * 2 EXP 32`] THEN + IMP_REWRITE_TAC [SPECL [`val (t1:(32)word) * val (t2:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [LT_MULT2] THEN + IMP_REWRITE_TAC [SPECL [`2 EXP 32 * val (t1:(32)word) + val (t2:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [LT_MULT_ADD_MULT; ARITH_RULE `0 < 2 EXP 32`; LE_REFL] THEN + IMP_REWRITE_TAC [SPECL [`x MOD 2 EXP 32 + val (t1:(32)word) * val (t2:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [LT_ADD_MULT_MULT; MOD_LT_EQ_LT; ARITH_RULE `0 < 2 EXP 32`; 
LE_LT] THEN + IMP_REWRITE_TAC [SPECL [`val (t1:(32)word) * val (t2:(32)word) + t DIV 2 EXP 32`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [LT_MULT_ADD_MULT; ARITH_RULE `0 < 2 EXP 32`] THEN + IMP_REWRITE_TAC[RDIV_LT_EQ; ARITH_RULE `~(2 EXP 32 = 0)`; LE_LT; LT_MULT2] THEN + IMP_REWRITE_TAC[LT_ADD_MULT_MULT; LE_LT; MOD_LT_EQ; ARITH_RULE `~(2 EXP 32 = 0)`; ARITH_RULE `0 < 2 EXP 32`] THEN + (* Remove the outermost MOD 2^32*2^32 *) + AP_THM_TAC THEN AP_TERM_TAC THEN + (* Rewrite RHS first *) + REWRITE_TAC [ARITH_RULE `(x + y) * (z + w) = x * z + x * w + y * z + y * w`] THEN + REWRITE_TAC [ARITH_RULE `(2 EXP 32 * w) * z = 2 EXP 32 * (w * z)`] THEN + REWRITE_TAC [ARITH_RULE `val (k:(32)word) * (2 EXP 32 * z) = 2 EXP 32 * (val k * z)`] THEN + REWRITE_TAC[GSYM DIV_DIV] THEN + IMP_REWRITE_TAC[DIV_MULT_ADD; ARITH_RULE `~(2 EXP 32 = 0)`] THEN + (* strip 'xh * xh + ...' *) + REWRITE_TAC[GSYM ADD_ASSOC] THEN AP_TERM_TAC THEN + IMP_REWRITE_TAC[ADD_DIV_MOD_SIMP_LEMMA; ARITH_RULE `~(2 EXP 32 = 0)`] THEN + AP_THM_TAC THEN AP_TERM_TAC THEN + ARITH_TAC);; + +let WORD_MUL_64_DECOMPOSED_LEMMA = prove(`!a b c. 
+ ((a + 2 EXP 32 * (b MOD 2 EXP 32 + c MOD 2 EXP 32)) DIV 2 EXP 32) MOD 2 EXP 32 = + ((a + 2 EXP 32 * (b + c)) DIV 2 EXP 32) MOD 2 EXP 32`, + REPEAT STRIP_TAC THEN + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Ha_" ^ suffix) thm) + (zip (CONJUNCTS ((MP + (SPECL [`a:num`; `2 EXP 32:num`] DIVISION) (ARITH_RULE `~(2 EXP 32 = 0)`)))) + ["eq";"lt"]) THEN + ABBREV_TAC `ahi = a DIV 2 EXP 32` THEN + ABBREV_TAC `alo = a MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + REWRITE_TAC[ARITH_RULE + `(ahi * 2 EXP 32 + alo) + 2 EXP 32 * (b MOD 2 EXP 32 + c MOD 2 EXP 32) = + (ahi + b MOD 2 EXP 32 + c MOD 2 EXP 32) * 2 EXP 32 + alo`] THEN + REWRITE_TAC[ARITH_RULE + `(ahi * 2 EXP 32 + alo) + 2 EXP 32 * (b + c) = + (ahi + b + c) * 2 EXP 32 + alo`] THEN + IMP_REWRITE_TAC[DIV_UNIQ] THEN (* (A * 2^32 + B) / 2^32 => A *) + EXISTS_TAC `(ahi + b MOD 2 EXP 32 + c MOD 2 EXP 32)` THEN SIMP_TAC[] THEN + EXISTS_TAC `(ahi + b + c)` THEN SIMP_TAC[] THEN + CONV_TAC MOD_DOWN_CONV THEN SIMP_TAC[]);; + +let WORD_MUL64_LO = prove(`!(x:(64)word) (y:(64)word). + word_add + (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword y (0,32):(32)word):(64)word)) + (word_shl + (word_add + (word_zx (word_mul (word_subword y (32,32):(32)word) (word_subword x (0,32):(32)word))) + (word_zx (word_mul (word_subword y (0,32):(32)word) (word_subword x (32,32):(32)word)))) + 32) = + word (0 + val x * val y)`, + REWRITE_TAC [WORD_RULE + `word (0 + val (a:(64)word) * val (b:(64)word)) = + word_mul (a:(64)word) (b:(64)word)`] THEN + REPEAT GEN_TAC THEN + (* word to num: step 1. x = y to val x = val y *) + REWRITE_TAC[GSYM VAL_EQ] THEN + (* step 2. remove all word_* *) + REWRITE_TAC [VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD_ZX_GEN; VAL_WORD_SUBWORD; + VAL_WORD; VAL_WORD_SHL] THEN + (* step 3. add x, y < 2^64 *) + ASSUME_TAC (ISPECL [`x:(64)word`] VAL_BOUND) THEN + ASSUME_TAC (ISPECL [`y:(64)word`] VAL_BOUND) THEN + RULE_ASSUM_TAC (REWRITE_RULE [DIMINDEX_64]) THEN + (* step 4. 
eliminate dimindex (:N) and simplify *) + REWRITE_TAC[DIMINDEX_32;DIMINDEX_64;DIMINDEX_128;DIV_1;MOD_MOD_REFL; + MOD_MOD_EXP_MIN;ARITH_RULE `2 EXP 0 = 1`; DIV_1] THEN + CONV_TAC(DEPTH_CONV NUM_MIN_CONV) THEN + CONV_TAC MOD_DOWN_CONV THEN + (* split x into [x0h, x0l], and divide y as well *) + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Hx" ^ suffix) thm) + (zip (CONJUNCTS ((MP (SPECL [`(val (x:(64)word)):num`; `2 EXP 32:num`] DIVISION) + (ARITH_RULE `~(2 EXP 32 = 0)`)))) ["eq";"lt"]) THEN + ABBREV_TAC `xhi = (val (x:(64)word)) DIV 2 EXP 32` THEN + ABBREV_TAC `xlo = (val (x:(64)word)) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + MAP_EVERY (fun (thm, suffix) -> LABEL_TAC ("Hy" ^ suffix) thm) + (zip (CONJUNCTS ((MP (SPECL [`(val (y:(64)word)):num`; `2 EXP 32:num`] DIVISION) + (ARITH_RULE `~(2 EXP 32 = 0)`)))) ["eq";"lt"]) THEN + ABBREV_TAC `yhi = (val (y:(64)word)) DIV 2 EXP 32` THEN + ABBREV_TAC `ylo = (val (y:(64)word)) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + (* lhs *) + REWRITE_TAC[LEFT_ADD_DISTRIB; RIGHT_ADD_DISTRIB] THEN + REWRITE_TAC[ + ARITH_RULE `y1hi * x1hi * 2 EXP 32 = 2 EXP 32 * y1hi * x1hi`; + ARITH_RULE `(y1hi * 2 EXP 32) * x1hi = 2 EXP 32 * y1hi * x1hi`] THEN + REWRITE_TAC[MOD_MULT_ADD] THEN + (* rhs *) + REWRITE_TAC[MULT_ASSOC; ARITH_RULE `2 EXP 32 * 2 EXP 32 = 2 EXP 64`] THEN + REWRITE_TAC[GSYM ADD_ASSOC; GSYM MULT_ASSOC] THEN + REWRITE_TAC[MOD_MULT_ADD] THEN + (* lhs = rhs *) + REWRITE_TAC[ARITH_RULE `2 EXP 64 = 2 EXP 32 * 2 EXP 32`] THEN + REWRITE_TAC[MOD_MULT_MOD] THEN + REWRITE_TAC[ARITH_RULE `2 EXP 32 * p + 2 EXP 32 * q = 2 EXP 32 * (p + q)`; MOD_MULT_ADD] THEN + REWRITE_TAC [WORD_MUL_64_DECOMPOSED_LEMMA] THEN + REWRITE_TAC [ARITH_RULE + `(xlo * ylo + 2 EXP 32 * (yhi * xlo + ylo * xhi)) DIV 2 EXP 32 = + (2 EXP 32 * xhi * ylo + 2 EXP 32 * xlo * yhi + xlo * ylo) DIV 2 EXP 32`]);; + +let WORD_MUL64_HI = prove(`!(x: (64)word) (y: (64)word). 
+ word_add + (word_add + (word_mul + (word_zx (word_subword x (32,32):(32)word):(64)word) + (word_zx (word_subword y (32,32):(32)word):(64)word)) + (word_ushr + (word_add + (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword y (32,32):(32)word):(64)word)) + (word_ushr + (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword y (0,32):(32)word):(64)word)) + 32)) + 32)) + (word_ushr + (word_add + (word_mul (word_zx (word_subword x (32,32):(32)word):(64)word) + (word_zx (word_subword y (0,32):(32)word):(64)word)) + (word_and (word 4294967295:(64)word) + (word_add + (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword y (32,32):(32)word):(64)word)) + (word_ushr + (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword y (0,32):(32)word):(64)word)) + 32)))) + 32) = + word ((val x * val y) DIV 2 EXP 64)`, + REPEAT GEN_TAC THEN + (SUBST1_TAC (WORD_BLAST `(x:(64)word) = + word_join (word_subword x (32,32):(32)word) (word_subword x (0,32):(32)word)`) THEN + ABBREV_TAC `xh = word_subword (x:(64)word) (32,32):(32)word` THEN + ABBREV_TAC `xl = word_subword (x:(64)word) (0,32):(32)word` THEN + ASSUME_TAC (REWRITE_RULE [DIMINDEX_32] (ISPECL [`xh:(32)word`] VAL_BOUND)) THEN + ASSUME_TAC (REWRITE_RULE [DIMINDEX_32] (ISPECL [`xl:(32)word`] VAL_BOUND))) THEN + (SUBST1_TAC (WORD_BLAST `(y:(64)word) = + word_join (word_subword y (32,32):(32)word) (word_subword y (0,32):(32)word)`) THEN + ABBREV_TAC `yh = word_subword (y:(64)word) (32,32):(32)word` THEN + ABBREV_TAC `yl = word_subword (y:(64)word) (0,32):(32)word` THEN + ASSUME_TAC (REWRITE_RULE [DIMINDEX_32] (ISPECL [`yh:(32)word`] VAL_BOUND)) THEN + ASSUME_TAC (REWRITE_RULE [DIMINDEX_32] (ISPECL [`yl:(32)word`] VAL_BOUND))) THEN + REWRITE_TAC[WORD_BITMANIP_SIMP_LEMMAS] THEN + REWRITE_TAC [GSYM VAL_EQ] THEN + + (let r = REWRITE_TAC [VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD_ZX_GEN; + VAL_WORD_SUBWORD; VAL_WORD; 
VAL_WORD_SHL; WORD_OF_BITS_32BITMASK; + VAL_WORD_AND_MASK; VAL_WORD_USHR; VAL_WORD_JOIN] in + (r THEN ONCE_REWRITE_TAC [WORD_RULE `word_and x y = word_and y x`] THEN r) + THEN + REWRITE_TAC[DIMINDEX_64; DIMINDEX_32; + ARITH_RULE `MIN 32 32 = 32 /\ MIN 32 64 = 32 /\ MIN 64 32 = 32`; + ARITH_RULE `2 EXP 0 = 1`; DIV_1; MOD_MOD_EXP_MIN] THEN + REWRITE_TAC[ARITH_RULE `2 EXP 64 = 2 EXP 32 * 2 EXP 32`]) THEN + + (* Remove redundant MODs in LHS *) + IMP_REWRITE_TAC [SPECL [`val (t:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT; + ARITH_RULE `x < 2 EXP 32 ==> x < 2 EXP 32 * 2 EXP 32`] THEN + IMP_REWRITE_TAC [SPECL + [`val (t1:(32)word) * val (t2:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT; + LT_MULT2] THEN + + IMP_REWRITE_TAC [ + SPECL [`val (t1:(32)word) * val (t2:(32)word) + k`; `2 EXP 32 * 2 EXP 32`] MOD_LT; + LT_MULT_ADD_MULT; ARITH_RULE `0 < 2 EXP 32`; LE_LT; EXP_2_MOD_LT; RDIV_LT_EQ; + LT_MULT2; ARITH_RULE `~(2 EXP 32 = 0)`] THEN + + REWRITE_TAC[GSYM ADD_ASSOC] THEN + IMP_REWRITE_TAC[ADD_DIV_MOD_SIMP2_LEMMA; ARITH_RULE `~(2 EXP 32 = 0)`] THEN + (* Simplify RHS *) + + IMP_REWRITE_TAC [SPECL + [`2 EXP 32 * val (t1:(32)word) + val (t2:(32)word)`; `2 EXP 32 * 2 EXP 32`] MOD_LT] THEN + IMP_REWRITE_TAC [LT_MULT_ADD_MULT; ARITH_RULE `0 < 2 EXP 32`; LE_REFL] THEN + REWRITE_TAC [ARITH_RULE `(x + y) * (z + w) = x * z + x * w + y * z + y * w`] THEN + REWRITE_TAC [ARITH_RULE `(2 EXP 32 * w) * z = 2 EXP 32 * (w * z)`] THEN + REWRITE_TAC [ARITH_RULE `val (k:(32)word) * (2 EXP 32 * z) = 2 EXP 32 * (val k * z)`] THEN + REWRITE_TAC[GSYM DIV_DIV] THEN + IMP_REWRITE_TAC[DIV_MULT_ADD; ARITH_RULE `~(2 EXP 32 = 0)`] THEN + (* strip the outermost MOD *) + AP_THM_TAC THEN AP_TERM_TAC THEN + ARITH_TAC);; + + +(* ------------------------------------------------------------------------- *) +(* Helpful tactics *) +(* ------------------------------------------------------------------------- *) + +(* match terms of pattern `read (memory :> bytes64 _) ) = _`. 
*) +let is_read_memory_bytes64 t = + if is_eq t + then begin match lhs t with + | Comb(Comb ( + Const ("read", _), + Comb( + Comb(Const (":>", _),Const("memory", _)), + Comb(Const ("bytes64", _),_))),_) -> true + | _ -> false end + else false;; + +let BYTES128_EQ_JOIN64_TAC lhs128 hi64 lo64 = (* Adds the assumption lhs128 = word_join hi64 lo64. The equation is set up as a subgoal and proved by rewriting with the reversed (GSYM) form of every assumption accepted by is_read_memory_bytes64, then with READ_MEMORY_BYTESIZED_SPLIT and WORD_ADD_ASSOC_CONSTS, and finally ARITH_TAC. *) + let hilo = mk_comb (mk_comb + (`word_join:(64)word->(64)word->(128)word`,hi64),lo64) in + SUBGOAL_THEN (mk_eq (lhs128, hilo)) ASSUME_TAC THENL [ + EVERY_ASSUM (fun thm -> + (*let t = is_read_memory_bytes64 (concl thm) in + let _ = printf "%s \n" (string_of_term (concl thm)) t in*) + if is_read_memory_bytes64 (concl thm) + then REWRITE_TAC[GSYM thm] + else ALL_TAC) THEN + REWRITE_TAC[READ_MEMORY_BYTESIZED_SPLIT; WORD_ADD_ASSOC_CONSTS] THEN + ARITH_TAC; + ALL_TAC + ];; + +(* For s \in acc_rewrite_states_list \cap acc_states_list, apply rewrite + acc_rewrite_rules to the assumptions before ACCSTEP *) +let ARM_REWRITE_ASSUM_AND_ACCSTEPS_TAC + execth + acc_rewrite_states_list (acc_rewrite_rules: thm list) + acc_states_list states_list = + let sl = statenames "s" acc_rewrite_states_list in + let rls = WORD_BITMANIP_SIMP_LEMMAS::acc_rewrite_rules in + ARM_GEN_ACCSTEPS_TAC + (fun sname -> + if List.mem sname sl then RULE_ASSUM_TAC (REWRITE_RULE rls) + else RULE_ASSUM_TAC (REWRITE_RULE [WORD_BITMANIP_SIMP_LEMMAS])) + execth acc_states_list states_list;; + + +let ARM_REWRITE_ASSUM_AND_XACCSTEPS_TAC2 + execth + simp_states_list (simp_rewrite_rules: thm list) + acc_states_list states_list + exclude_regs = (* For each n in states_list, in order: if n is in simp_states_list, rewrite the assumptions with WORD_BITMANIP_SIMP_LEMMAS and simp_rewrite_rules; apply ARM_SINGLE_STEP_TAC for the state named s followed by n; then, if n is in acc_states_list, rewrite the assumptions with WORD_BITMANIP_SIMP_LEMMAS and try ACCUMULATEX_ARITH_TAC (excluding SP and exclude_regs) followed by CLARIFY_TAC. *) + let acc_preproc:tactic = RULE_ASSUM_TAC + (REWRITE_RULE [WORD_BITMANIP_SIMP_LEMMAS]) in + MAP_EVERY + (fun n -> + (if List.mem n simp_states_list then + RULE_ASSUM_TAC (REWRITE_RULE (WORD_BITMANIP_SIMP_LEMMAS::simp_rewrite_rules)) + else ALL_TAC) THEN + let state_name = "s"^string_of_int n in + ARM_SINGLE_STEP_TAC execth state_name THEN + if mem n acc_states_list then + acc_preproc THEN TRY( + ACCUMULATEX_ARITH_TAC ([`SP`] @ exclude_regs) state_name THEN CLARIFY_TAC) + else ALL_TAC) 
+ states_list;; + + +let contains_str (s:string) (subs:string): bool = + let subsl = explode subs in + let n = length subsl in + let rec fn sl = + if length sl < n then false + else if fst (chop_list n sl) = subsl then true + else match sl with | [] -> false | h::t -> fn t in + fn (explode s);; + +let DISCARD_READ_QREGS:tactic = + DISCARD_ASSUMPTIONS_TAC (fun th -> + contains_str (string_of_term (concl th)) "read Q");; diff --git a/benchmarks/benchmark.c b/benchmarks/benchmark.c index 47806e2a..da1a1a08 100644 --- a/benchmarks/benchmark.c +++ b/benchmarks/benchmark.c @@ -13,6 +13,7 @@ #include #include "../include/s2n-bignum.h" +#include "../tests/arch.h" // Controls whether an explanatory header goes on the output @@ -171,31 +172,6 @@ void timingtest(int enabled,char *name,void (*f)(void)) ++tests; } -// Decide whether machine supports BMI and ADX in the x86 case - -#ifdef __x86_64__ - -int cpuid_extendedfeatures(void) -{ int a = 7, b = 0, c = 0, d = 0; - asm ("cpuid\n\t" - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) - : "0" (a), "2" (c)); - return b; -} - -int full_isa_support(void) -{ int c = cpuid_extendedfeatures(); - return (c & (1ul<<8)) && (c & (1ul<<19)); -} - -#else - -int full_isa_support(void) -{ return 1; -} - -#endif - // Wrappers round the functions to call uniformly void call_bignum_add__4_4(void) repeat(bignum_add(4,b0,4,b1,4,b2)) @@ -768,10 +744,31 @@ void call_sm2_montjadd(void) repeat(sm2_montjadd(b1,b2,b3)) void call_sm2_montjdouble(void) repeat(sm2_montjdouble(b1,b2)) void call_sm2_montjmixadd(void) repeat(sm2_montjmixadd(b1,b2,b3)) +#ifdef __ARM_NEON +void call_bignum_mul_8_16_neon(void) repeat(bignum_mul_8_16_neon(b0,b1,b2)) +void call_bignum_sqr_8_16_neon(void) repeat(bignum_sqr_8_16_neon(b0,b1)) +void call_bignum_kmul_16_32_neon(void) repeat(bignum_kmul_16_32_neon(b0,b1,b2,b3)) +void call_bignum_ksqr_16_32_neon(void) repeat(bignum_ksqr_16_32_neon(b0,b1,b2)) +void call_bignum_kmul_32_64_neon(void) repeat(bignum_kmul_32_64_neon(b0,b1,b2,b3)) 
+void call_bignum_ksqr_32_64_neon(void) repeat(bignum_ksqr_32_64_neon(b0,b1,b2)) +void call_bignum_emontredc_8n_neon__32(void) repeat(bignum_emontredc_8n_neon(32,b0,b1,b2[0])) + +#else +void call_bignum_mul_8_16_neon(void) {} +void call_bignum_sqr_8_16_neon(void) {} +void call_bignum_kmul_16_32_neon(void) {} +void call_bignum_ksqr_16_32_neon(void) {} +void call_bignum_kmul_32_64_neon(void) {} +void call_bignum_ksqr_32_64_neon(void) {} +void call_bignum_emontredc_8n_neon__32(void) {} + +#endif + int main(int argc, char *argv[]) { - int bmi = full_isa_support(); + int bmi = get_arch_name() == ARCH_AARCH64 || supports_bmi2_and_adx(); int all = 1; + int neon = supports_neon(); char *argending; long negreps; function_to_test = ""; @@ -904,6 +901,7 @@ int main(int argc, char *argv[]) timingtest(all,"bignum_emontredc (12 -> 6)",call_bignum_emontredc__6); timingtest(all,"bignum_emontredc (64 -> 32)",call_bignum_emontredc__32); timingtest(bmi,"bignum_emontredc_8n (64 -> 32)",call_bignum_emontredc_8n__32); + timingtest(neon,"bignum_emontredc_8n_neon (64 -> 32)",call_bignum_emontredc_8n_neon__32); timingtest(all,"bignum_eq (32x32)" ,call_bignum_eq__32_32); timingtest(all,"bignum_even (32)" ,call_bignum_even__32); timingtest(all,"bignum_frombebytes_4",call_bignum_frombebytes_4); @@ -921,9 +919,13 @@ int main(int argc, char *argv[]) timingtest(all,"bignum_inv_p25519",call_bignum_inv_p25519); timingtest(all,"bignum_iszero (32)" ,call_bignum_iszero__32); timingtest(bmi,"bignum_kmul_16_32",call_bignum_kmul_16_32); + timingtest(neon,"bignum_kmul_16_32_neon",call_bignum_kmul_16_32_neon); timingtest(bmi,"bignum_kmul_32_64",call_bignum_kmul_32_64); + timingtest(neon,"bignum_kmul_32_64_neon",call_bignum_kmul_32_64_neon); timingtest(bmi,"bignum_ksqr_16_32",call_bignum_ksqr_16_32); + timingtest(neon,"bignum_ksqr_16_32_neon",call_bignum_ksqr_16_32_neon); timingtest(bmi,"bignum_ksqr_32_64",call_bignum_ksqr_32_64); + timingtest(neon,"bignum_ksqr_32_64_neon",call_bignum_ksqr_32_64_neon); 
timingtest(all,"bignum_le (32x32)" ,call_bignum_le__32_32); timingtest(all,"bignum_littleendian_4",call_bignum_littleendian_4); timingtest(all,"bignum_littleendian_6",call_bignum_littleendian_6); @@ -1003,6 +1005,7 @@ int main(int argc, char *argv[]) timingtest(all,"bignum_mul_6_12_alt",call_bignum_mul_6_12_alt); timingtest(bmi,"bignum_mul_8_16",call_bignum_mul_8_16); timingtest(all,"bignum_mul_8_16_alt",call_bignum_mul_8_16_alt); + timingtest(neon,"bignum_mul_8_16_neon",call_bignum_mul_8_16_neon); timingtest(bmi,"bignum_mul_p25519",call_bignum_mul_p25519); timingtest(all,"bignum_mul_p25519_alt",call_bignum_mul_p25519_alt); timingtest(bmi,"bignum_mul_p256k1",call_bignum_mul_p256k1); @@ -1052,6 +1055,7 @@ int main(int argc, char *argv[]) timingtest(all,"bignum_sqr_6_12_alt",call_bignum_sqr_6_12_alt); timingtest(bmi,"bignum_sqr_8_16",call_bignum_sqr_8_16); timingtest(all,"bignum_sqr_8_16_alt",call_bignum_sqr_8_16_alt); + timingtest(neon,"bignum_sqr_8_16_neon",call_bignum_sqr_8_16_neon); timingtest(bmi,"bignum_sqr_p25519",call_bignum_sqr_p25519); timingtest(all,"bignum_sqr_p25519_alt",call_bignum_sqr_p25519_alt); timingtest(bmi,"bignum_sqr_p256k1",call_bignum_sqr_p256k1); diff --git a/common/interval.ml b/common/interval.ml index 71627160..a14c4e86 100644 --- a/common/interval.ml +++ b/common/interval.ml @@ -31,7 +31,8 @@ let PURE_BOUNDER_RULE = fun tm -> let th = REAL_RAT_REDUCE_CONV tm in let tm' = rand(concl th) in if is_ratconst tm' then MP (SPECL [tm;tm'] pth) th - else failwith "BOUNDER_RULE: unhandled term" + else failwith ("BOUNDER_RULE: unhandled term: " + ^ (string_of_term tm)) and rule_div = GEN_REWRITE_CONV I [real_div] and rule_vid th = GEN_REWRITE_RULE (fun c ->BINOP2_CONV (RAND_CONV c) (LAND_CONV c)) @@ -278,7 +279,9 @@ let PURE_BOUNDER_RULE = else if is_div tm then let eth = rule_div tm in rule_vid eth (bounder(rand(concl eth))) else try CONJ (basic_lowerbound true tm) (basic_upperbound true tm) - with Failure _ -> failwith "BOUNDER_RULE: unhandled 
term" in + with Failure _ -> failwith + ("BOUNDER_RULE: unhandled term: " ^ + string_of_term tm) in bounder;; let BOUNDER_RULE ths = diff --git a/common/misc.ml b/common/misc.ml index 3e3f06eb..450c2abd 100644 --- a/common/misc.ml +++ b/common/misc.ml @@ -7,9 +7,12 @@ (* Miscellaneous theorems that don't quite fit in the main libraries. *) (* ========================================================================= *) +needs "Library/bitsize.ml";; +needs "Library/floor.ml";; needs "Library/iter.ml";; +needs "Library/pocklington.ml";; needs "Library/rstc.ml";; -needs "Library/floor.ml";; +needs "Library/words.ml";; (* ------------------------------------------------------------------------- *) (* Additional list operations and conversions on them. *) @@ -1185,3 +1188,152 @@ let cache f = let memo = ref [] in fun x -> try assoc x (!memo) with Failure _ -> (let y = f x in (memo := (x,y) :: (!memo); y));; + +(* ------------------------------------------------------------------------- *) +(* A few more lemmas about words. *) +(* ------------------------------------------------------------------------- *) + +let WORD_BITMANIP_SIMP_LEMMAS = prove( + `!(x32:(32)word) (y32:(32)word) (x32_2:(32)word) + (x64:(64)word) (y64:(64)word) (x64_2:(64)word) (y64_2:(64)word) + (y128:(128)word). + // word_subword + word_subword (word_subword y128 (0,64):(64)word) (0,32):(32)word = + word_subword y128 (0,32):(32)word /\ + word_subword (word_subword y128 (64,64):(64)word) (0,32):(32)word = + word_subword y128 (64,32):(32)word /\ + word_subword (word_subword y128 (0,64):(64)word) (32,32):(32)word = + word_subword y128 (32,32):(32)word /\ + word_subword (word_subword y128 (64,64):(64)word) (32,32):(32)word = + word_subword y128 (96,32):(32)word /\ + word_subword + (word 79228162495817593524129366015:(128)word) (64,64):(64)word = + word 4294967295 /\ + word_subword + (word 79228162495817593524129366015:(128)word) (0,64):(64)word = + word 4294967295 /\ + // .. 
+ word_join + word_subword (word_join x32 y32: (64)word) (0,32) = y32 /\ + word_subword (word_join x32 y32: (64)word) (32,32) = x32 /\ + word_subword (word_join x64 y64: (128)word) (0,64) = y64 /\ + word_subword (word_join x64 y64: (128)word) (64,64) = x64 /\ + word_subword (word_join x64 y64: (128)word) (0,32):(32)word = + word_subword y64 (0,32):(32)word /\ + word_subword (word_join x64 y64: (128)word) (32,32):(32)word = + word_subword y64 (32,32):(32)word /\ + word_subword (word_join x64 y64: (128)word) (64,32):(32)word = + word_subword x64 (0,32):(32)word /\ + word_subword (word_join x64 y64: (128)word) (96,32):(32)word = + word_subword x64 (32,32):(32)word /\ + word_subword + (word_join + (word_join x64_2 x64: (128)word) + (word_join y64_2 y64: (128)word): (256)word) + (64,128):(128)word = word_join x64 y64_2 /\ + // .. + word_zx + word_subword (word_zx x64:(128)word) (0,32):(32)word = word_subword x64 (0,32) /\ + word_subword (word_subword x64 (0,128):(128)word) (0,32):(32)word = word_subword x64 (0,32) /\ + word_subword (word_zx x64:(128)word) (32,32):(32)word = word_subword x64 (32,32) /\ + word_subword (word_subword x64 (0,128):(128)word) (32,32):(32)word = word_subword x64 (32,32) /\ + // .. + word_and + word_subword (word_and y128 (word_join x64_2 x64:(128)word)) (64,64) = + word_and (word_subword y128 (64,64):(64)word) x64_2 /\ + word_subword (word_and y128 (word_join x64_2 x64:(128)word)) (0,64) = + word_and (word_subword y128 (0,64):(64)word) x64 /\ + // .. + word_ushr + word_zx (word_subword (word_ushr x64 32) (0,32):(32)word):(64)word = word_ushr x64 32 /\ + word_ushr (word_join x32_2 x32:(64)word) 32 = word_zx x32_2`, + CONV_TAC WORD_BLAST);; + +let WORD_ADD_ASSOC_CONSTS = prove( + `!(x:(N)word) n m. + (word_add (word_add x (word n)) (word m)) = (word_add x (word (n+m)))`, + CONV_TAC WORD_RULE);; + +let WORD_OR_ADD_DISJ = prove(`! (x:(64)word) (y:(64)word). 
+ word_or (word_shl x 32) (word_and y (word 4294967295)) = + word_add (word_shl x 32) (word_and y (word 4294967295))`, + REPEAT GEN_TAC THEN + IMP_REWRITE_TAC[WORD_ADD_OR] THEN + CONV_TAC WORD_BLAST);; + +let WORD_OF_BITS_32BITMASK = prove( + `word 4294967295 = word_of_bits {i | i < 32}`, + REWRITE_TAC [WORD_OF_BITS_MASK; ARITH_RULE `4294967295 = 2 EXP 32 - 1`]);; + +let WORD_MUL_EQ = prove( + `!(x:(64)word) (y:(64)word). word_mul x y = word ((val x * val y) MOD 2 EXP 64)`, + REWRITE_TAC[GSYM VAL_EQ; VAL_WORD_MUL; VAL_WORD; DIMINDEX_64; MOD_MOD_REFL; MOD_MOD_EXP_MIN] + THEN CONV_TAC(DEPTH_CONV NUM_MIN_CONV) THEN MESON_TAC[]);; + +(* ------------------------------------------------------------------------- *) +(* A few more lemmas about natural numbers. *) +(* ------------------------------------------------------------------------- *) + +let ADD_MOD_MOD_REFL = prove(`!a b m. + (a + b MOD m) MOD m = (a + b) MOD m /\ + (a MOD m + b) MOD m = (a + b) MOD m`, + REPEAT STRIP_TAC THEN + ONCE_REWRITE_TAC [GSYM (SPECL [`a:num`; `b:num`] MOD_ADD_MOD)] THEN + REWRITE_TAC [MOD_MOD_REFL]);; + +let ADD_DIV_MOD_SIMP_LEMMA = prove(`!x y m. + ~(m = 0) ==> (x MOD m + y) DIV m + x DIV m = (x + y) DIV m`, + REPEAT STRIP_TAC THEN + FIRST_ASSUM (fun thm -> ASSUME_TAC (MATCH_MP (SPECL [`x:num`; `m:num`] DIVMOD_EXIST) thm)) THEN + FIRST_X_ASSUM (fun thm -> CHOOSE_THEN (CHOOSE_THEN ASSUME_TAC) thm) THEN + ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM ADD_ASSOC] THEN + ASM_SIMP_TAC[MOD_MULT_ADD;DIV_MULT_ADD;MOD_LT;DIV_LT] THEN + ARITH_TAC);; + +let LT_MULT_ADD_MULT = prove(`!(a:num) (b:num) (c:num) (m:num). 
+ 0 < m /\ a < m /\ b < m /\ c <= m ==> c * a + b < m * m`, + REPEAT STRIP_TAC THEN + TRANS_TAC LET_TRANS `(m:num) * (a:num) + (b:num)` THEN + CONJ_TAC THENL [ + IMP_REWRITE_TAC[LE_ADD2] THEN + CONJ_TAC THENL [ + IMP_REWRITE_TAC[LE_MULT2] THEN + REWRITE_TAC[LE_REFL]; + REWRITE_TAC[LE_REFL]]; + REPEAT STRIP_TAC THEN + DISJ_CASES_THEN (LABEL_TAC "mcases") (SPECL [`m:num`] num_CASES) THENL [ + (* m = 0 *) SUBST_ALL_TAC (ASSUME `m = 0`) THEN + RULE_ASSUM_TAC (REWRITE_RULE [GSYM ONE]) THEN + REWRITE_TAC [GSYM ONE] THEN + ASM_ARITH_TAC; + (* m = n + 1 *) REMOVE_THEN "mcases" (CHOOSE_THEN (LABEL_TAC "mcases'")) THEN + SUBST_ALL_TAC (ASSUME `m = SUC n`) THEN + RULE_ASSUM_TAC (REWRITE_RULE [ADD1]) THEN + REWRITE_TAC [ADD1] THEN + REWRITE_TAC [ARITH_RULE `(n + 1:num) * (n + 1:num) = (n + 1:num) * n + (n + 1:num)`] THEN + SUBGOAL_THEN `(a:num) <= (n:num)` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + MATCH_MP_TAC LET_ADD2 THEN + REWRITE_TAC [LE_MULT_LCANCEL] THEN + ASM_MESON_TAC[] + ]]);; + +let LT_ADD_MULT_MULT = prove(`!(a:num) (b:num) (c:num) (m:num). + 0 < m /\ a < m /\ b < m /\ c <= m ==> b + c * a < m * m`, + REPEAT STRIP_TAC THEN + TRANS_TAC LET_TRANS `(c:num) * (a:num) + (b:num)` THEN + CONJ_TAC THENL + [ARITH_TAC; ASM_MESON_TAC[LT_MULT_ADD_MULT]]);; + +let ADD_DIV_MOD_SIMP2_LEMMA = prove(`!(x:num) (y:num) (m:num). + ~(m = 0) ==> x DIV m + (y + x MOD m) DIV m = (x + y) DIV m`, + REPEAT STRIP_TAC THEN + FIRST_ASSUM (fun thm -> ASSUME_TAC (MATCH_MP (SPECL [`x:num`; `m:num`] DIVMOD_EXIST) thm)) THEN + FIRST_X_ASSUM (fun thm -> CHOOSE_THEN (CHOOSE_THEN ASSUME_TAC) thm) THEN + ASM_REWRITE_TAC[] THEN + REWRITE_TAC[GSYM ADD_ASSOC] THEN + ASM_SIMP_TAC[MOD_MULT_ADD;DIV_MULT_ADD;MOD_LT;DIV_LT;ADD_SYM] THEN + ARITH_TAC);; + +(* ------------------------------------------------------------------------- *) +(* A simple tactic that is helpful for debugging. 
*) +(* ------------------------------------------------------------------------- *) + +let PRINT_GOAL_TAC (desc: string): tactic = + fun gl -> let _ = Printf.printf "<%s>\n" desc; print_goal gl in ALL_TAC gl;; \ No newline at end of file diff --git a/include/s2n-bignum-c89.h b/include/s2n-bignum-c89.h index 5b558380..04803aac 100644 --- a/include/s2n-bignum-c89.h +++ b/include/s2n-bignum-c89.h @@ -22,6 +22,7 @@ * * - On ARM, the "_alt" forms target machines with higher multiplier * throughput, generally offering higher performance there. + * The "_neon" forms target machines with NEON instructions. * ---------------------------------------------------------------------------- */ @@ -254,6 +255,8 @@ extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, uint64_t *m, uint64_t /* Extended Montgomery reduce in 8-digit blocks, results in input-output buffer */ /* Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] */ extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +extern uint64_t bignum_emontredc_8n_neon (uint64_t k, uint64_t *z, uint64_t *m, + uint64_t w); /* Test bignums for equality, x = y */ /* Inputs x[m], y[n]; output function return */ @@ -322,18 +325,24 @@ extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); /* Multiply z := x * y */ /* Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] */ extern void bignum_kmul_16_32 (uint64_t z[32], uint64_t x[16], uint64_t y[16], uint64_t t[32]); +extern void bignum_kmul_16_32_neon (uint64_t z[32], uint64_t x[16], uint64_t y[16], + uint64_t t[32]); /* Multiply z := x * y */ /* Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] */ extern void bignum_kmul_32_64 (uint64_t z[64], uint64_t x[32], uint64_t y[32], uint64_t t[96]); +extern void bignum_kmul_32_64_neon (uint64_t z[64], uint64_t x[32], uint64_t y[32], + uint64_t t[96]); /* Square, z := x^2 */ /* Input x[16]; output z[32]; temporary buffer t[>=24] */ extern void 
bignum_ksqr_16_32 (uint64_t z[32], uint64_t x[16], uint64_t t[24]); +extern void bignum_ksqr_16_32_neon (uint64_t z[32], uint64_t x[16], uint64_t t[24]); /* Square, z := x^2 */ /* Input x[32]; output z[64]; temporary buffer t[>=72] */ extern void bignum_ksqr_32_64 (uint64_t z[64], uint64_t x[32], uint64_t t[72]); +extern void bignum_ksqr_32_64_neon (uint64_t z[64], uint64_t x[32], uint64_t t[72]); /* Compare bignums, x <= y */ /* Inputs x[m], y[n]; output function return */ @@ -541,6 +550,7 @@ extern void bignum_mul_6_12_alt (uint64_t z[12], uint64_t x[6], uint64_t y[6]); /* Inputs x[8], y[8]; output z[16] */ extern void bignum_mul_8_16 (uint64_t z[16], uint64_t x[8], uint64_t y[8]); extern void bignum_mul_8_16_alt (uint64_t z[16], uint64_t x[8], uint64_t y[8]); +extern void bignum_mul_8_16_neon (uint64_t z[16], uint64_t x[8], uint64_t y[8]); /* Multiply modulo p_25519, z := (x * y) mod p_25519 */ /* Inputs x[4], y[4]; output z[4] */ @@ -699,6 +709,7 @@ extern void bignum_sqr_6_12_alt (uint64_t z[12], uint64_t x[6]); /* Input x[8]; output z[16] */ extern void bignum_sqr_8_16 (uint64_t z[16], uint64_t x[8]); extern void bignum_sqr_8_16_alt (uint64_t z[16], uint64_t x[8]); +extern void bignum_sqr_8_16_neon (uint64_t z[16], uint64_t x[8]); /* Square modulo p_25519, z := (x^2) mod p_25519 */ /* Input x[4]; output z[4] */ diff --git a/include/s2n-bignum.h b/include/s2n-bignum.h index a9cf8b2a..213dca15 100644 --- a/include/s2n-bignum.h +++ b/include/s2n-bignum.h @@ -21,6 +21,7 @@ // // - On ARM, the "_alt" forms target machines with higher multiplier // throughput, generally offering higher performance there. +// The "_neon" forms target machines with NEON instructions. 
// ---------------------------------------------------------------------------- // Add, z := x + y @@ -252,6 +253,8 @@ extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, uint64_t *m, uint64_t // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +extern uint64_t bignum_emontredc_8n_neon (uint64_t k, uint64_t *z, uint64_t *m, + uint64_t w); // Test bignums for equality, x = y // Inputs x[m], y[n]; output function return @@ -320,18 +323,26 @@ extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); // Multiply z := x * y // Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] extern void bignum_kmul_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], uint64_t t[static 32]); +extern void bignum_kmul_16_32_neon (uint64_t z[static 32], uint64_t x[static 16], + uint64_t y[static 16], uint64_t t[static 32]); // Multiply z := x * y // Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] extern void bignum_kmul_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], uint64_t t[static 96]); +extern void bignum_kmul_32_64_neon (uint64_t z[static 64], uint64_t x[static 32], + uint64_t y[static 32], uint64_t t[static 96]); // Square, z := x^2 // Input x[16]; output z[32]; temporary buffer t[>=24] extern void bignum_ksqr_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]); +extern void bignum_ksqr_16_32_neon (uint64_t z[static 32], uint64_t x[static 16], + uint64_t t[static 24]); // Square, z := x^2 // Input x[32]; output z[64]; temporary buffer t[>=72] extern void bignum_ksqr_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]); +extern void bignum_ksqr_32_64_neon (uint64_t z[static 64], uint64_t x[static 32], + uint64_t t[static 72]); // Compare bignums, x <= y // Inputs 
x[m], y[n]; output function return @@ -539,6 +550,8 @@ extern void bignum_mul_6_12_alt (uint64_t z[static 12], uint64_t x[static 6], ui // Inputs x[8], y[8]; output z[16] extern void bignum_mul_8_16 (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); extern void bignum_mul_8_16_alt (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +extern void bignum_mul_8_16_neon (uint64_t z[static 16], uint64_t x[static 8], + uint64_t y[static 8]); // Multiply modulo p_25519, z := (x * y) mod p_25519 // Inputs x[4], y[4]; output z[4] @@ -697,6 +710,7 @@ extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]); // Input x[8]; output z[16] extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]); extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); +extern void bignum_sqr_8_16_neon (uint64_t z[static 16], uint64_t x[static 8]); // Square modulo p_25519, z := (x^2) mod p_25519 // Input x[4]; output z[4] diff --git a/tests/arch.h b/tests/arch.h new file mode 100644 index 00000000..2fa56d9c --- /dev/null +++ b/tests/arch.h @@ -0,0 +1,51 @@ +// On x86 machines, restrict the set of tested functions appropriately +// if the machine does not seem to support the BMI2 and ADX extensions. + +enum arch_name { ARCH_X86_64, ARCH_AARCH64 }; + +#ifdef __x86_64__ + +int cpuid_extendedfeatures(void) +{ int a = 7, b = 0, c = 0, d = 0; + asm ("cpuid\n\t" + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) + : "0" (a), "2" (c)); + return b; +} + +int supports_bmi2_and_adx(void) +{ int c = cpuid_extendedfeatures(); + return (c & (1ul<<8)) && (c & (1ul<<19)); +} + +int supports_neon(void) +{ // X86_64 does not support NEON. + return 0; +} + +enum arch_name get_arch_name() +{ return ARCH_X86_64; +} + +#else + +int supports_bmi2_and_adx(void) +{ // AArch64 does not support BMI2 or ADX extension. 
+ return 0; +} + +int supports_neon(void) +{ +#ifdef __ARM_NEON + return 1; +#else + return 0; +#endif +} + +enum arch_name get_arch_name() +{ return ARCH_AARCH64; +} + +#endif + diff --git a/tests/test.c b/tests/test.c index 28e9e67e..c8d8744a 100644 --- a/tests/test.c +++ b/tests/test.c @@ -20,6 +20,10 @@ #include "../include/s2n-bignum.h" +// Functions for detecting architectures and instruction sets + +#include "arch.h" + // Some big static buffers (need them big enough for largest test) #define BUFFERSIZE 65536 @@ -3391,86 +3395,74 @@ int test_bignum_double_sm2(void) return 0; } -int test_bignum_emontredc(void) -{ uint64_t t, k, w, tc; - printf("Testing bignum_emontredc with %d cases\n",tests); +int test_bignum_emontredc_specific(const char *name, int is_8n, + uint64_t (*f)(uint64_t, uint64_t *, + uint64_t *, uint64_t)) { + uint64_t t, k, w, tc; + printf("Testing %s with %d cases\n", name, tests); int c; - for (t = 0; t < tests; ++t) - { k = (unsigned) rand() % MAXSIZE; - - random_bignum(k,b0); b0[0] |= 1; // b0 = m - w = word_negmodinv(b0[0]); // w = negated modular inverse - random_bignum(2*k,b4); // b4 = initial z - - reference_copy(2*k+1,b1,2*k,b4); // b1 = longer copy of z_0 - reference_copy(2*k+1,b2,2*k,b4); // b2 = also longer copy of z_0 - - tc = bignum_emontredc(k,b4,b0,w); - - reference_madd(2*k+1,b1,k,b4,k,b0); // b1 = q * m + z_0 - - c = ((b1[2*k] == tc) && - reference_eq_samelen(k,b4+k,b1+k) && - reference_iszero(k,b1)); + for (t = 0; t < tests; ++t) { + k = (unsigned)rand() % MAXSIZE; + if (is_8n) { + k = (k >> 3) << 3; + if (k == 0) + k = 8; + } - if (!c) - { printf("### Disparity reducing modulo: [size %4"PRIu64" -> %4"PRIu64"] " - "...%016"PRIx64" / 2^%"PRIu64" mod ...%016"PRIx64" = ...%016"PRIx64"\n", - 2*k,k,b2[0],64*k,b0[0],b4[k]); - return 1; - } - else if (VERBOSE) - { printf("OK: [size %4"PRIu64" -> %4"PRIu64"] " - "...%016"PRIx64" / 2^%"PRIu64" mod ...%016"PRIx64" = ...%016"PRIx64"\n", - 2*k,k,b2[0],64*k,b0[0],b4[0]); - } - } + 
random_bignum(k, b0); + b0[0] |= 1; // b0 = m + w = word_negmodinv(b0[0]); // w = negated modular inverse + random_bignum(2 * k, b4); // b4 = initial z + + reference_copy(2 * k + 1, b1, 2 * k, b4); // b1 = longer copy of z_0 + reference_copy(2 * k + 1, b2, 2 * k, b4); // b2 = also longer copy of z_0 + + tc = f(k, b4, b0, w); + + reference_madd(2 * k + 1, b1, k, b4, k, b0); // b1 = q * m + z_0 + + c = ((b1[2 * k] == tc) && reference_eq_samelen(k, b4 + k, b1 + k) && + reference_iszero(k, b1)); + + if (!c) { + printf("### Disparity reducing modulo: [size %4" PRIu64 " -> %4" PRIu64 + "] " + "...%016" PRIx64 " / 2^%" PRIu64 " mod ...%016" PRIx64 + " = ...%016" PRIx64 "\n", + 2 * k, k, b2[0], 64 * k, b0[0], b4[k]); + return 1; + } else if (VERBOSE) { + printf("OK: [size %4" PRIu64 " -> %4" PRIu64 "] " + "...%016" PRIx64 " / 2^%" PRIu64 " mod ...%016" PRIx64 + " = ...%016" PRIx64 "\n", + 2 * k, k, b2[0], 64 * k, b0[0], b4[0]); + } + } printf("All OK\n"); return 0; } -int test_bignum_emontredc_8n(void) -{ uint64_t t, k, w, tc; - printf("Testing bignum_emontredc_8n with %d cases\n",tests); - - int c; - for (t = 0; t < tests; ++t) - { k = (unsigned) rand() % MAXSIZE; - k = (k>>3)<<3; - if (k == 0) k = 8; - - random_bignum(k,b0); b0[0] |= 1; // b0 = m - w = word_negmodinv(b0[0]); // w = negated modular inverse - random_bignum(2*k,b4); // b4 = initial z - - reference_copy(2*k+1,b1,2*k,b4); // b1 = longer copy of z_0 - reference_copy(2*k+1,b2,2*k,b4); // b2 = also longer copy of z_0 - - tc = bignum_emontredc_8n(k,b4,b0,w); - - reference_madd(2*k+1,b1,k,b4,k,b0); // b1 = q * m + z_0 - - c = ((b1[2*k] == tc) && - reference_eq_samelen(k,b4+k,b1+k) && - reference_iszero(k,b1)); +int test_bignum_emontredc(void) +{ return test_bignum_emontredc_specific("bignum_emontredc", 0, + bignum_emontredc); +} - if (!c) - { printf("### Disparity reducing modulo: [size %4"PRIu64" -> %4"PRIu64"] " - "...%016"PRIx64" / 2^%"PRIu64" mod ...%016"PRIx64" = ...%016"PRIx64"\n", - 
2*k,k,b2[0],64*k,b0[0],b4[k]); - return 1; - } - else if (VERBOSE) - { printf("OK: [size %4"PRIu64" -> %4"PRIu64"] " - "...%016"PRIx64" / 2^%"PRIu64" mod ...%016"PRIx64" = ...%016"PRIx64"\n", - 2*k,k,b2[0],64*k,b0[0],b4[0]); - } - } - printf("All OK\n"); - return 0; +int test_bignum_emontredc_8n(void) +{ return test_bignum_emontredc_specific("bignum_emontredc_8n", 1, + bignum_emontredc_8n); } +int test_bignum_emontredc_8n_neon(void) +{ +#ifdef __ARM_NEON + return test_bignum_emontredc_specific("bignum_emontredc_8n_neon", 1, + bignum_emontredc_8n_neon); +#else + // Do not call the neon function to avoid a linking failure error. + return 1; +#endif +} int test_bignum_eq(void) { uint64_t t, k1, k2; @@ -4016,10 +4008,32 @@ int test_bignum_kmul_16_32(void) { return test_bignum_kmul_specific(32,16,16,"bignum_kmul_16_32",bignum_kmul_16_32); } +int test_bignum_kmul_16_32_neon(void) +{ +#ifdef __ARM_NEON + return test_bignum_kmul_specific(32,16,16,"bignum_kmul_16_32_neon", + bignum_kmul_16_32_neon); +#else + // Do not call the neon function to avoid a linking failure error. + return 1; +#endif +} + int test_bignum_kmul_32_64(void) { return test_bignum_kmul_specific(64,32,32,"bignum_kmul_32_64",bignum_kmul_32_64); } +int test_bignum_kmul_32_64_neon(void) +{ +#ifdef __ARM_NEON + return test_bignum_kmul_specific(64,32,32,"bignum_kmul_32_64_neon", + bignum_kmul_32_64_neon); +#else + // Do not call the neon function to avoid a linking failure error. + return 1; +#endif +} + int test_bignum_ksqr_specific (uint64_t p,uint64_t n, char *name, void (*f)(uint64_t *,uint64_t *,uint64_t *)) @@ -4054,10 +4068,32 @@ int test_bignum_ksqr_16_32(void) { return test_bignum_ksqr_specific(32,16,"bignum_ksqr_16_32",bignum_ksqr_16_32); } +int test_bignum_ksqr_16_32_neon(void) +{ +#ifdef __ARM_NEON + return test_bignum_ksqr_specific(32,16,"bignum_ksqr_16_32_neon", + bignum_ksqr_16_32_neon); +#else + // Do not call the neon function to avoid a linking failure error. 
+ return 1; +#endif +} + int test_bignum_ksqr_32_64(void) { return test_bignum_ksqr_specific(64,32,"bignum_ksqr_32_64",bignum_ksqr_32_64); } +int test_bignum_ksqr_32_64_neon(void) +{ +#ifdef __ARM_NEON + return test_bignum_ksqr_specific(64,32,"bignum_ksqr_32_64_neon", + bignum_ksqr_32_64_neon); +#else + // Do not call the neon function to avoid a linking failure error. + return 1; +#endif +} + int test_bignum_le(void) { uint64_t t, k1, k2; printf("Testing bignum_le with %d cases\n",tests); @@ -6053,8 +6089,19 @@ int test_bignum_mul_8_16_alt(void) { return test_bignum_mul_specific(16,8,8,"bignum_mul_8_16_alt",bignum_mul_8_16_alt); } -int test_bignum_mul_p25519(void) -{ uint64_t i, k; +int test_bignum_mul_8_16_neon(void) +{ +#ifdef __ARM_NEON + return test_bignum_mul_specific(16, 8, 8, "bignum_mul_8_16_neon", + bignum_mul_8_16_neon); +#else + // Do not call the neon function to avoid a linking failure error. + return 1; +#endif +} + +int test_bignum_mul_p25519(void) { + uint64_t i, k; printf("Testing bignum_mul_p25519 with %d cases\n",tests); uint64_t c; for (i = 0; i < tests; ++i) @@ -7368,6 +7415,16 @@ int test_bignum_sqr_8_16_alt(void) { return test_bignum_sqr_specific(16,8,"bignum_sqr_8_16_alt",bignum_sqr_8_16_alt); } +int test_bignum_sqr_8_16_neon(void) +{ +#ifdef __ARM_NEON + return test_bignum_sqr_specific(16,8,"bignum_sqr_8_16_neon",bignum_sqr_8_16_neon); +#else + // Do not call the neon function to avoid a linking failure error. + return 1; +#endif +} + int test_bignum_sqr_p25519(void) { uint64_t i, k; printf("Testing bignum_sqr_p25519 with %d cases\n",tests); @@ -10690,32 +10747,6 @@ int test_edwards25519_scalarmulbase_alt_tweetnacl(void) // Main dispatching to appropriate test code // **************************************************************************** -// On x86 machines, restrict the set of tested functions appropriately -// if the machine does not seem to support the BMI2 and ADX extensions. 
- -#ifdef __x86_64__ - -int cpuid_extendedfeatures(void) -{ int a = 7, b = 0, c = 0, d = 0; - asm ("cpuid\n\t" - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) - : "0" (a), "2" (c)); - return b; -} - -int full_isa_support(void) -{ int c = cpuid_extendedfeatures(); - return (c & (1ul<<8)) && (c & (1ul<<19)); -} - -#else - -int full_isa_support(void) -{ return 1; -} - -#endif - static char *function_to_test; static int tested = 0; static int successes = 0; @@ -10723,6 +10754,8 @@ static int failures = 0; static int skipped = 0; static int inapplicable = 0; +// functionaltest runs f() if enabled is true and records the result. +// If the return value is nonzero, the test has failed. void functionaltest(int enabled,char *name,int (*f)(void)) { ++tested; // Only benchmark matching function name @@ -10764,7 +10797,7 @@ void functionaltest(int enabled,char *name,int (*f)(void)) // as a wildcard "*", e.g. "bignum_add_p_" int main(int argc, char *argv[]) -{ int bmi = full_isa_support(); +{ int bmi = get_arch_name() == ARCH_AARCH64 || supports_bmi2_and_adx(); int all = 1; int extrastrigger = 1; @@ -10876,7 +10909,7 @@ int main(int argc, char *argv[]) functionaltest(all,"bignum_inv_p25519",test_bignum_inv_p25519); functionaltest(all,"bignum_iszero",test_bignum_iszero); functionaltest(bmi,"bignum_kmul_16_32",test_bignum_kmul_16_32); - functionaltest(bmi,"bignum_kmul_32_64",test_bignum_kmul_32_64); + functionaltest(bmi, "bignum_kmul_32_64", test_bignum_kmul_32_64); functionaltest(bmi,"bignum_ksqr_16_32",test_bignum_ksqr_16_32); functionaltest(bmi,"bignum_ksqr_32_64",test_bignum_ksqr_32_64); functionaltest(all,"bignum_le",test_bignum_le); @@ -11073,6 +11106,17 @@ int main(int argc, char *argv[]) functionaltest(all,"word_negmodinv",test_word_negmodinv); functionaltest(all,"word_recip",test_word_recip); + if (get_arch_name() == ARCH_AARCH64) { + int neon = supports_neon(); + functionaltest(neon,"bignum_emontredc_8n_neon",test_bignum_emontredc_8n_neon); + 
functionaltest(neon,"bignum_kmul_16_32_neon", test_bignum_kmul_16_32_neon); + functionaltest(neon,"bignum_kmul_32_64_neon", test_bignum_kmul_32_64_neon); + functionaltest(neon,"bignum_ksqr_16_32_neon",test_bignum_ksqr_16_32_neon); + functionaltest(neon,"bignum_ksqr_32_64_neon",test_bignum_ksqr_32_64_neon); + functionaltest(neon,"bignum_mul_8_16_neon",test_bignum_mul_8_16_neon); + functionaltest(neon,"bignum_sqr_8_16_neon",test_bignum_sqr_8_16_neon); + } + if (extrastrigger) function_to_test = "_"; functionaltest(bmi,"known value tests",test_known_values);