From 3ade7a875efc898b7ded9978514751ae428c571c Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 15 Aug 2024 08:58:56 +1000 Subject: [PATCH] RISC-V 64 ASM: Add Poly1305 implementation Implementation using standard and vector instructions. --- configure.ac | 28 +- src/include.am | 3 + wolfcrypt/src/poly1305.c | 14 +- wolfcrypt/src/port/riscv/riscv-64-poly1305.c | 669 +++++++++++++++++++ wolfssl/wolfcrypt/poly1305.h | 26 +- wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h | 27 + 6 files changed, 744 insertions(+), 23 deletions(-) create mode 100644 wolfcrypt/src/port/riscv/riscv-64-poly1305.c diff --git a/configure.ac b/configure.ac index 3f5f44a3c1..a42c5678b6 100644 --- a/configure.ac +++ b/configure.ac @@ -3066,12 +3066,6 @@ do ;; no) ;; - zbkb) - # PACK, REV8 - ENABLED_RISCV_ASM=yes - AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_BIT_MANIPULATION" - AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_BASE_BIT_MANIPULATION" - ;; zbb) # REV8 ENABLED_RISCV_ASM=yes @@ -3082,6 +3076,16 @@ do ENABLED_RISCV_ASM=yes AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_CARRYLESS" ;; + zbkb) + # PACK, REV8 + ENABLED_RISCV_ASM=yes + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_BIT_MANIPULATION" + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_BASE_BIT_MANIPULATION" + ;; + zbt) + # FSL, FSR, FSRI, CMOV, CMIX - QEMU doesn't know about these instructions + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_BIT_MANIPULATION_TERNARY" + ;; zkn|zkned) # AES encrypt/decrpyt, SHA-2 ENABLED_RISCV_ASM=yes @@ -3091,20 +3095,20 @@ do ENABLED_RISCV_ASM=yes AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR" ;; - zvkg) - # VGMUL, VHHSH + zvbb|zvkb) + # VBREV8 ENABLED_RISCV_ASM=yes - AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR_GCM" + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION" ;; zvbc) # VCLMUL, VCLMULH ENABLED_RISCV_ASM=yes AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR_CARRYLESS" ;; - zvbb|zvkb) - # VBREV8 + zvkg) + # VGMUL, VHHSH ENABLED_RISCV_ASM=yes - AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION" + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR_GCM" ;; zvkned) # Vector AES, SHA-2 diff --git a/src/include.am b/src/include.am index 544ad3ea8e..1bfc754f79 100644 --- a/src/include.am +++ b/src/include.am @@ -914,6 +914,9 @@ if BUILD_POLY1305 if BUILD_ARMASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-poly1305.c endif +if BUILD_RISCV_ASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-poly1305.c +endif src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/poly1305.c if !BUILD_X86_ASM if BUILD_INTELASM diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c index 43528c23f0..b4b5c0f7ea 100644 --- a/wolfcrypt/src/poly1305.c +++ b/wolfcrypt/src/poly1305.c @@ -206,7 +206,7 @@ extern void poly1305_final_avx2(Poly1305* ctx, byte* mac); #endif #elif defined(POLY130564) -#ifndef WOLFSSL_ARMASM +#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) static word64 U8TO64(const byte* p) { return @@ -230,7 +230,7 @@ extern void poly1305_final_avx2(Poly1305* ctx, byte* mac); p[6] = (byte)(v >> 48); p[7] = (byte)(v >> 56); } -#endif/* WOLFSSL_ARMASM */ +#endif/* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */ #else /* if not 64 bit then use 32 bit */ static word32 U8TO32(const byte *p) @@ -268,7 +268,8 @@ static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8]) } -#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) +#if (!defined(WOLFSSL_ARMASM) || !defined(__aarch64__)) && \ + !defined(WOLFSSL_RISCV_ASM) /* This local function operates on a message with a given number of bytes with 
a given ctx pointer to a Poly1305 structure. @@ -491,9 +492,7 @@ static int poly1305_block(Poly1305* ctx, const unsigned char *m) return poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE); #endif } -#endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */ -#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) { #if defined(POLY130564) && !defined(USE_INTEL_POLY1305_SPEEDUP) @@ -789,7 +788,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) return 0; } -#endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */ +#endif /* (!WOLFSSL_ARMASM || !__aarch64__) && !WOLFSSL_RISCV_ASM */ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) @@ -884,7 +883,8 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) /* process full blocks */ if (bytes >= POLY1305_BLOCK_SIZE) { size_t want = ((size_t)bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1)); -#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) +#if (!defined(WOLFSSL_ARMASM) || !defined(__aarch64__)) && \ + !defined(WOLFSSL_RISCV_ASM) int ret; ret = poly1305_blocks(ctx, m, want); if (ret != 0) diff --git a/wolfcrypt/src/port/riscv/riscv-64-poly1305.c b/wolfcrypt/src/port/riscv/riscv-64-poly1305.c new file mode 100644 index 0000000000..a0b880b8a1 --- /dev/null +++ b/wolfcrypt/src/port/riscv/riscv-64-poly1305.c @@ -0,0 +1,669 @@ +/* riscv-64-poly1305.c + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* + * Based off the public domain implementations by Andrew Moon + * and Daniel J. 
Bernstein + */ + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include +#include +#include + +#ifdef WOLFSSL_RISCV_ASM + +#ifdef HAVE_POLY1305 +#include +#include +#include +#include +#ifdef NO_INLINE + #include +#else + #define WOLFSSL_MISC_INCLUDED + #include +#endif +#ifdef CHACHA_AEAD_TEST + #include +#endif + + +#ifndef WOLFSSL_RISCV_BIT_MANIPULATION_TERNARY + +#define SPLIT_130(r0, r1, r2, a0, a1, a2, t) \ + "srli " #r1 ", " #a0 ", (64-12)\n\t" \ + "and " #r0 ", " #a0 ", a6\n\t" \ + "slli " #t ", " #a1 ", (2*12)\n\t" \ + "slli " #r2 ", " #a2 ", (2*12)\n\t" \ + "srli " #a1 ", " #a1 ", (64-2*12)\n\t" \ + "srli " #t ", " #t ", 12\n\t" \ + "or " #r2 ", " #a1 ", " #r2 "\n\t" \ + "or " #r1 ", " #r1 ", " #t "\n\t" + +#define SPLIT_128(r0, r1, r2, a0, a1, t) \ + "slli " #t ", " #a1 ", (2*12)\n\t" \ + "srli " #r1 ", " #a0 ", (64-12)\n\t" \ + "and " #r0 ", " #a0 ", a6\n\t" \ + "srli " #t ", " #t ", 12\n\t" \ + "srli " #r2 ", " #a1 ", (64-2*12)\n\t" \ + "or " #r1 ", " #r1 ", " #t "\n\t" + +#define REDIST(l, h, t) \ + "srli " #t ", " #l ", 52\n\t" \ + "slli " #h ", " #h ", 12\n\t" \ + "and " #l ", " #l ", a6\n\t" \ + "or " #h ", " #h ", " #t "\n\t" + +#define REDIST_HI(l, h, h2, t) \ + "srli " #h2 ", " #h ", 28\n\t" \ + "slli " #h ", " #h ", 24\n\t" \ + "srli " #t ", " #l ", 40\n\t" \ + "slli " #l ", " #l ", 12\n\t" \ + "and " #h ", " #h ", a6\n\t" \ + "and " #l ", " #l ", a6\n\t" \ + "or " #h ", " #h ", " #t "\n\t" + +#define REDIST_HI_26(l, h, t) \ + "srli " #t ", " #l ", 40\n\t" \ + "slli " #l ", " #l ", 12\n\t" \ + "slli " #h ", " #h ", 24\n\t" \ + "and " #l ", " #l ", a6\n\t" \ + "or " #h ", " #h ", " #t "\n\t" + +#else + +#define SPLIT_130(r0, r1, r2, a0, a1, a2, t) \ + "and " #r0 ", " #a0 ", a6\n\t" \ + FSRI(r1, a1, a0, 52) \ + FSRI(r2, a2, a1, 40) \ + "and " #r1 ", " #r1 ", a6\n\t" \ + "and " #r2 ", " #r2 ", a6\n\t" + +#define SPLIT_128(r0, r1, r2, a0, a1, t) \ + "srli " #r2 ", " #a1 ", 40\n\t" \ + FSRI(r1, a1, a0, 52) \ + "and " #r0 ", " #a0 ", a6\n\t" \ + "and " #r1 ", " #r1 ", a6\n\t" + +#define REDIST(l, h, t) \ + FSRI(h, h, l, 52) \ + "and " #l ", " #l ", a4\n\t" + +#define REDIST_HI(l, h, h2, t) \ + "srli " #h2 ", " #h ", 28\n\t" \ + FSRI(h, h, l, 40) \ + "slli " #l ", " #l ", 12\n\t" \ + "and " #h ", " #h ", a6\n\t" \ + "and " #l ", " #l ", a6\n\t" + +#define REDIST_HI_26(l, h, t) \ + FSRI(h, h, l, 40) \ + "slli " #l ", " #l ", 12\n\t" \ + "and " #l ", " #l ", a6\n\t" + +#endif + +#define RECALC(l, h, t) \ + "srli " #t ", " #l ", 52\n\t" \ + "and " #l ", " #l ", a6\n\t" \ + "add " #h ", " #h ", " #t "\n\t" + +static WC_INLINE void poly1305_blocks_riscv64_16(Poly1305* ctx, + const unsigned char *m, size_t bytes, int notLast) +{ + __asm__ __volatile__ ( + "addi %[bytes], %[bytes], -16\n\t" + "bltz %[bytes], L_poly1305_riscv64_16_64_done_%=\n\t" + + "li a4, 0xffffffc000000\n\t" + "li a5, 0x3ffffff\n\t" + "li a6, 0xfffffffffffff\n\t" + + /* Load r and h */ + "ld s8, %[ctx_r_0]\n\t" + "ld s9, %[ctx_r_1]\n\t" + + "ld s3, %[ctx_h_0]\n\t" + "ld s4, %[ctx_h_1]\n\t" + "ld s5, %[ctx_h_2]\n\t" + + "L_poly1305_riscv64_16_64_loop_%=:\n\t" + /* Load m */ + "ld t0, (%[m])\n\t" + "ld t1, 8(%[m])\n\t" + /* Split m into 26, 52, 52 */ + SPLIT_130(t2, t3, t4, t0, t1, %[notLast], t5) + + "add s3, s3, t2\n\t" + "add s4, s4, t3\n\t" + "add s5, s5, t4\n\t" + + /* r[0] * h[0] = [0, 1] */ + "mul t0, s8, s3\n\t" + "mulhu t1, s8, s3\n\t" + REDIST(t0, t1, s6) + /* r[0] * h[1] = [1, 2] */ + "mul t3, s8, s4\n\t" + "mulhu t2, s8, s4\n\t" + REDIST(t3, t2, s6) + "add t1, t1, t3\n\t" + /* r[1] * h[0] = [1, 2] */ + 
"mul t4, s9, s3\n\t" + "mulhu t5, s9, s3\n\t" + REDIST_HI(t4, t5, t3, s6) + "add t1, t1, t4\n\t" + "add t2, t2, t5\n\t" + /* r[0] * h[2] = [2, 3] */ + "mul t4, s8, s5\n\t" + "mulhu t5, s8, s5\n\t" + REDIST(t4, t5, s6) + "add t2, t2, t4\n\t" + "add t3, t3, t5\n\t" + /* r[1] * h[1] = [2, 3] */ + "mul t5, s9, s4\n\t" + "mulhu t6, s9, s4\n\t" + REDIST_HI(t5, t6, t4, s6) + "add t2, t2, t5\n\t" + "add t3, t3, t6\n\t" + /* r[1] * h[2] = [3, 4] */ + "mul t5, s9, s5\n\t" + "mulhu t6, s9, s5\n\t" + REDIST_HI_26(t5, t6, s6) + "add t3, t3, t5\n\t" + "add t4, t4, t6\n\t" + + RECALC(t1, t2, s6) + RECALC(t2, t3, s6) + RECALC(t3, t4, s6) + + /* h[0..4] % (2^130 - 5) */ + "slli s3, t3, 26\n\t" + "slli s4, t4, 26\n\t" + "and s3, s3, a4\n\t" + "and s4, s4, a4\n\t" + "srli t5, t2, 26\n\t" + "and t2, t2, a5\n\t" + "srli t3, t3, 26\n\t" + "srli t4, t4, 26\n\t" + "add t5, t5, s3\n\t" + "add t3, t3, s4\n\t" + + "slli s5, t5, 2\n\t" + "slli s3, t3, 2\n\t" + "slli s4, t4, 2\n\t" + "add t5, t5, s5\n\t" + "add t3, t3, s3\n\t" + "add t4, t4, s4\n\t" + + "add s3, t0, t5\n\t" + "add s4, t1, t3\n\t" + "add s5, t2, t4\n\t" + + /* h[0..2] % (2^130 - 5) */ + "and t5, s5, a4\n\t" + "and s5, s5, a5\n\t" + "srli t6, t5, 24\n\t" + "srli t5, t5, 26\n\t" + "add t5, t5, t6\n\t" + "add s3, s3, t5\n\t" + + "addi %[bytes], %[bytes], -16\n\t" + "addi %[m], %[m], 16\n\t" + "bgez %[bytes], L_poly1305_riscv64_16_64_loop_%=\n\t" + + "sd s3, %[ctx_h_0]\n\t" + "sd s4, %[ctx_h_1]\n\t" + "sd s5, %[ctx_h_2]\n\t" + "\n" + "L_poly1305_riscv64_16_64_done_%=:\n\t" + : [bytes] "+r" (bytes), [m] "+r" (m) + : [ctx_h_0] "m" (ctx->h[0]), [ctx_h_1] "m" (ctx->h[1]), + [ctx_h_2] "m" (ctx->h[2]), [ctx_r_0] "m" (ctx->r[0]), + [ctx_r_1] "m" (ctx->r[1]), [notLast] "r" ((word64)notLast) + : "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s6", + "a4", "a5", "a6", /* Constants */ + "s3", "s4", "s5", /* h */ + "s8", "s9" /* r */ + ); +} + +#ifdef WOLFSSL_RISCV_VECTOR + +#define MUL_RES_REDIS(l, h, t) \ + VSRL_VI(t, l, 26) \ + VAND_VX(l, l, REG_A6) \ + VSRL_VI(t, t, 26) \ + VSLL_VI(h, h, 12) \ + VOR_VV(h, h, t) + +#endif + +void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m, + size_t bytes) +{ +#ifdef WOLFSSL_RISCV_VECTOR + __asm__ __volatile__ ( + "addi %[bytes], %[bytes], -32\n\t" + "bltz %[bytes], L_poly1305_riscv64_vec_done_%=\n\t" + + VSETIVLI(REG_ZERO, 2, 1, 1, 0b011, 0b000) + + "li a4, 0xffffffc000000\n\t" + "li a5, 0x3ffffff\n\t" + "li a6, 0xfffffffffffff\n\t" + + /* Load r and r^2 */ + "mv t0, %[r2]\n\t" + VL2RE64_V(REG_V2, REG_T0) + "addi t0, %[r2], 32\n\t" + VL1RE64_V(REG_V4, REG_T0) + + /* Load h */ + "ld t0, 0(%[h])\n\t" + "ld t1, 8(%[h])\n\t" + "ld t2, 16(%[h])\n\t" + + VMV_S_X(REG_V8, REG_T0) + VMV_S_X(REG_V9, REG_T1) + VMV_S_X(REG_V10, REG_T2) + + "L_poly1305_riscv64_vec_loop_%=:\n\t" + /* m0 + nfin */ + "ld t0, 0(%[m])\n\t" + "ld t1, 8(%[m])\n\t" + "li t6, 1\n\t" + /* Split m into 24, 52, 52 */ + SPLIT_130(t2, t3, t4, t0, t1, t6, t5) + VMV_S_X(REG_V11, REG_T2) + VMV_S_X(REG_V12, REG_T3) + VMV_S_X(REG_V13, REG_T4) + /* m1+ nfin */ + "ld t0, 16(%[m])\n\t" + "ld t1, 24(%[m])\n\t" + /* Split m into 24, 52, 52 */ + SPLIT_130(t2, t3, t4, t0, t1, t6, t5) + VMV_S_X(REG_V14, REG_T2) + VMV_S_X(REG_V15, REG_T3) + VMV_S_X(REG_V16, REG_T4) + /* h += m0 + nfin */ + VADD_VV(REG_V8, REG_V8, REG_V11) + VADD_VV(REG_V9, REG_V9, REG_V12) + VADD_VV(REG_V10, REG_V10, REG_V13) + /* h[0]|m1[0], h[1]|m1[1], h[2]|m1[2] */ + VSLIDEUP_VI(REG_V8, REG_V14, 1) + VSLIDEUP_VI(REG_V9, REG_V15, 1) + VSLIDEUP_VI(REG_V10, REG_V16, 1) + + /* hm[0] * r2r[0] */ + 
VMUL_VV(REG_V11, REG_V8, REG_V2) + VMULHU_VV(REG_V12, REG_V8, REG_V2) + MUL_RES_REDIS(REG_V11, REG_V12, REG_V18) + + /* + hm[0] * r2r[1] */ + VMUL_VV(REG_V14, REG_V8, REG_V3) + VMULHU_VV(REG_V13, REG_V8, REG_V3) + MUL_RES_REDIS(REG_V14, REG_V13, REG_V18) + VADD_VV(REG_V12, REG_V12, REG_V14) + /* + hm[1] * r2r[0] */ + VMUL_VV(REG_V14, REG_V9, REG_V2) + VMULHU_VV(REG_V15, REG_V9, REG_V2) + MUL_RES_REDIS(REG_V14, REG_V15, REG_V18) + VADD_VV(REG_V12, REG_V12, REG_V14) + VADD_VV(REG_V13, REG_V13, REG_V15) + + /* + hm[0] * r2r[2] */ + VMUL_VV(REG_V15, REG_V8, REG_V4) + VMULHU_VV(REG_V14, REG_V8, REG_V4) + MUL_RES_REDIS(REG_V15, REG_V14, REG_V18) + VADD_VV(REG_V13, REG_V13, REG_V15) + /* + hm[1] * r2r[1] */ + VMUL_VV(REG_V15, REG_V9, REG_V3) + VMULHU_VV(REG_V16, REG_V9, REG_V3) + MUL_RES_REDIS(REG_V15, REG_V16, REG_V18) + VADD_VV(REG_V13, REG_V13, REG_V15) + VADD_VV(REG_V14, REG_V14, REG_V16) + /* + hm[2] * r2r[0] */ + VMUL_VV(REG_V15, REG_V10, REG_V2) + VMULHU_VV(REG_V16, REG_V10, REG_V2) + MUL_RES_REDIS(REG_V15, REG_V16, REG_V18) + VADD_VV(REG_V13, REG_V13, REG_V15) + VADD_VV(REG_V14, REG_V14, REG_V16) + + /* + hm[1] * r2r[2] */ + VMUL_VV(REG_V16, REG_V9, REG_V4) + VMULHU_VV(REG_V15, REG_V9, REG_V4) + MUL_RES_REDIS(REG_V16, REG_V15, REG_V18) + VADD_VV(REG_V14, REG_V14, REG_V16) + /* + hm[2] * r2r[1] */ + VMUL_VV(REG_V16, REG_V10, REG_V3) + VMULHU_VV(REG_V17, REG_V10, REG_V3) + MUL_RES_REDIS(REG_V16, REG_V17, REG_V18) + VADD_VV(REG_V14, REG_V14, REG_V16) + VADD_VV(REG_V15, REG_V15, REG_V17) + + /* + hm[2] * r2r[2] */ + VMUL_VV(REG_V17, REG_V10, REG_V4) + VADD_VV(REG_V15, REG_V15, REG_V17) + + /* Get m1 * r down */ + VSLIDEDOWN_VI(REG_V18, REG_V11, 1) + VSLIDEDOWN_VI(REG_V19, REG_V12, 1) + VSLIDEDOWN_VI(REG_V20, REG_V13, 1) + VSLIDEDOWN_VI(REG_V21, REG_V14, 1) + VSLIDEDOWN_VI(REG_V22, REG_V15, 1) + + /* Add (h + m0) * r^2 + m1 * r */ + VADD_VV(REG_V11, REG_V11, REG_V18) + VADD_VV(REG_V12, REG_V12, REG_V19) + VADD_VV(REG_V13, REG_V13, REG_V20) + VADD_VV(REG_V14, REG_V14, REG_V21) + VADD_VV(REG_V15, REG_V15, REG_V22) + + /* h' % 2^130-5 */ + VSLL_VI(REG_V8, REG_V14, 26) + VSLL_VI(REG_V9, REG_V15, 26) + VAND_VX(REG_V8, REG_V8, REG_A4) + VAND_VX(REG_V9, REG_V9, REG_A4) + VSRL_VI(REG_V10, REG_V13, 26) + VAND_VX(REG_V13, REG_V13, REG_A5) + VSRL_VI(REG_V14, REG_V14, 26) + VSRL_VI(REG_V15, REG_V15, 26) + VADD_VV(REG_V10, REG_V10, REG_V8) + VADD_VV(REG_V14, REG_V14, REG_V9) + + VSLL_VI(REG_V16, REG_V10, 2) + VSLL_VI(REG_V17, REG_V14, 2) + VSLL_VI(REG_V18, REG_V15, 2) + VADD_VV(REG_V10, REG_V10, REG_V16) + VADD_VV(REG_V14, REG_V14, REG_V17) + VADD_VV(REG_V15, REG_V15, REG_V18) + + VADD_VV(REG_V8, REG_V11, REG_V10) + VADD_VV(REG_V9, REG_V12, REG_V14) + VADD_VV(REG_V10, REG_V13, REG_V15) + + /* h'' % 2^130-5 */ + VAND_VX(REG_V11, REG_V10, REG_A4) + VAND_VX(REG_V10, REG_V10, REG_A5) + VSRL_VI(REG_V12, REG_V11, 24) + VSRL_VI(REG_V11, REG_V11, 26) + VADD_VV(REG_V11, REG_V11, REG_V12) + VADD_VV(REG_V8, REG_V8, REG_V11) + + "addi %[bytes], %[bytes], -32\n\t" + "addi %[m], %[m], 32\n\t" + "bgez %[bytes], L_poly1305_riscv64_vec_loop_%=\n\t" + + VMV_X_S(REG_S3, REG_V8) + VMV_X_S(REG_S4, REG_V9) + VMV_X_S(REG_S5, REG_V10) + + "sd s3, 0(%[h])\n\t" + "sd s4, 8(%[h])\n\t" + "sd s5, 16(%[h])\n\t" + + "\n" + "L_poly1305_riscv64_vec_done_%=:\n\t" + "addi %[bytes], %[bytes], 32\n\t" + : [bytes] "+r" (bytes), [m] "+r" (m) + : [r2] "r" (ctx->r2), [h] "r" (ctx->h) + : "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s3", "s4", "s5", "a4", "a5", "a6" + ); +#endif + poly1305_blocks_riscv64_16(ctx, m, bytes, 1); +} + +void 
poly1305_block_riscv64(Poly1305* ctx, const unsigned char *m) +{ + poly1305_blocks_riscv64_16(ctx, m, POLY1305_BLOCK_SIZE, 1); +} + +#if defined(POLY130564) +static word64 clamp[] = { + 0x0ffffffc0fffffff, + 0x0ffffffc0ffffffc, +}; +#endif /* POLY130564 */ + + +int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) +{ + if (key == NULL) + return BAD_FUNC_ARG; + +#ifdef CHACHA_AEAD_TEST + word32 k; + printf("Poly key used:\n"); + for (k = 0; k < keySz; k++) { + printf("%02x", key[k]); + if ((k+1) % 8 == 0) + printf("\n"); + } + printf("\n"); +#endif + + if (keySz != 32 || ctx == NULL) + return BAD_FUNC_ARG; + + __asm__ __volatile__ ( + /* Load key material */ + "ld t0, 0(%[key])\n\t" + "ld t1, 8(%[key])\n\t" + "ld t2, 16(%[key])\n\t" + "ld t3, 24(%[key])\n\t" + /* Load clamp */ + "ld t4, 0(%[clamp])\n\t" + "ld t5, 8(%[clamp])\n\t" + /* Save pad for later */ + "sd t2, 0(%[ctx_pad])\n\t" + "sd t3, 8(%[ctx_pad])\n\t" + /* Apply clamp */ + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + "and t0, t0, t4\n\t" + "and t1, t1, t5\n\t" + /* Store r */ + "sd t0, 0(%[ctx_r])\n\t" + "sd t1, 8(%[ctx_r])\n\t" + +#ifdef WOLFSSL_RISCV_VECTOR + "li a6, 0xfffffffffffff\n\t" + /* Split r into parts less than 64 */ + SPLIT_128(t2, t3, t4, t0, t1, t5) + /* Store r */ + "sd t2, 8(%[ctx_r2])\n\t" + "sd t3, 24(%[ctx_r2])\n\t" + "sd t4, 40(%[ctx_r2])\n\t" + + /* r * r */ + /* r[0] * r[0] - 0, 1 */ + "mul t2, t0, t0\n\t" + "mulhu t3, t0, t0\n\t" + /* + r[0] * r[1] - 1, 2 */ + "mul t5, t1, t0\n\t" + "mulhu t6, t1, t0\n\t" + "add t3, t3, t5\n\t" + "sltu s1, t3, t5\n\t" + "add t4, t6, s1\n\t" + /* + r[1] * r[0] - 1, 2 */ + "add t3, t3, t5\n\t" + "sltu s1, t3, t5\n\t" + "add t4, t4, s1\n\t" + "add t4, t4, t6\n\t" + "sltu t5, t4, t6\n\t" + /* + r[1] * r[1] - 2, 3 */ + "mul s1, t1, t1\n\t" + "mulhu t6, t1, t1\n\t" + "add t4, t4, s1\n\t" + "sltu s1, t4, s1\n\t" + "add t5, t5, t6\n\t" + "add t5, t5, s1\n\t" + /* (r * r) % (2 ^ 130 - 5) */ + "andi t6, t4, -4\n\t" + "andi t4, t4, 3\n\t" + /* r[0..129] + r[130-191] * 4 */ + "add t2, t2, t6\n\t" + "sltu s1, t2, t6\n\t" + "add t3, t3, s1\n\t" + "sltu s1, t3, s1\n\t" + "add t4, t4, s1\n\t" + /* r[0..129] + r[130-193] */ + "srli t6, t6, 2\n\t" + "slli s1, t5, 62\n\t" + "or t6, t6, s1\n\t" + "add t2, t2, t6\n\t" + "sltu s1, t2, t6\n\t" + "add t3, t3, s1\n\t" + "sltu s1, t3, s1\n\t" + "add t4, t4, s1\n\t" + /* r[64..129] + r[194-253] * 4 */ + "add t3, t3, t5\n\t" + "sltu s1, t3, t5\n\t" + "add t4, t4, s1\n\t" + /* r[64..129] + r[194-253] */ + "srli t5, t5, 2\n\t" + "add t3, t3, t5\n\t" + "sltu s1, t3, t5\n\t" + "add t4, t4, s1\n\t" + /* Split r^2 into parts less than 64 */ + SPLIT_130(t0, t1, t2, t2, t3, t4, t5) + /* Store r^2 */ + "sd t0, 0(%[ctx_r2])\n\t" + "sd t1, 16(%[ctx_r2])\n\t" + "sd t2, 32(%[ctx_r2])\n\t" +#endif + + /* h (accumulator) = 0 */ + "sd x0, 0(%[ctx_h])\n\t" + "sd x0, 8(%[ctx_h])\n\t" + "sd x0, 16(%[ctx_h])\n\t" + /* Zero leftover */ + "sd x0, (%[ctx_leftover])\n\t" + : + : [clamp] "r" (clamp), [key] "r" (key), [ctx_r] "r" (ctx->r), +#ifdef WOLFSSL_RISCV_VECTOR + [ctx_r2] "r" (ctx->r2), +#endif + [ctx_h] "r" (ctx->h), [ctx_pad] "r" (ctx->pad), + [ctx_leftover] "r" (&ctx->leftover) + : "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s1" +#ifdef WOLFSSL_RISCV_VECTOR + , "a6" +#endif + ); + + return 0; +} + + +int wc_Poly1305Final(Poly1305* ctx, byte* mac) +{ + + if (ctx == NULL) + return BAD_FUNC_ARG; + + /* process the remaining block */ + if (ctx->leftover) { + size_t i = ctx->leftover; + ctx->buffer[i++] = 1; + for (; i < POLY1305_BLOCK_SIZE; 
i++) + ctx->buffer[i] = 0; + poly1305_blocks_riscv64_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0); + } + + __asm__ __volatile__ ( + /* Load raw h and padding. */ + "ld t0, %[ctx_h_0]\n\t" + "ld t1, %[ctx_h_1]\n\t" + "ld t2, %[ctx_h_2]\n\t" + "ld t3, %[ctx_pad_0]\n\t" + "ld t4, %[ctx_pad_1]\n\t" + + /* Shrink h to 2,64,64. */ + "slli t5, t1, 52\n\t" + "slli t6, t2, 40\n\t" + "srli t1, t1, 12\n\t" + "srli t2, t2, 24\n\t" + "add t1, t1, t6\n\t" + "sltu t6, t1, t6\n\t" + "add t2, t2, t6\n\t" + "add t0, t0, t5\n\t" + "sltu t5, t0, t5\n\t" + "add t1, t1, t5\n\t" + "sltu t5, t1, t5\n\t" + "add t2, t2, t5\n\t" + + /* Add padding to h */ + "add t0, t0, t3\n\t" + "sltu t3, t0, t3\n\t" + "add t1, t1, t3\n\t" + "sltu t3, t1, t3\n\t" + "add t2, t2, t3\n\t" + "add t1, t1, t4\n\t" + "sltu t4, t1, t4\n\t" + "add t2, t2, t4\n\t" + + /* Check if h is larger than p */ + "addi t3, t0, 5\n\t" + "sltiu t3, t3, 5\n\t" + "add t4, t1, t3\n\t" + "sltu t3, t4, t3\n\t" + "add t4, t2, t3\n\t" + /* Check if h+5 is larger than 2^130 */ + "addi t4, t4, -4\n\t" + "srli t4, t4, 63\n\t" + "addi t4, t4, -1\n\t" + "andi t4, t4, 5\n\t" + "add t0, t0, t4\n\t" + "sltu t3, t0, t4\n\t" + "add t1, t1, t3\n\t" + "sltu t3, t1, t3\n\t" + "add t2, t2, t3\n\t" + "andi t2, t2, 3\n\t" + "sd t0, 0(%[mac])\n\t" + "sd t1, 8(%[mac])\n\t" + /* Zero out h. */ + "sd x0, %[ctx_h_0]\n\t" + "sd x0, %[ctx_h_1]\n\t" + "sd x0, %[ctx_h_2]\n\t" + /* Zero out r. */ + "sd x0, %[ctx_r_0]\n\t" + "sd x0, %[ctx_r_1]\n\t" + /* Zero out pad. */ + "ld t3, %[ctx_pad_0]\n\t" + "ld t4, %[ctx_pad_1]\n\t" + : [mac] "+r" (mac) + : [ctx_pad_0] "m" (ctx->pad[0]), [ctx_pad_1] "m" (ctx->pad[1]), + [ctx_h_0] "m" (ctx->h[0]), [ctx_h_1] "m" (ctx->h[1]), + [ctx_h_2] "m" (ctx->h[2]), + [ctx_r_0] "m" (ctx->r[0]), [ctx_r_1] "m" (ctx->r[1]) + : "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6" + ); + + return 0; +} + +#endif /* HAVE_POLY1305 */ +#endif /* WOLFSSL_RISCV_ASM */ diff --git a/wolfssl/wolfcrypt/poly1305.h b/wolfssl/wolfcrypt/poly1305.h index 0bf2865b14..a765a8775d 100644 --- a/wolfssl/wolfcrypt/poly1305.h +++ b/wolfssl/wolfcrypt/poly1305.h @@ -88,8 +88,7 @@ typedef struct Poly1305 { size_t leftover; unsigned char finished; unsigned char started; -#else -#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) +#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) ALIGN128 word64 r64[2]; ALIGN128 word32 r[5]; ALIGN128 word32 r_2[5]; /* r^2 */ @@ -97,6 +96,17 @@ typedef struct Poly1305 { ALIGN128 word32 h[5]; word32 pad[4]; word64 leftover; + unsigned char buffer[POLY1305_BLOCK_SIZE]; + unsigned char finished; +#elif defined(WOLFSSL_RISCV_ASM) + word64 r[2]; +#ifdef WOLFSSL_RISCV_VECTOR + word64 r2[6]; +#endif + word64 h[3]; + word64 pad[2]; + size_t leftover; + unsigned char buffer[POLY1305_BLOCK_SIZE]; #else #if defined(POLY130564) word64 r[3]; @@ -108,10 +118,9 @@ typedef struct Poly1305 { word32 pad[4]; #endif size_t leftover; -#endif /* WOLFSSL_ARMASM */ unsigned char buffer[POLY1305_BLOCK_SIZE]; unsigned char finished; -#endif +#endif /* WOLFSSL_ARMASM */ } Poly1305; /* does init */ @@ -141,6 +150,15 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m); #endif +#if defined(WOLFSSL_RISCV_ASM) +#define poly1305_blocks poly1305_blocks_riscv64 +#define poly1305_block poly1305_block_riscv64 + +void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m, + size_t bytes); +void poly1305_block_riscv64(Poly1305* ctx, const unsigned char *m); +#endif + #ifdef __cplusplus } /* extern "C" 
*/ #endif diff --git a/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h b/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h index 7ca7291d46..89a84bf6d7 100644 --- a/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h +++ b/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h @@ -150,6 +150,15 @@ #endif /* WOLFSSL_RISCV_BASE_BIT_MANIPULATION */ +#ifdef WOLFSSL_RISCV_BIT_MANIPULATION_TERNARY + +/* rd = (rs1|rs3 >> imm)[0..63] */ +#define FSRI(rd, rs1, rs3, imm) \ + ASM_WORD((0b1 << 26) | (0b101 << 12) | (0b0110011 << 0) | \ + (rs3 << 27) | (imm << 20) | (rs1 << 15) | (rd << 7)) + +#endif + /* * Load and store */ @@ -242,6 +251,7 @@ ASM_WORD((0b000000 << 26) | (0b1 << 25) | \ (0b000 << 12) | (0b1010111 << 0) | \ (vs2 << 20) | (vs1 << 15) | (vd << 7)) + /* vd = vs1 ^ vs2 */ #define VXOR_VV(vd, vs1, vs2) \ ASM_WORD((0b001011 << 26) | (0b1 << 25) | \ @@ -252,6 +262,11 @@ ASM_WORD((0b001001 << 26) | (0b1 << 25) | \ (0b000 << 12) | (0b1010111 << 0) | \ (vd << 7) | (vs1 << 15) | (vs2 << 20)) +/* vd = vs1 & rs2 */ +#define VAND_VX(vd, vs2, rs1) \ + ASM_WORD((0b001001 << 26) | (0b1 << 25) | \ + (0b100 << 12) | (0b1010111 << 0) | \ + (vd << 7) | (rs1 << 15) | (vs2 << 20)) /* vd = vs1 | vs2 */ #define VOR_VV(vd, vs1, vs2) \ ASM_WORD((0b001010 << 26) | (0b1 << 25) | \ @@ -259,6 +274,18 @@ (vd << 7) | (vs1 << 15) | (vs2 << 20)) +/* vd = LOW(vs1 * vs2) */ +#define VMUL_VV(vd, vs1, vs2) \ + ASM_WORD((0b100101 << 26) | (0b1 << 25) | \ + (0b010 << 12) | (0b1010111 << 0) | \ + (vs2 << 20) | (vs1 << 15) | (vd << 7)) +/* vd = HIGH(vs1 * vs2) - unsigned * unsigned */ +#define VMULHU_VV(vd, vs1, vs2) \ + ASM_WORD((0b100100 << 26) | (0b1 << 25) | \ + (0b010 << 12) | (0b1010111 << 0) | \ + (vs2 << 20) | (vs1 << 15) | (vd << 7)) + + /* * Permute */
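Notes for review:

Both the scalar routine (poly1305_blocks_riscv64_16) and the vector loop keep r, h and each 16-byte block in three limbs of 52, 52 and 26 bits (the 0xfffffffffffff mask held in a6), so the mul/mulhu partial products can be accumulated without per-addition carry tracking. A plain C sketch of the SPLIT_130 step, for reference only; lo and hi are the two little-endian 64-bit words of a block and hibit is the notLast padding bit added at 2^128:

#include <stdint.h>

/* Split a 128-bit block (lo, hi) plus the 2^128 padding bit into three
 * limbs of 52, 52 and 26 bits, mirroring the SPLIT_130 macro. */
static void split_130(uint64_t lo, uint64_t hi, uint64_t hibit,
                      uint64_t limb[3])
{
    const uint64_t mask52 = 0xfffffffffffffULL;     /* 2^52 - 1 */

    limb[0] = lo & mask52;                          /* bits   0..51  */
    limb[1] = ((lo >> 52) | (hi << 12)) & mask52;   /* bits  52..103 */
    limb[2] = (hi >> 40) | (hibit << 24);           /* bits 104..127 plus
                                                       the 2^128 bit */
}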
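The finalization in wc_Poly1305Final decides without a data-dependent branch whether the accumulator still has to be folded below p = 2^130 - 5. A C sketch of that addi/sltiu/srli sequence, assuming h has already been repacked into 64-bit limbs h2:h1:h0 as the routine does:

#include <stdint.h>

/* Return the constant (0 or 5) that folds h below p = 2^130 - 5.
 * Adding 5 overflows 2^130 exactly when h >= p, so only the carries of
 * h + 5 into the top limb need to be computed. */
static uint64_t poly1305_fold5(uint64_t h0, uint64_t h1, uint64_t h2)
{
    uint64_t c, g1, g2;

    c  = (uint64_t)(h0 + 5 < 5);        /* carry out of h0 + 5 (sltiu) */
    g1 = h1 + c;
    c  = (uint64_t)(g1 < c);            /* carry into the top limb     */
    g2 = h2 + c;

    /* g2 >= 4 means h + 5 >= 2^130, i.e. h >= p: select 5, else 0. */
    return (((g2 - 4) >> 63) - 1) & 5;
}

The selected constant is added back into h, the result is truncated to 130 bits (the final andi t2, t2, 3), and the low 128 bits are stored as the tag.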
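The new code paths sit underneath the unchanged public API, so a quick functional check on RISC-V hardware or under QEMU only needs the usual one-shot calls. A minimal sketch, assuming a build configured with the RISC-V assembly options handled in configure.ac above; the key and message are illustrative values, not a published test vector:

#include <stdio.h>
#include <string.h>

#include <wolfssl/options.h>
#include <wolfssl/wolfcrypt/poly1305.h>

int main(void)
{
    Poly1305      ctx;
    unsigned char key[32] = { 1 };     /* illustrative key only */
    const char*   msg     = "RISC-V Poly1305 smoke test";
    unsigned char mac[16];             /* POLY1305_DIGEST_SIZE bytes */
    size_t        i;
    int           ret;

    ret = wc_Poly1305SetKey(&ctx, key, (word32)sizeof(key));
    if (ret == 0)
        ret = wc_Poly1305Update(&ctx, (const byte*)msg, (word32)strlen(msg));
    if (ret == 0)
        ret = wc_Poly1305Final(&ctx, mac);
    if (ret != 0) {
        printf("Poly1305 failed: %d\n", ret);
        return 1;
    }

    for (i = 0; i < sizeof(mac); i++)
        printf("%02x", mac[i]);
    printf("\n");
    return 0;
}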