From 67528f91b3f022bee4c43c4bdb6938de4d5792a5 Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Mon, 23 Sep 2024 09:05:17 +1000
Subject: [PATCH 01/11] Dilithium: fixes

Fixes to hint error detection.
Fix public key decode to fail when DER length is zero for the public key
data.
---
 wolfcrypt/src/dilithium.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wolfcrypt/src/dilithium.c b/wolfcrypt/src/dilithium.c
index da465efcc3..8b29994b0c 100644
--- a/wolfcrypt/src/dilithium.c
+++ b/wolfcrypt/src/dilithium.c
@@ -3411,7 +3411,7 @@ static int dilithium_check_hint(const byte* h, byte k, byte omega)
             }
         }
         /* Ensure the last hint is less than the current hint. */
-        else if (h[i - 1] > h[i]) {
+        else if (h[i - 1] >= h[i]) {
             ret = SIG_VERIFY_E;
             break;
         }
@@ -9654,7 +9654,7 @@ int wc_Dilithium_PublicKeyDecode(const byte* input, word32* inOutIdx,
         ret = dilitihium_get_der_length(input, &idx, &length, inSz);
     }
     if (ret == 0) {
-        if (input[idx] != 0) {
+        if ((input[idx] != 0) || (length == 0)) {
             ret = ASN_PARSE_E;
         }
         idx++;

From 634e547fba66bab89334d9b436efc9e615d06e62 Mon Sep 17 00:00:00 2001
From: Colton Willey
Date: Mon, 23 Sep 2024 10:04:33 -0700
Subject: [PATCH 02/11] Initial implementation of new option to always copy over key to SSL ctx

---
 src/internal.c               | 7 +++++++
 src/ssl.c                    | 7 +++++++
 wolfssl/wolfcrypt/settings.h | 5 +++++
 3 files changed, 19 insertions(+)

diff --git a/src/internal.c b/src/internal.c
index 2fc63753f6..bae4046772 100644
--- a/src/internal.c
+++ b/src/internal.c
@@ -6829,7 +6829,14 @@ int SetSSL_CTX(WOLFSSL* ssl, WOLFSSL_CTX* ctx, int writeDup)
         ssl->buffers.certChainCnt = ctx->certChainCnt;
 #endif
 #ifndef WOLFSSL_BLIND_PRIVATE_KEY
+#ifdef WOLFSSL_COPY_KEY
+    AllocCopyDer(&ssl->buffers.key, ctx->privateKey->buffer,
+                 ctx->privateKey->length, ctx->privateKey->type,
+                 ctx->privateKey->heap);
+    ssl->buffers.weOwnKey = 1;
+#else
     ssl->buffers.key = ctx->privateKey;
+#endif
 #else
     if (ctx->privateKey != NULL) {
         AllocCopyDer(&ssl->buffers.key, ctx->privateKey->buffer,
diff --git a/src/ssl.c b/src/ssl.c
index 264f2c04ec..310a1ed2d5 100644
--- a/src/ssl.c
+++ b/src/ssl.c
@@ -20410,7 +20410,14 @@ WOLFSSL_CTX* wolfSSL_set_SSL_CTX(WOLFSSL* ssl, WOLFSSL_CTX* ctx)
         ssl->buffers.certChainCnt = ctx->certChainCnt;
 #endif
 #ifndef WOLFSSL_BLIND_PRIVATE_KEY
+#ifdef WOLFSSL_COPY_KEY
+    AllocCopyDer(&ssl->buffers.key, ctx->privateKey->buffer,
+                 ctx->privateKey->length, ctx->privateKey->type,
+                 ctx->privateKey->heap);
+    ssl->buffers.weOwnKey = 1;
+#else
     ssl->buffers.key = ctx->privateKey;
+#endif
 #else
     if (ctx->privateKey != NULL) {
         AllocCopyDer(&ssl->buffers.key, ctx->privateKey->buffer,
diff --git a/wolfssl/wolfcrypt/settings.h b/wolfssl/wolfcrypt/settings.h
index 03cd5e5501..07c4f746b3 100644
--- a/wolfssl/wolfcrypt/settings.h
+++ b/wolfssl/wolfcrypt/settings.h
@@ -3581,6 +3581,11 @@ extern void uITRON4_free(void *p) ;
     #define WOLFSSL_COPY_CERT
 #endif
 
+#if defined(OPENSSL_ALL) && !defined(WOLFSSL_NO_COPY_KEY)
+    #undef WOLFSSL_COPY_KEY
+    #define WOLFSSL_COPY_KEY
+#endif
+
 /*
  * Keeps the "Finished" messages after a TLS handshake for use as the so-called
  * "tls-unique" channel binding. 
See comment in internal.h around clientFinished From cad2bbd7a7d9200f40e7fa8446c75bfabd196db3 Mon Sep 17 00:00:00 2001 From: Colton Willey Date: Mon, 23 Sep 2024 10:18:23 -0700 Subject: [PATCH 03/11] Add NULL checks on key copy --- src/internal.c | 16 ++++++++++++---- src/ssl.c | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/internal.c b/src/internal.c index bae4046772..d05238ec89 100644 --- a/src/internal.c +++ b/src/internal.c @@ -6830,10 +6830,18 @@ int SetSSL_CTX(WOLFSSL* ssl, WOLFSSL_CTX* ctx, int writeDup) #endif #ifndef WOLFSSL_BLIND_PRIVATE_KEY #ifdef WOLFSSL_COPY_KEY - AllocCopyDer(&ssl->buffers.key, ctx->privateKey->buffer, - ctx->privateKey->length, ctx->privateKey->type, - ctx->privateKey->heap); - ssl->buffers.weOwnKey = 1; + if (ctx->privateKey != NULL) { + if (ssl->buffers.key != NULL) { + FreeDer(&ssl->buffers.key); + } + AllocCopyDer(&ssl->buffers.key, ctx->privateKey->buffer, + ctx->privateKey->length, ctx->privateKey->type, + ctx->privateKey->heap); + ssl->buffers.weOwnKey = 1; + } + else { + ssl->buffers.key = ctx->privateKey; + } #else ssl->buffers.key = ctx->privateKey; #endif diff --git a/src/ssl.c b/src/ssl.c index 310a1ed2d5..de97c8e5f1 100644 --- a/src/ssl.c +++ b/src/ssl.c @@ -20411,10 +20411,18 @@ WOLFSSL_CTX* wolfSSL_set_SSL_CTX(WOLFSSL* ssl, WOLFSSL_CTX* ctx) #endif #ifndef WOLFSSL_BLIND_PRIVATE_KEY #ifdef WOLFSSL_COPY_KEY - AllocCopyDer(&ssl->buffers.key, ctx->privateKey->buffer, - ctx->privateKey->length, ctx->privateKey->type, - ctx->privateKey->heap); - ssl->buffers.weOwnKey = 1; + if (ctx->privateKey != NULL) { + if (ssl->buffers.key != NULL) { + FreeDer(&ssl->buffers.key); + } + AllocCopyDer(&ssl->buffers.key, ctx->privateKey->buffer, + ctx->privateKey->length, ctx->privateKey->type, + ctx->privateKey->heap); + ssl->buffers.weOwnKey = 1; + } + else { + ssl->buffers.key = ctx->privateKey; + } #else ssl->buffers.key = ctx->privateKey; #endif From 1a4b821c6417d6dd06707d6c3e8bd04e89be85c8 Mon Sep 17 00:00:00 2001 From: Colton Willey Date: Mon, 23 Sep 2024 11:46:19 -0700 Subject: [PATCH 04/11] Add pthread link for liboqs testing --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index e07b0bb115..31a85bcfe3 100644 --- a/configure.ac +++ b/configure.ac @@ -1237,7 +1237,7 @@ AC_ARG_WITH([liboqs], tryliboqsdir="/usr/local" fi - CPPFLAGS="$AM_CPPFLAGS -DHAVE_LIBOQS -DHAVE_TLS_EXTENSIONS -I$tryliboqsdir/include" + CPPFLAGS="$AM_CPPFLAGS -DHAVE_LIBOQS -DHAVE_TLS_EXTENSIONS -I$tryliboqsdir/include -pthread" LDFLAGS="$AM_LDFLAGS $LDFLAGS -L$tryliboqsdir/lib" AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], [[ OQS_init(); ]])], [ liboqs_linked=yes ],[ liboqs_linked=no ]) From 2323a5cf59d67c19895ba04cf959c197e133694e Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 26 Sep 2024 18:43:34 +1000 Subject: [PATCH 05/11] ARM32 ChaCha20, Poly1305: assembly code Add assembly code for ChaCha20 and Poly1305 on ARM32 when no NEON available. 
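
For reference, the scalar routines added here implement the standard
ChaCha20 quarter round (RFC 8439): the ror #16/#20/#24/#25 sequences in
the assembly are the left-rotations by 16/12/8/7 written as right
rotations, and each loop iteration performs one column round plus one
diagonal round. The Poly1305 routines likewise implement the standard
h = (h + m) * r mod 2^130 - 5 block update, with the 2^128 padding bit
carried in the notLast argument. A minimal C sketch of the quarter round
being scheduled (illustrative only, not part of this change; the names
ROTL32 and quarter_round are made up here):

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One ChaCha quarter round on four 32-bit state words. */
    static void quarter_round(uint32_t* a, uint32_t* b, uint32_t* c,
                              uint32_t* d)
    {
        *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
        *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
    }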
--- src/include.am | 12 +- wolfcrypt/src/chacha.c | 3 +- wolfcrypt/src/poly1305.c | 11 +- wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c | 2 +- wolfcrypt/src/port/arm/armv8-32-chacha-asm.S | 522 ++++++++++++++++ .../src/port/arm/armv8-32-chacha-asm_c.c | 569 ++++++++++++++++++ .../src/port/arm/armv8-32-poly1305-asm.S | 356 +++++++++++ .../src/port/arm/armv8-32-poly1305-asm_c.c | 388 ++++++++++++ wolfcrypt/src/port/arm/armv8-32-sha3-asm.S | 110 ++-- wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c | 41 +- wolfcrypt/src/port/arm/armv8-chacha.c | 117 +++- wolfcrypt/src/port/arm/armv8-poly1305.c | 126 +++- wolfssl/wolfcrypt/chacha.h | 8 +- wolfssl/wolfcrypt/poly1305.h | 21 +- 14 files changed, 2177 insertions(+), 109 deletions(-) create mode 100644 wolfcrypt/src/port/arm/armv8-32-chacha-asm.S create mode 100644 wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c create mode 100644 wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S create mode 100644 wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c diff --git a/src/include.am b/src/include.am index c3d8376a1d..dbda409a2f 100644 --- a/src/include.am +++ b/src/include.am @@ -924,8 +924,10 @@ if BUILD_ARMASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-poly1305.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305.c if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm.S endif !BUILD_ARMASM_INLINE endif @@ -999,17 +1001,17 @@ endif if BUILD_CHACHA src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha.c -if BUILD_ARMASM_NEON -src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c -else if BUILD_ARMASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha.c if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-chacha-asm.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm.S endif !BUILD_ARMASM_INLINE -endif BUILD_ARMASM +else if BUILD_RISCV_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-chacha.c endif BUILD_RISCV_ASM @@ -1018,7 +1020,7 @@ if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha_asm.S endif BUILD_INTELASM endif !BUILD_X86_ASM -endif !BUILD_ARMASM_NEON +endif !BUILD_ARMASM if BUILD_POLY1305 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c endif BUILD_POLY1305 diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index f7ee6bba38..84b26eb564 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -72,8 +72,7 @@ Public domain. 
#endif /* HAVE_CHACHA */ -#if defined(WOLFSSL_ARMASM) && (!defined(WOLFSSL_ARMASM_NO_NEON) || \ - defined(__thumb__)) +#if defined(WOLFSSL_ARMASM) /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */ #elif defined(WOLFSSL_RISCV_ASM) diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c index 48529d78c1..718289c4fd 100644 --- a/wolfcrypt/src/poly1305.c +++ b/wolfcrypt/src/poly1305.c @@ -232,7 +232,7 @@ extern void poly1305_final_avx2(Poly1305* ctx, byte* mac); } #endif/* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */ /* if not 64 bit then use 32 bit */ -#elif !defined(WOLFSSL_ARMASM) || !defined(__thumb__) +#elif !defined(WOLFSSL_ARMASM) static word32 U8TO32(const byte *p) { @@ -269,8 +269,7 @@ static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8]) } -#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \ - !defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM) +#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) /* This local function operates on a message with a given number of bytes with a given ctx pointer to a Poly1305 structure. @@ -789,8 +788,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) return 0; } -#endif /* (!WOLFSSL_ARMASM || (!__aarch64__ && !__thumb__)) && - * !WOLFSSL_RISCV_ASM */ +#endif /* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) @@ -885,8 +883,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) /* process full blocks */ if (bytes >= POLY1305_BLOCK_SIZE) { size_t want = ((size_t)bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1)); -#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \ - !defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM) +#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) int ret; ret = poly1305_blocks(ctx, m, want); if (ret != 0) diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c index 97edaf4a9b..f8ba89ac09 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c @@ -411,7 +411,7 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) static const uint32_t L_AES_ARM32_rcon[] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1b000000, 0x36000000, + 0x1b000000, 0x36000000, }; void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks); diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S b/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S new file mode 100644 index 0000000000..77ec219081 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S @@ -0,0 +1,522 @@ +/* armv8-32-chacha-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./chacha/chacha.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE +#ifdef HAVE_CHACHA + .text + .align 4 + .globl wc_chacha_setiv + .type wc_chacha_setiv, %function +wc_chacha_setiv: + push {r4, lr} + add r3, r0, #52 + ldr r4, [r1] + ldr r12, [r1, #4] + ldr lr, [r1, #8] + str r2, [r0, #48] +#ifdef BIG_ENDIAN_ORDER + rev r4, r4 + rev r12, r12 + rev lr, lr +#endif /* BIG_ENDIAN_ORDER */ + stm r3, {r4, r12, lr} + pop {r4, pc} + .size wc_chacha_setiv,.-wc_chacha_setiv + .text + .type L_chacha_arm32_constants, %object + .size L_chacha_arm32_constants, 32 + .align 4 +L_chacha_arm32_constants: + .word 0x61707865 + .word 0x3120646e + .word 0x79622d36 + .word 0x6b206574 + .word 0x61707865 + .word 0x3320646e + .word 0x79622d32 + .word 0x6b206574 + .text + .align 4 + .globl wc_chacha_setkey + .type wc_chacha_setkey, %function +wc_chacha_setkey: + push {r4, r5, lr} + adr r3, L_chacha_arm32_constants + subs r2, r2, #16 + add r3, r3, r2 + # Start state with constants + ldm r3, {r4, r5, r12, lr} + stm r0!, {r4, r5, r12, lr} + # Next is first 16 bytes of key. + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r12, [r1, #8] + ldr lr, [r1, #12] +#ifdef BIG_ENDIAN_ORDER + rev r4, r4 + rev r5, r5 + rev r12, r12 + rev lr, lr +#endif /* BIG_ENDIAN_ORDER */ + stm r0!, {r4, r5, r12, lr} + # Next 16 bytes of key. + beq L_chacha_arm32_setkey_same_keyb_ytes + # Update key pointer for next 16 bytes. + add r1, r1, r2 + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r12, [r1, #8] + ldr lr, [r1, #12] +L_chacha_arm32_setkey_same_keyb_ytes: + stm r0, {r4, r5, r12, lr} + pop {r4, r5, pc} + .size wc_chacha_setkey,.-wc_chacha_setkey +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .align 4 + .globl wc_chacha_crypt_bytes + .type wc_chacha_crypt_bytes, %function +wc_chacha_crypt_bytes: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #52 + mov lr, r0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r0, [sp, #32] + str r1, [sp, #36] +#else + strd r0, r1, [sp, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r2, [sp, #40] + str r3, [sp, #44] +#else + strd r2, r3, [sp, #40] +#endif +L_chacha_arm32_crypt_block: + # Put x[12]..x[15] onto stack. +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [lr, #48] + ldr r5, [lr, #52] +#else + ldrd r4, r5, [lr, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [lr, #56] + ldr r7, [lr, #60] +#else + ldrd r6, r7, [lr, #56] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #16] + str r5, [sp, #20] +#else + strd r4, r5, [sp, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #24] + str r7, [sp, #28] +#else + strd r6, r7, [sp, #24] +#endif + # Load x[0]..x[12] into registers. + ldm lr, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12} + # 10x 2 full rounds to perform. 
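+        # (Each loop iteration below does one column round and one diagonal
+        # round, i.e. two ChaCha rounds, so ten iterations give all 20 rounds.)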
+ mov lr, #10 + str lr, [sp, #48] +L_chacha_arm32_crypt_loop: + # 0, 4, 8, 12 + # 1, 5, 9, 13 + ldr lr, [sp, #20] + add r0, r0, r4 + add r1, r1, r5 + eor r12, r12, r0 + eor lr, lr, r1 + ror r12, r12, #16 + ror lr, lr, #16 + add r8, r8, r12 + add r9, r9, lr + eor r4, r4, r8 + eor r5, r5, r9 + ror r4, r4, #20 + ror r5, r5, #20 + add r0, r0, r4 + add r1, r1, r5 + eor r12, r12, r0 + eor lr, lr, r1 + ror r12, r12, #24 + ror lr, lr, #24 + add r8, r8, r12 + add r9, r9, lr + eor r4, r4, r8 + eor r5, r5, r9 + ror r4, r4, #25 + ror r5, r5, #25 + str r12, [sp, #16] + str lr, [sp, #20] + # 2, 6, 10, 14 + # 3, 7, 11, 15 + ldr r12, [sp, #24] + ldr lr, [sp, #28] + add r2, r2, r6 + add r3, r3, r7 + eor r12, r12, r2 + eor lr, lr, r3 + ror r12, r12, #16 + ror lr, lr, #16 + add r10, r10, r12 + add r11, r11, lr + eor r6, r6, r10 + eor r7, r7, r11 + ror r6, r6, #20 + ror r7, r7, #20 + add r2, r2, r6 + add r3, r3, r7 + eor r12, r12, r2 + eor lr, lr, r3 + ror r12, r12, #24 + ror lr, lr, #24 + add r10, r10, r12 + add r11, r11, lr + eor r6, r6, r10 + eor r7, r7, r11 + ror r6, r6, #25 + ror r7, r7, #25 + # 3, 4, 9, 14 + # 0, 5, 10, 15 + add r3, r3, r4 + add r0, r0, r5 + eor r12, r12, r3 + eor lr, lr, r0 + ror r12, r12, #16 + ror lr, lr, #16 + add r9, r9, r12 + add r10, r10, lr + eor r4, r4, r9 + eor r5, r5, r10 + ror r4, r4, #20 + ror r5, r5, #20 + add r3, r3, r4 + add r0, r0, r5 + eor r12, r12, r3 + eor lr, lr, r0 + ror r12, r12, #24 + ror lr, lr, #24 + add r9, r9, r12 + add r10, r10, lr + eor r4, r4, r9 + eor r5, r5, r10 + ror r4, r4, #25 + ror r5, r5, #25 + str r12, [sp, #24] + str lr, [sp, #28] + ldr r12, [sp, #16] + ldr lr, [sp, #20] + # 1, 6, 11, 12 + # 2, 7, 8, 13 + add r1, r1, r6 + add r2, r2, r7 + eor r12, r12, r1 + eor lr, lr, r2 + ror r12, r12, #16 + ror lr, lr, #16 + add r11, r11, r12 + add r8, r8, lr + eor r6, r6, r11 + eor r7, r7, r8 + ror r6, r6, #20 + ror r7, r7, #20 + add r1, r1, r6 + add r2, r2, r7 + eor r12, r12, r1 + eor lr, lr, r2 + ror r12, r12, #24 + ror lr, lr, #24 + add r11, r11, r12 + add r8, r8, lr + eor r6, r6, r11 + eor r7, r7, r8 + ror r6, r6, #25 + ror r7, r7, #25 + str lr, [sp, #20] + # Check if we have done enough rounds. + ldr lr, [sp, #48] + subs lr, lr, #1 + str lr, [sp, #48] + bgt L_chacha_arm32_crypt_loop + stm sp, {r8, r9, r10, r11, r12} + ldr lr, [sp, #32] + mov r12, sp + # Add in original state + ldm lr!, {r8, r9, r10, r11} + add r0, r0, r8 + add r1, r1, r9 + add r2, r2, r10 + add r3, r3, r11 + ldm lr!, {r8, r9, r10, r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + ldm r12, {r8, r9} + ldm lr!, {r10, r11} + add r8, r8, r10 + add r9, r9, r11 + stm r12!, {r8, r9} + ldm r12, {r8, r9} + ldm lr!, {r10, r11} + add r8, r8, r10 + add r9, r9, r11 + stm r12!, {r8, r9} + ldm r12, {r8, r9} + ldm lr!, {r10, r11} + add r8, r8, r10 + add r9, r9, r11 + add r10, r10, #1 + stm r12!, {r8, r9} + str r10, [lr, #-8] + ldm r12, {r8, r9} + ldm lr, {r10, r11} + add r8, r8, r10 + add r9, r9, r11 + stm r12, {r8, r9} + ldr r12, [sp, #44] + cmp r12, #0x40 + blt L_chacha_arm32_crypt_lt_block + ldr r12, [sp, #40] + ldr lr, [sp, #36] + # XOR state into 64 bytes. 
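+        # (r12 = input message pointer, lr = output pointer; keystream words
+        # x[0]..x[7] are in r0-r7, x[8]..x[15] are on the stack.)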
+ ldr r8, [r12] + ldr r9, [r12, #4] + ldr r10, [r12, #8] + ldr r11, [r12, #12] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + str r0, [lr] + str r1, [lr, #4] + str r2, [lr, #8] + str r3, [lr, #12] + ldr r8, [r12, #16] + ldr r9, [r12, #20] + ldr r10, [r12, #24] + ldr r11, [r12, #28] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + str r4, [lr, #16] + str r5, [lr, #20] + str r6, [lr, #24] + str r7, [lr, #28] + ldr r4, [sp] + ldr r5, [sp, #4] + ldr r6, [sp, #8] + ldr r7, [sp, #12] + ldr r8, [r12, #32] + ldr r9, [r12, #36] + ldr r10, [r12, #40] + ldr r11, [r12, #44] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + str r4, [lr, #32] + str r5, [lr, #36] + str r6, [lr, #40] + str r7, [lr, #44] + ldr r4, [sp, #16] + ldr r5, [sp, #20] + ldr r6, [sp, #24] + ldr r7, [sp, #28] + ldr r8, [r12, #48] + ldr r9, [r12, #52] + ldr r10, [r12, #56] + ldr r11, [r12, #60] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + str r4, [lr, #48] + str r5, [lr, #52] + str r6, [lr, #56] + str r7, [lr, #60] + ldr r3, [sp, #44] + add r12, r12, #0x40 + add lr, lr, #0x40 + str r12, [sp, #40] + str lr, [sp, #36] + subs r3, r3, #0x40 + ldr lr, [sp, #32] + str r3, [sp, #44] + bne L_chacha_arm32_crypt_block + b L_chacha_arm32_crypt_done +L_chacha_arm32_crypt_lt_block: + # Store in over field of ChaCha. + ldr lr, [sp, #32] + add r12, lr, #0x44 + stm r12!, {r0, r1, r2, r3, r4, r5, r6, r7} + ldm sp, {r0, r1, r2, r3, r4, r5, r6, r7} + stm r12, {r0, r1, r2, r3, r4, r5, r6, r7} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #40] + ldr r3, [sp, #44] +#else + ldrd r2, r3, [sp, #40] +#endif + ldr r1, [sp, #36] + rsb r12, r3, #0x40 + str r12, [lr, #64] + add lr, lr, #0x44 +L_chacha_arm32_crypt_16byte_loop: + cmp r3, #16 + blt L_chacha_arm32_crypt_word_loop + # 16 bytes of state XORed into message. + ldm lr!, {r4, r5, r6, r7} + ldr r8, [r2] + ldr r9, [r2, #4] + ldr r10, [r2, #8] + ldr r11, [r2, #12] + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + subs r3, r3, #16 + str r8, [r1] + str r9, [r1, #4] + str r10, [r1, #8] + str r11, [r1, #12] + beq L_chacha_arm32_crypt_done + add r2, r2, #16 + add r1, r1, #16 + b L_chacha_arm32_crypt_16byte_loop +L_chacha_arm32_crypt_word_loop: + cmp r3, #4 + blt L_chacha_arm32_crypt_byte_start + # 4 bytes of state XORed into message. + ldr r4, [lr] + ldr r8, [r2] + eor r8, r8, r4 + subs r3, r3, #4 + str r8, [r1] + beq L_chacha_arm32_crypt_done + add lr, lr, #4 + add r2, r2, #4 + add r1, r1, #4 + b L_chacha_arm32_crypt_word_loop +L_chacha_arm32_crypt_byte_start: + ldr r4, [lr] +L_chacha_arm32_crypt_byte_loop: + ldrb r8, [r2] + eor r8, r8, r4 + subs r3, r3, #1 + strb r8, [r1] + beq L_chacha_arm32_crypt_done + lsr r4, r4, #8 + add r2, r2, #1 + add r1, r1, #1 + b L_chacha_arm32_crypt_byte_loop +L_chacha_arm32_crypt_done: + add sp, sp, #52 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size wc_chacha_crypt_bytes,.-wc_chacha_crypt_bytes + .text + .align 4 + .globl wc_chacha_use_over + .type wc_chacha_use_over, %function +wc_chacha_use_over: + push {r4, r5, r6, r7, r8, r9, lr} +L_chacha_arm32_over_16byte_loop: + cmp r3, #16 + blt L_chacha_arm32_over_word_loop + # 16 bytes of state XORed into message. 
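+        # (r0 = leftover keystream from the previous block, r2 = input,
+        # r1 = output, r3 = remaining byte count.)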
+ ldr r12, [r0] + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r6, [r2] + ldr r7, [r2, #4] + ldr r8, [r2, #8] + ldr r9, [r2, #12] + eor r12, r12, r6 + eor lr, lr, r7 + eor r4, r4, r8 + eor r5, r5, r9 + subs r3, r3, #16 + str r12, [r1] + str lr, [r1, #4] + str r4, [r1, #8] + str r5, [r1, #12] + beq L_chacha_arm32_over_done + add r0, r0, #16 + add r2, r2, #16 + add r1, r1, #16 + b L_chacha_arm32_over_16byte_loop +L_chacha_arm32_over_word_loop: + cmp r3, #4 + blt L_chacha_arm32_over_byte_loop + # 4 bytes of state XORed into message. + ldr r12, [r0] + ldr r6, [r2] + eor r12, r12, r6 + subs r3, r3, #4 + str r12, [r1] + beq L_chacha_arm32_over_done + add r0, r0, #4 + add r2, r2, #4 + add r1, r1, #4 + b L_chacha_arm32_over_word_loop +L_chacha_arm32_over_byte_loop: + # 4 bytes of state XORed into message. + ldrb r12, [r0] + ldrb r6, [r2] + eor r12, r12, r6 + subs r3, r3, #1 + strb r12, [r1] + beq L_chacha_arm32_over_done + add r0, r0, #1 + add r2, r2, #1 + add r1, r1, #1 + b L_chacha_arm32_over_byte_loop +L_chacha_arm32_over_done: + pop {r4, r5, r6, r7, r8, r9, pc} + .size wc_chacha_use_over,.-wc_chacha_use_over +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* HAVE_CHACHA */ +#endif /* !__aarch64__ && __arm__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c new file mode 100644 index 0000000000..8c80fc4ad9 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c @@ -0,0 +1,569 @@ +/* armv8-32-chacha-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./chacha/chacha.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifdef HAVE_CHACHA +#include + +void wc_chacha_setiv(word32* x_p, const byte* iv_p, word32 counter_p) +{ + register word32* x asm ("r0") = (word32*)x_p; + register const byte* iv asm ("r1") = (const byte*)iv_p; + register word32 counter asm ("r2") = (word32)counter_p; + + __asm__ __volatile__ ( + "add r3, %[x], #52\n\t" + "ldr r4, [%[iv]]\n\t" + "ldr r12, [%[iv], #4]\n\t" + "ldr lr, [%[iv], #8]\n\t" + "str %[counter], [%[x], #48]\n\t" +#ifdef BIG_ENDIAN_ORDER + "rev r4, r4\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" +#endif /* BIG_ENDIAN_ORDER */ + "stm r3, {r4, r12, lr}\n\t" + : [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter) + : + : "memory", "r3", "r12", "lr", "r4", "cc" + ); +} + +static const uint32_t L_chacha_arm32_constants[] = { + 0x61707865, 0x3120646e, 0x79622d36, 0x6b206574, + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, +}; + +void wc_chacha_setkey(word32* x_p, const byte* key_p, word32 keySz_p) +{ + register word32* x asm ("r0") = (word32*)x_p; + register const byte* key asm ("r1") = (const byte*)key_p; + register word32 keySz asm ("r2") = (word32)keySz_p; + register uint32_t* L_chacha_arm32_constants_c asm ("r3") = (uint32_t*)&L_chacha_arm32_constants; + + __asm__ __volatile__ ( + "subs %[keySz], %[keySz], #16\n\t" + "add r3, r3, %[keySz]\n\t" + /* Start state with constants */ + "ldm r3, {r4, r5, r12, lr}\n\t" + "stm %[x]!, {r4, r5, r12, lr}\n\t" + /* Next is first 16 bytes of key. */ + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" + "ldr r12, [%[key], #8]\n\t" + "ldr lr, [%[key], #12]\n\t" +#ifdef BIG_ENDIAN_ORDER + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" +#endif /* BIG_ENDIAN_ORDER */ + "stm %[x]!, {r4, r5, r12, lr}\n\t" + /* Next 16 bytes of key. */ + "beq L_chacha_arm32_setkey_same_keyb_ytes_%=\n\t" + /* Update key pointer for next 16 bytes. 
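           (keySz was reduced by 16 above, so it is 16 for a 256-bit key;
           a 128-bit key took the branch and reuses the same 16 key bytes.)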
*/ + "add %[key], %[key], %[keySz]\n\t" + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" + "ldr r12, [%[key], #8]\n\t" + "ldr lr, [%[key], #12]\n\t" + "\n" + "L_chacha_arm32_setkey_same_keyb_ytes_%=: \n\t" + "stm %[x], {r4, r5, r12, lr}\n\t" + : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz), [L_chacha_arm32_constants] "+r" (L_chacha_arm32_constants_c) + : + : "memory", "r12", "lr", "r4", "r5", "cc" + ); +} + +#ifdef WOLFSSL_ARMASM_NO_NEON +void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, word32 len_p) +{ + register ChaCha* ctx asm ("r0") = (ChaCha*)ctx_p; + register byte* c asm ("r1") = (byte*)c_p; + register const byte* m asm ("r2") = (const byte*)m_p; + register word32 len asm ("r3") = (word32)len_p; + + __asm__ __volatile__ ( + "sub sp, sp, #52\n\t" + "mov lr, %[ctx]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str %[ctx], [sp, #32]\n\t" + "str %[c], [sp, #36]\n\t" +#else + "strd %[ctx], %[c], [sp, #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str %[m], [sp, #40]\n\t" + "str %[len], [sp, #44]\n\t" +#else + "strd %[m], %[len], [sp, #40]\n\t" +#endif + "\n" + "L_chacha_arm32_crypt_block_%=: \n\t" + /* Put x[12]..x[15] onto stack. */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [lr, #48]\n\t" + "ldr r5, [lr, #52]\n\t" +#else + "ldrd r4, r5, [lr, #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [lr, #56]\n\t" + "ldr r7, [lr, #60]\n\t" +#else + "ldrd r6, r7, [lr, #56]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #16]\n\t" + "str r5, [sp, #20]\n\t" +#else + "strd r4, r5, [sp, #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #24]\n\t" + "str r7, [sp, #28]\n\t" +#else + "strd r6, r7, [sp, #24]\n\t" +#endif + /* Load x[0]..x[12] into registers. */ + "ldm lr, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + /* 10x 2 full rounds to perform. 
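           (each iteration does one column round and one diagonal round,
           so ten iterations give the full 20 ChaCha rounds.)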
*/ + "mov lr, #10\n\t" + "str lr, [sp, #48]\n\t" + "\n" + "L_chacha_arm32_crypt_loop_%=: \n\t" + /* 0, 4, 8, 12 */ + /* 1, 5, 9, 13 */ + "ldr lr, [sp, #20]\n\t" + "add %[ctx], %[ctx], r4\n\t" + "add %[c], %[c], r5\n\t" + "eor r12, r12, %[ctx]\n\t" + "eor lr, lr, %[c]\n\t" + "ror r12, r12, #16\n\t" + "ror lr, lr, #16\n\t" + "add r8, r8, r12\n\t" + "add r9, r9, lr\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "ror r4, r4, #20\n\t" + "ror r5, r5, #20\n\t" + "add %[ctx], %[ctx], r4\n\t" + "add %[c], %[c], r5\n\t" + "eor r12, r12, %[ctx]\n\t" + "eor lr, lr, %[c]\n\t" + "ror r12, r12, #24\n\t" + "ror lr, lr, #24\n\t" + "add r8, r8, r12\n\t" + "add r9, r9, lr\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "ror r4, r4, #25\n\t" + "ror r5, r5, #25\n\t" + "str r12, [sp, #16]\n\t" + "str lr, [sp, #20]\n\t" + /* 2, 6, 10, 14 */ + /* 3, 7, 11, 15 */ + "ldr r12, [sp, #24]\n\t" + "ldr lr, [sp, #28]\n\t" + "add %[m], %[m], r6\n\t" + "add %[len], %[len], r7\n\t" + "eor r12, r12, %[m]\n\t" + "eor lr, lr, %[len]\n\t" + "ror r12, r12, #16\n\t" + "ror lr, lr, #16\n\t" + "add r10, r10, r12\n\t" + "add r11, r11, lr\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "ror r6, r6, #20\n\t" + "ror r7, r7, #20\n\t" + "add %[m], %[m], r6\n\t" + "add %[len], %[len], r7\n\t" + "eor r12, r12, %[m]\n\t" + "eor lr, lr, %[len]\n\t" + "ror r12, r12, #24\n\t" + "ror lr, lr, #24\n\t" + "add r10, r10, r12\n\t" + "add r11, r11, lr\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "ror r6, r6, #25\n\t" + "ror r7, r7, #25\n\t" + /* 3, 4, 9, 14 */ + /* 0, 5, 10, 15 */ + "add %[len], %[len], r4\n\t" + "add %[ctx], %[ctx], r5\n\t" + "eor r12, r12, %[len]\n\t" + "eor lr, lr, %[ctx]\n\t" + "ror r12, r12, #16\n\t" + "ror lr, lr, #16\n\t" + "add r9, r9, r12\n\t" + "add r10, r10, lr\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ror r4, r4, #20\n\t" + "ror r5, r5, #20\n\t" + "add %[len], %[len], r4\n\t" + "add %[ctx], %[ctx], r5\n\t" + "eor r12, r12, %[len]\n\t" + "eor lr, lr, %[ctx]\n\t" + "ror r12, r12, #24\n\t" + "ror lr, lr, #24\n\t" + "add r9, r9, r12\n\t" + "add r10, r10, lr\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ror r4, r4, #25\n\t" + "ror r5, r5, #25\n\t" + "str r12, [sp, #24]\n\t" + "str lr, [sp, #28]\n\t" + "ldr r12, [sp, #16]\n\t" + "ldr lr, [sp, #20]\n\t" + /* 1, 6, 11, 12 */ + /* 2, 7, 8, 13 */ + "add %[c], %[c], r6\n\t" + "add %[m], %[m], r7\n\t" + "eor r12, r12, %[c]\n\t" + "eor lr, lr, %[m]\n\t" + "ror r12, r12, #16\n\t" + "ror lr, lr, #16\n\t" + "add r11, r11, r12\n\t" + "add r8, r8, lr\n\t" + "eor r6, r6, r11\n\t" + "eor r7, r7, r8\n\t" + "ror r6, r6, #20\n\t" + "ror r7, r7, #20\n\t" + "add %[c], %[c], r6\n\t" + "add %[m], %[m], r7\n\t" + "eor r12, r12, %[c]\n\t" + "eor lr, lr, %[m]\n\t" + "ror r12, r12, #24\n\t" + "ror lr, lr, #24\n\t" + "add r11, r11, r12\n\t" + "add r8, r8, lr\n\t" + "eor r6, r6, r11\n\t" + "eor r7, r7, r8\n\t" + "ror r6, r6, #25\n\t" + "ror r7, r7, #25\n\t" + "str lr, [sp, #20]\n\t" + /* Check if we have done enough rounds. 
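           (the round counter at [sp, #48] counts down from 10.)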
*/ + "ldr lr, [sp, #48]\n\t" + "subs lr, lr, #1\n\t" + "str lr, [sp, #48]\n\t" + "bgt L_chacha_arm32_crypt_loop_%=\n\t" + "stm sp, {r8, r9, r10, r11, r12}\n\t" + "ldr lr, [sp, #32]\n\t" + "mov r12, sp\n\t" + /* Add in original state */ + "ldm lr!, {r8, r9, r10, r11}\n\t" + "add %[ctx], %[ctx], r8\n\t" + "add %[c], %[c], r9\n\t" + "add %[m], %[m], r10\n\t" + "add %[len], %[len], r11\n\t" + "ldm lr!, {r8, r9, r10, r11}\n\t" + "add r4, r4, r8\n\t" + "add r5, r5, r9\n\t" + "add r6, r6, r10\n\t" + "add r7, r7, r11\n\t" + "ldm r12, {r8, r9}\n\t" + "ldm lr!, {r10, r11}\n\t" + "add r8, r8, r10\n\t" + "add r9, r9, r11\n\t" + "stm r12!, {r8, r9}\n\t" + "ldm r12, {r8, r9}\n\t" + "ldm lr!, {r10, r11}\n\t" + "add r8, r8, r10\n\t" + "add r9, r9, r11\n\t" + "stm r12!, {r8, r9}\n\t" + "ldm r12, {r8, r9}\n\t" + "ldm lr!, {r10, r11}\n\t" + "add r8, r8, r10\n\t" + "add r9, r9, r11\n\t" + "add r10, r10, #1\n\t" + "stm r12!, {r8, r9}\n\t" + "str r10, [lr, #-8]\n\t" + "ldm r12, {r8, r9}\n\t" + "ldm lr, {r10, r11}\n\t" + "add r8, r8, r10\n\t" + "add r9, r9, r11\n\t" + "stm r12, {r8, r9}\n\t" + "ldr r12, [sp, #44]\n\t" + "cmp r12, #0x40\n\t" + "blt L_chacha_arm32_crypt_lt_block_%=\n\t" + "ldr r12, [sp, #40]\n\t" + "ldr lr, [sp, #36]\n\t" + /* XOR state into 64 bytes. */ + "ldr r8, [r12]\n\t" + "ldr r9, [r12, #4]\n\t" + "ldr r10, [r12, #8]\n\t" + "ldr r11, [r12, #12]\n\t" + "eor %[ctx], %[ctx], r8\n\t" + "eor %[c], %[c], r9\n\t" + "eor %[m], %[m], r10\n\t" + "eor %[len], %[len], r11\n\t" + "str %[ctx], [lr]\n\t" + "str %[c], [lr, #4]\n\t" + "str %[m], [lr, #8]\n\t" + "str %[len], [lr, #12]\n\t" + "ldr r8, [r12, #16]\n\t" + "ldr r9, [r12, #20]\n\t" + "ldr r10, [r12, #24]\n\t" + "ldr r11, [r12, #28]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [lr, #16]\n\t" + "str r5, [lr, #20]\n\t" + "str r6, [lr, #24]\n\t" + "str r7, [lr, #28]\n\t" + "ldr r4, [sp]\n\t" + "ldr r5, [sp, #4]\n\t" + "ldr r6, [sp, #8]\n\t" + "ldr r7, [sp, #12]\n\t" + "ldr r8, [r12, #32]\n\t" + "ldr r9, [r12, #36]\n\t" + "ldr r10, [r12, #40]\n\t" + "ldr r11, [r12, #44]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [lr, #32]\n\t" + "str r5, [lr, #36]\n\t" + "str r6, [lr, #40]\n\t" + "str r7, [lr, #44]\n\t" + "ldr r4, [sp, #16]\n\t" + "ldr r5, [sp, #20]\n\t" + "ldr r6, [sp, #24]\n\t" + "ldr r7, [sp, #28]\n\t" + "ldr r8, [r12, #48]\n\t" + "ldr r9, [r12, #52]\n\t" + "ldr r10, [r12, #56]\n\t" + "ldr r11, [r12, #60]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [lr, #48]\n\t" + "str r5, [lr, #52]\n\t" + "str r6, [lr, #56]\n\t" + "str r7, [lr, #60]\n\t" + "ldr %[len], [sp, #44]\n\t" + "add r12, r12, #0x40\n\t" + "add lr, lr, #0x40\n\t" + "str r12, [sp, #40]\n\t" + "str lr, [sp, #36]\n\t" + "subs %[len], %[len], #0x40\n\t" + "ldr lr, [sp, #32]\n\t" + "str %[len], [sp, #44]\n\t" + "bne L_chacha_arm32_crypt_block_%=\n\t" + "b L_chacha_arm32_crypt_done_%=\n\t" + "\n" + "L_chacha_arm32_crypt_lt_block_%=: \n\t" + /* Store in over field of ChaCha. 
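           (fewer than 64 bytes remain, so the whole keystream block is
           saved and wc_chacha_use_over() consumes the leftover bytes.)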
*/ + "ldr lr, [sp, #32]\n\t" + "add r12, lr, #0x44\n\t" + "stm r12!, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" + "ldm sp, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" + "stm r12, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr %[m], [sp, #40]\n\t" + "ldr %[len], [sp, #44]\n\t" +#else + "ldrd %[m], %[len], [sp, #40]\n\t" +#endif + "ldr %[c], [sp, #36]\n\t" + "rsb r12, %[len], #0x40\n\t" + "str r12, [lr, #64]\n\t" + "add lr, lr, #0x44\n\t" + "\n" + "L_chacha_arm32_crypt_16byte_loop_%=: \n\t" + "cmp %[len], #16\n\t" + "blt L_chacha_arm32_crypt_word_loop_%=\n\t" + /* 16 bytes of state XORed into message. */ + "ldm lr!, {r4, r5, r6, r7}\n\t" + "ldr r8, [%[m]]\n\t" + "ldr r9, [%[m], #4]\n\t" + "ldr r10, [%[m], #8]\n\t" + "ldr r11, [%[m], #12]\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "subs %[len], %[len], #16\n\t" + "str r8, [%[c]]\n\t" + "str r9, [%[c], #4]\n\t" + "str r10, [%[c], #8]\n\t" + "str r11, [%[c], #12]\n\t" + "beq L_chacha_arm32_crypt_done_%=\n\t" + "add %[m], %[m], #16\n\t" + "add %[c], %[c], #16\n\t" + "b L_chacha_arm32_crypt_16byte_loop_%=\n\t" + "\n" + "L_chacha_arm32_crypt_word_loop_%=: \n\t" + "cmp %[len], #4\n\t" + "blt L_chacha_arm32_crypt_byte_start_%=\n\t" + /* 4 bytes of state XORed into message. */ + "ldr r4, [lr]\n\t" + "ldr r8, [%[m]]\n\t" + "eor r8, r8, r4\n\t" + "subs %[len], %[len], #4\n\t" + "str r8, [%[c]]\n\t" + "beq L_chacha_arm32_crypt_done_%=\n\t" + "add lr, lr, #4\n\t" + "add %[m], %[m], #4\n\t" + "add %[c], %[c], #4\n\t" + "b L_chacha_arm32_crypt_word_loop_%=\n\t" + "\n" + "L_chacha_arm32_crypt_byte_start_%=: \n\t" + "ldr r4, [lr]\n\t" + "\n" + "L_chacha_arm32_crypt_byte_loop_%=: \n\t" + "ldrb r8, [%[m]]\n\t" + "eor r8, r8, r4\n\t" + "subs %[len], %[len], #1\n\t" + "strb r8, [%[c]]\n\t" + "beq L_chacha_arm32_crypt_done_%=\n\t" + "lsr r4, r4, #8\n\t" + "add %[m], %[m], #1\n\t" + "add %[c], %[c], #1\n\t" + "b L_chacha_arm32_crypt_byte_loop_%=\n\t" + "\n" + "L_chacha_arm32_crypt_done_%=: \n\t" + "add sp, sp, #52\n\t" + : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + ); +} + +void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p, word32 len_p) +{ + register byte* over asm ("r0") = (byte*)over_p; + register byte* output asm ("r1") = (byte*)output_p; + register const byte* input asm ("r2") = (const byte*)input_p; + register word32 len asm ("r3") = (word32)len_p; + + __asm__ __volatile__ ( + "\n" + "L_chacha_arm32_over_16byte_loop_%=: \n\t" + "cmp %[len], #16\n\t" + "blt L_chacha_arm32_over_word_loop_%=\n\t" + /* 16 bytes of state XORed into message. 
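           (%[over] points at the leftover keystream, %[input]/%[output]
           at the message buffers, %[len] is the remaining byte count.)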
*/ + "ldr r12, [%[over]]\n\t" + "ldr lr, [%[over], #4]\n\t" + "ldr r4, [%[over], #8]\n\t" + "ldr r5, [%[over], #12]\n\t" + "ldr r6, [%[input]]\n\t" + "ldr r7, [%[input], #4]\n\t" + "ldr r8, [%[input], #8]\n\t" + "ldr r9, [%[input], #12]\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "subs %[len], %[len], #16\n\t" + "str r12, [%[output]]\n\t" + "str lr, [%[output], #4]\n\t" + "str r4, [%[output], #8]\n\t" + "str r5, [%[output], #12]\n\t" + "beq L_chacha_arm32_over_done_%=\n\t" + "add %[over], %[over], #16\n\t" + "add %[input], %[input], #16\n\t" + "add %[output], %[output], #16\n\t" + "b L_chacha_arm32_over_16byte_loop_%=\n\t" + "\n" + "L_chacha_arm32_over_word_loop_%=: \n\t" + "cmp %[len], #4\n\t" + "blt L_chacha_arm32_over_byte_loop_%=\n\t" + /* 4 bytes of state XORed into message. */ + "ldr r12, [%[over]]\n\t" + "ldr r6, [%[input]]\n\t" + "eor r12, r12, r6\n\t" + "subs %[len], %[len], #4\n\t" + "str r12, [%[output]]\n\t" + "beq L_chacha_arm32_over_done_%=\n\t" + "add %[over], %[over], #4\n\t" + "add %[input], %[input], #4\n\t" + "add %[output], %[output], #4\n\t" + "b L_chacha_arm32_over_word_loop_%=\n\t" + "\n" + "L_chacha_arm32_over_byte_loop_%=: \n\t" + /* 4 bytes of state XORed into message. */ + "ldrb r12, [%[over]]\n\t" + "ldrb r6, [%[input]]\n\t" + "eor r12, r12, r6\n\t" + "subs %[len], %[len], #1\n\t" + "strb r12, [%[output]]\n\t" + "beq L_chacha_arm32_over_done_%=\n\t" + "add %[over], %[over], #1\n\t" + "add %[input], %[input], #1\n\t" + "add %[output], %[output], #1\n\t" + "b L_chacha_arm32_over_byte_loop_%=\n\t" + "\n" + "L_chacha_arm32_over_done_%=: \n\t" + : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input), [len] "+r" (len) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* HAVE_CHACHA */ +#endif /* !__aarch64__ && __arm__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */ +#endif /* WOLFSSL_ARMASM */ + +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S new file mode 100644 index 0000000000..ffbd7b2705 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S @@ -0,0 +1,356 @@ +/* armv8-32-poly1305-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./poly1305/poly1305.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE +#ifdef HAVE_POLY1305 + .text + .align 4 + .globl poly1305_blocks_arm32_16 + .type poly1305_blocks_arm32_16, %function +poly1305_blocks_arm32_16: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #28 + cmp r2, #0 + beq L_poly1305_arm32_16_done + add lr, sp, #12 + stm lr, {r0, r1, r2, r3} + # Get h pointer + add lr, r0, #16 + ldm lr, {r4, r5, r6, r7, r8} +L_poly1305_arm32_16_loop: + # Add m to h + ldr r1, [sp, #16] + ldr r2, [r1] + ldr r3, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + ldr r11, [sp, #24] + adds r4, r4, r2 + adcs r5, r5, r3 + adcs r6, r6, r9 + adcs r7, r7, r10 + add r1, r1, #16 + adc r8, r8, r11 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + stm lr, {r4, r5, r6, r7, r8} +#else + # h[0]-h[2] in r4-r6 for multiplication. + str r7, [lr, #12] + str r8, [lr, #16] +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + str r1, [sp, #16] + ldr r1, [sp, #12] + # Multiply h by r +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] + ldr r3, [r1] + eor r0, r0, r0 + # r[0] * h[0] + # h[0] in r4 + umull r4, r5, r3, r4 + # r[0] * h[2] + # h[2] in r6 + umull r6, r7, r3, r6 + # r[0] * h[4] + # h[4] in r8 + mul r8, r3, r8 + # r[0] * h[1] + ldr r2, [lr, #4] + mov r12, r0 + umlal r5, r12, r3, r2 + # r[0] * h[3] + ldr r2, [lr, #12] + adds r6, r6, r12 + adc r7, r7, r0 + umlal r7, r8, r3, r2 + # r[1] * h[0] + ldr r3, [r1, #4] + ldr r2, [lr] + mov r12, r0 + umlal r5, r12, r3, r2 + # r[1] * h[1] + ldr r2, [lr, #4] + adds r6, r6, r12 + adc r12, r0, r0 + umlal r6, r12, r3, r2 + # r[1] * h[2] + ldr r2, [lr, #8] + adds r7, r7, r12 + adc r12, r0, r0 + umlal r7, r12, r3, r2 + # r[1] * h[3] + ldr r2, [lr, #12] + adds r8, r8, r12 + adc r9, r0, r0 + umlal r8, r9, r3, r2 + # r[1] * h[4] + ldr r2, [lr, #16] + mla r9, r3, r2, r9 + # r[2] * h[0] + ldr r3, [r1, #8] + ldr r2, [lr] + mov r12, r0 + umlal r6, r12, r3, r2 + # r[2] * h[1] + ldr r2, [lr, #4] + adds r7, r7, r12 + adc r12, r0, r0 + umlal r7, r12, r3, r2 + # r[2] * h[2] + ldr r2, [lr, #8] + adds r8, r8, r12 + adc r12, r0, r0 + umlal r8, r12, r3, r2 + # r[2] * h[3] + ldr r2, [lr, #12] + adds r9, r9, r12 + adc r10, r0, r0 + umlal r9, r10, r3, r2 + # r[2] * h[4] + ldr r2, [lr, #16] + mla r10, r3, r2, r10 + # r[3] * h[0] + ldr r3, [r1, #12] + ldr r2, [lr] + mov r12, r0 + umlal r7, r12, r3, r2 + # r[3] * h[1] + ldr r2, [lr, #4] + adds r8, r8, r12 + adc r12, r0, r0 + umlal r8, r12, r3, r2 + # r[3] * h[2] + ldr r2, [lr, #8] + adds r9, r9, r12 + adc r10, r10, r0 + umlal r9, r10, r3, r2 + # r[3] * h[3] + ldr r2, [lr, #12] + mov r11, r0 + umlal r10, r11, r3, r2 + # r[3] * h[4] + ldr r2, [lr, #16] + mov r12, r0 + mla r11, r3, r2, r11 +#else + ldm r1, {r0, r1, r2, r3} + # r[0] * h[0] + umull r10, r11, r0, r4 + # r[1] * h[0] + umull r12, r7, r1, r4 + # r[0] * h[1] + umaal r11, r12, r0, r5 + # r[2] * h[0] + umull r8, r9, r2, r4 + # r[1] * h[1] + umaal r12, r8, r1, r5 + # r[0] * h[2] + umaal r12, r7, r0, r6 + # r[3] * 
h[0] + umaal r8, r9, r3, r4 + stm sp, {r10, r11, r12} + # r[2] * h[1] + umaal r7, r8, r2, r5 + # Replace h[0] with h[3] + ldr r4, [lr, #12] + # r[1] * h[2] + umull r10, r11, r1, r6 + # r[2] * h[2] + umaal r8, r9, r2, r6 + # r[0] * h[3] + umaal r7, r10, r0, r4 + # r[3] * h[1] + umaal r8, r11, r3, r5 + # r[1] * h[3] + umaal r8, r10, r1, r4 + # r[3] * h[2] + umaal r9, r11, r3, r6 + # r[2] * h[3] + umaal r9, r10, r2, r4 + # Replace h[1] with h[4] + ldr r5, [lr, #16] + # r[3] * h[3] + umaal r10, r11, r3, r4 + mov r12, #0 + # r[0] * h[4] + umaal r8, r12, r0, r5 + # r[1] * h[4] + umaal r9, r12, r1, r5 + # r[2] * h[4] + umaal r10, r12, r2, r5 + # r[3] * h[4] + umaal r11, r12, r3, r5 + # DONE + ldm sp, {r4, r5, r6} +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + # r12 will be zero because r is masked. + # Load length + ldr r2, [sp, #20] + # Reduce mod 2^130 - 5 + bic r3, r8, #3 + and r8, r8, #3 + adds r4, r4, r3 + lsr r3, r3, #2 + adcs r5, r5, r9 + orr r3, r3, r9, LSL #30 + adcs r6, r6, r10 + lsr r9, r9, #2 + adcs r7, r7, r11 + orr r9, r9, r10, LSL #30 + adc r8, r8, r12 + lsr r10, r10, #2 + adds r4, r4, r3 + orr r10, r10, r11, LSL #30 + adcs r5, r5, r9 + lsr r11, r11, #2 + adcs r6, r6, r10 + adcs r7, r7, r11 + adc r8, r8, r12 + # Sub 16 from length. + subs r2, r2, #16 + # Store length. + str r2, [sp, #20] + # Loop again if more message to do. + bgt L_poly1305_arm32_16_loop + stm lr, {r4, r5, r6, r7, r8} +L_poly1305_arm32_16_done: + add sp, sp, #28 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size poly1305_blocks_arm32_16,.-poly1305_blocks_arm32_16 + .text + .type L_poly1305_arm32_clamp, %object + .size L_poly1305_arm32_clamp, 16 + .align 4 +L_poly1305_arm32_clamp: + .word 0xfffffff + .word 0xffffffc + .word 0xffffffc + .word 0xffffffc + .text + .align 4 + .globl poly1305_set_key + .type poly1305_set_key, %function +poly1305_set_key: + push {r4, r5, r6, r7, r8, lr} + # Load mask. + adr lr, L_poly1305_arm32_clamp + ldm lr, {r6, r7, r8, r12} + # Load and cache padding. + ldr r2, [r1, #16] + ldr r3, [r1, #20] + ldr r4, [r1, #24] + ldr r5, [r1, #28] + add lr, r0, #36 + stm lr, {r2, r3, r4, r5} + # Load, mask and store r. + ldr r2, [r1] + ldr r3, [r1, #4] + ldr r4, [r1, #8] + ldr r5, [r1, #12] + and r2, r2, r6 + and r3, r3, r7 + and r4, r4, r8 + and r5, r5, r12 + add lr, r0, #0 + stm lr, {r2, r3, r4, r5} + # h (accumulator) = 0 + eor r6, r6, r6 + eor r7, r7, r7 + eor r8, r8, r8 + eor r12, r12, r12 + add lr, r0, #16 + eor r5, r5, r5 + stm lr, {r5, r6, r7, r8, r12} + # Zero leftover + str r5, [r0, #52] + pop {r4, r5, r6, r7, r8, pc} + .size poly1305_set_key,.-poly1305_set_key + .text + .align 4 + .globl poly1305_final + .type poly1305_final, %function +poly1305_final: + push {r4, r5, r6, r7, r8, r9, lr} + add r9, r0, #16 + ldm r9, {r4, r5, r6, r7, r8} + # Add 5 and check for h larger than p. + adds r2, r4, #5 + adcs r2, r5, #0 + adcs r2, r6, #0 + adcs r2, r7, #0 + adc r2, r8, #0 + sub r2, r2, #4 + lsr r2, r2, #31 + sub r2, r2, #1 + and r2, r2, #5 + # Add 0/5 to h. + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adc r7, r7, #0 + # Add padding + add r9, r0, #36 + ldm r9, {r2, r3, r12, lr} + adds r4, r4, r2 + adcs r5, r5, r3 + adcs r6, r6, r12 + adc r7, r7, lr + # Store MAC + str r4, [r1] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + # Zero out h. + eor r4, r4, r4 + eor r5, r5, r5 + eor r6, r6, r6 + eor r7, r7, r7 + eor r8, r8, r8 + add r9, r0, #16 + stm r9, {r4, r5, r6, r7, r8} + # Zero out r. + add r9, r0, #0 + stm r9, {r4, r5, r6, r7} + # Zero out padding. 
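+        # (Together with h and r above, this leaves no key material behind
+        # in the Poly1305 context.)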
+ add r9, r0, #36 + stm r9, {r4, r5, r6, r7} + pop {r4, r5, r6, r7, r8, r9, pc} + .size poly1305_final,.-poly1305_final +#endif /* HAVE_POLY1305 */ +#endif /* !__aarch64__ && __arm__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c new file mode 100644 index 0000000000..2871293570 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c @@ -0,0 +1,388 @@ +/* armv8-32-poly1305-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./poly1305/poly1305.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifdef HAVE_POLY1305 +#include + +void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, int notLast_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register const byte* m asm ("r1") = (const byte*)m_p; + register word32 len asm ("r2") = (word32)len_p; + register int notLast asm ("r3") = (int)notLast_p; + + __asm__ __volatile__ ( + "sub sp, sp, #28\n\t" + "cmp %[len], #0\n\t" + "beq L_poly1305_arm32_16_done_%=\n\t" + "add lr, sp, #12\n\t" + "stm lr, {%[ctx], %[m], %[len], %[notLast]}\n\t" + /* Get h pointer */ + "add lr, %[ctx], #16\n\t" + "ldm lr, {r4, r5, r6, r7, r8}\n\t" + "\n" + "L_poly1305_arm32_16_loop_%=: \n\t" + /* Add m to h */ + "ldr %[m], [sp, #16]\n\t" + "ldr %[len], [%[m]]\n\t" + "ldr %[notLast], [%[m], #4]\n\t" + "ldr r9, [%[m], #8]\n\t" + "ldr r10, [%[m], #12]\n\t" + "ldr r11, [sp, #24]\n\t" + "adds r4, r4, %[len]\n\t" + "adcs r5, r5, %[notLast]\n\t" + "adcs r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "add %[m], %[m], #16\n\t" + "adc r8, r8, r11\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "stm lr, {r4, r5, r6, r7, r8}\n\t" +#else + /* h[0]-h[2] in r4-r6 for multiplication. 
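           (h[3] and h[4] are stored back; h[0]-h[2] stay in registers for
           the umaal-based multiply below.)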
*/ + "str r7, [lr, #12]\n\t" + "str r8, [lr, #16]\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "str %[m], [sp, #16]\n\t" + "ldr %[m], [sp, #12]\n\t" + /* Multiply h by r */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */ + "ldr %[notLast], [%[m]]\n\t" + "eor %[ctx], %[ctx], %[ctx]\n\t" + /* r[0] * h[0] */ + /* h[0] in r4 */ + "umull r4, r5, %[notLast], r4\n\t" + /* r[0] * h[2] */ + /* h[2] in r6 */ + "umull r6, r7, %[notLast], r6\n\t" + /* r[0] * h[4] */ + /* h[4] in r8 */ + "mul r8, %[notLast], r8\n\t" + /* r[0] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r5, r12, %[notLast], %[len]\n\t" + /* r[0] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, %[ctx]\n\t" + "umlal r7, r8, %[notLast], %[len]\n\t" + /* r[1] * h[0] */ + "ldr %[notLast], [%[m], #4]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r5, r12, %[notLast], %[len]\n\t" + /* r[1] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r6, r6, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r6, r12, %[notLast], %[len]\n\t" + /* r[1] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r7, r7, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[1] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r8, r8, r12\n\t" + "adc r9, %[ctx], %[ctx]\n\t" + "umlal r8, r9, %[notLast], %[len]\n\t" + /* r[1] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mla r9, %[notLast], %[len], r9\n\t" + /* r[2] * h[0] */ + "ldr %[notLast], [%[m], #8]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r6, r12, %[notLast], %[len]\n\t" + /* r[2] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r7, r7, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[2] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r8, r8, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r8, r12, %[notLast], %[len]\n\t" + /* r[2] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r9, r9, r12\n\t" + "adc r10, %[ctx], %[ctx]\n\t" + "umlal r9, r10, %[notLast], %[len]\n\t" + /* r[2] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mla r10, %[notLast], %[len], r10\n\t" + /* r[3] * h[0] */ + "ldr %[notLast], [%[m], #12]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[3] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r8, r8, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r8, r12, %[notLast], %[len]\n\t" + /* r[3] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r9, r9, r12\n\t" + "adc r10, r10, %[ctx]\n\t" + "umlal r9, r10, %[notLast], %[len]\n\t" + /* r[3] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "mov r11, %[ctx]\n\t" + "umlal r10, r11, %[notLast], %[len]\n\t" + /* r[3] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mov r12, %[ctx]\n\t" + "mla r11, %[notLast], %[len], r11\n\t" +#else + "ldm %[m], {%[ctx], %[m], %[len], %[notLast]}\n\t" + /* r[0] * h[0] */ + "umull r10, r11, %[ctx], r4\n\t" + /* r[1] * h[0] */ + "umull r12, r7, %[m], r4\n\t" + /* r[0] * h[1] */ + "umaal r11, r12, %[ctx], r5\n\t" + /* r[2] * h[0] */ + "umull r8, r9, %[len], r4\n\t" + /* r[1] * h[1] */ + "umaal r12, r8, %[m], r5\n\t" + /* r[0] * h[2] */ + "umaal r12, r7, %[ctx], r6\n\t" + /* r[3] * h[0] */ + "umaal r8, r9, %[notLast], r4\n\t" + "stm sp, {r10, r11, r12}\n\t" + /* r[2] * h[1] */ + "umaal r7, r8, %[len], r5\n\t" + /* Replace h[0] with h[3] */ + "ldr r4, [lr, #12]\n\t" + /* r[1] * h[2] */ + "umull r10, r11, %[m], r6\n\t" + /* r[2] * h[2] */ + "umaal r8, r9, 
%[len], r6\n\t" + /* r[0] * h[3] */ + "umaal r7, r10, %[ctx], r4\n\t" + /* r[3] * h[1] */ + "umaal r8, r11, %[notLast], r5\n\t" + /* r[1] * h[3] */ + "umaal r8, r10, %[m], r4\n\t" + /* r[3] * h[2] */ + "umaal r9, r11, %[notLast], r6\n\t" + /* r[2] * h[3] */ + "umaal r9, r10, %[len], r4\n\t" + /* Replace h[1] with h[4] */ + "ldr r5, [lr, #16]\n\t" + /* r[3] * h[3] */ + "umaal r10, r11, %[notLast], r4\n\t" + "mov r12, #0\n\t" + /* r[0] * h[4] */ + "umaal r8, r12, %[ctx], r5\n\t" + /* r[1] * h[4] */ + "umaal r9, r12, %[m], r5\n\t" + /* r[2] * h[4] */ + "umaal r10, r12, %[len], r5\n\t" + /* r[3] * h[4] */ + "umaal r11, r12, %[notLast], r5\n\t" + /* DONE */ + "ldm sp, {r4, r5, r6}\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + /* r12 will be zero because r is masked. */ + /* Load length */ + "ldr %[len], [sp, #20]\n\t" + /* Reduce mod 2^130 - 5 */ + "bic %[notLast], r8, #3\n\t" + "and r8, r8, #3\n\t" + "adds r4, r4, %[notLast]\n\t" + "lsr %[notLast], %[notLast], #2\n\t" + "adcs r5, r5, r9\n\t" + "orr %[notLast], %[notLast], r9, LSL #30\n\t" + "adcs r6, r6, r10\n\t" + "lsr r9, r9, #2\n\t" + "adcs r7, r7, r11\n\t" + "orr r9, r9, r10, LSL #30\n\t" + "adc r8, r8, r12\n\t" + "lsr r10, r10, #2\n\t" + "adds r4, r4, %[notLast]\n\t" + "orr r10, r10, r11, LSL #30\n\t" + "adcs r5, r5, r9\n\t" + "lsr r11, r11, #2\n\t" + "adcs r6, r6, r10\n\t" + "adcs r7, r7, r11\n\t" + "adc r8, r8, r12\n\t" + /* Sub 16 from length. */ + "subs %[len], %[len], #16\n\t" + /* Store length. */ + "str %[len], [sp, #20]\n\t" + /* Loop again if more message to do. */ + "bgt L_poly1305_arm32_16_loop_%=\n\t" + "stm lr, {r4, r5, r6, r7, r8}\n\t" + "\n" + "L_poly1305_arm32_16_done_%=: \n\t" + "add sp, sp, #28\n\t" + : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len), [notLast] "+r" (notLast) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + ); +} + +static const uint32_t L_poly1305_arm32_clamp[] = { + 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, +}; + +void poly1305_set_key(Poly1305* ctx_p, const byte* key_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register const byte* key asm ("r1") = (const byte*)key_p; + register uint32_t* L_poly1305_arm32_clamp_c asm ("r2") = (uint32_t*)&L_poly1305_arm32_clamp; + + __asm__ __volatile__ ( + /* Load mask. */ + "mov lr, %[L_poly1305_arm32_clamp]\n\t" + "ldm lr, {r6, r7, r8, r12}\n\t" + /* Load and cache padding. */ + "ldr r2, [%[key], #16]\n\t" + "ldr r3, [%[key], #20]\n\t" + "ldr r4, [%[key], #24]\n\t" + "ldr r5, [%[key], #28]\n\t" + "add lr, %[ctx], #36\n\t" + "stm lr, {r2, r3, r4, r5}\n\t" + /* Load, mask and store r. 
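           (the and instructions below apply the standard Poly1305 clamp
           0x0ffffffc0ffffffc0ffffffc0fffffff to the first 16 key bytes.)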
*/ + "ldr r2, [%[key]]\n\t" + "ldr r3, [%[key], #4]\n\t" + "ldr r4, [%[key], #8]\n\t" + "ldr r5, [%[key], #12]\n\t" + "and r2, r2, r6\n\t" + "and r3, r3, r7\n\t" + "and r4, r4, r8\n\t" + "and r5, r5, r12\n\t" + "add lr, %[ctx], #0\n\t" + "stm lr, {r2, r3, r4, r5}\n\t" + /* h (accumulator) = 0 */ + "eor r6, r6, r6\n\t" + "eor r7, r7, r7\n\t" + "eor r8, r8, r8\n\t" + "eor r12, r12, r12\n\t" + "add lr, %[ctx], #16\n\t" + "eor r5, r5, r5\n\t" + "stm lr, {r5, r6, r7, r8, r12}\n\t" + /* Zero leftover */ + "str r5, [%[ctx], #52]\n\t" + : [ctx] "+r" (ctx), [key] "+r" (key), [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc" + ); +} + +void poly1305_final(Poly1305* ctx_p, byte* mac_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register byte* mac asm ("r1") = (byte*)mac_p; + + __asm__ __volatile__ ( + "add r9, %[ctx], #16\n\t" + "ldm r9, {r4, r5, r6, r7, r8}\n\t" + /* Add 5 and check for h larger than p. */ + "adds r2, r4, #5\n\t" + "adcs r2, r5, #0\n\t" + "adcs r2, r6, #0\n\t" + "adcs r2, r7, #0\n\t" + "adc r2, r8, #0\n\t" + "sub r2, r2, #4\n\t" + "lsr r2, r2, #31\n\t" + "sub r2, r2, #1\n\t" + "and r2, r2, #5\n\t" + /* Add 0/5 to h. */ + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + /* Add padding */ + "add r9, %[ctx], #36\n\t" + "ldm r9, {r2, r3, r12, lr}\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + /* Store MAC */ + "str r4, [%[mac]]\n\t" + "str r5, [%[mac], #4]\n\t" + "str r6, [%[mac], #8]\n\t" + "str r7, [%[mac], #12]\n\t" + /* Zero out h. */ + "eor r4, r4, r4\n\t" + "eor r5, r5, r5\n\t" + "eor r6, r6, r6\n\t" + "eor r7, r7, r7\n\t" + "eor r8, r8, r8\n\t" + "add r9, %[ctx], #16\n\t" + "stm r9, {r4, r5, r6, r7, r8}\n\t" + /* Zero out r. */ + "add r9, %[ctx], #0\n\t" + "stm r9, {r4, r5, r6, r7}\n\t" + /* Zero out padding. 
*/ + "add r9, %[ctx], #36\n\t" + "stm r9, {r4, r5, r6, r7}\n\t" + : [ctx] "+r" (ctx), [mac] "+r" (mac) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + ); +} + +#endif /* HAVE_POLY1305 */ +#endif /* !__aarch64__ && __arm__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */ +#endif /* WOLFSSL_ARMASM */ + +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S index 76629726f7..6077a88b3e 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S @@ -32,6 +32,8 @@ #ifdef WOLFSSL_ARMASM #if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) #ifndef WOLFSSL_ARMASM_INLINE +#ifdef WOLFSSL_SHA3 +#ifndef WOLFSSL_ARMASM_NO_NEON .text .type L_sha3_arm2_neon_rt, %object .size L_sha3_arm2_neon_rt, 192 @@ -85,60 +87,6 @@ L_sha3_arm2_neon_rt: .word 0x0 .word 0x80008008 .word 0x80000000 - .text - .type L_sha3_arm2_rt, %object - .size L_sha3_arm2_rt, 192 - .align 4 -L_sha3_arm2_rt: - .word 0x1 - .word 0x0 - .word 0x8082 - .word 0x0 - .word 0x808a - .word 0x80000000 - .word 0x80008000 - .word 0x80000000 - .word 0x808b - .word 0x0 - .word 0x80000001 - .word 0x0 - .word 0x80008081 - .word 0x80000000 - .word 0x8009 - .word 0x80000000 - .word 0x8a - .word 0x0 - .word 0x88 - .word 0x0 - .word 0x80008009 - .word 0x0 - .word 0x8000000a - .word 0x0 - .word 0x8000808b - .word 0x0 - .word 0x8b - .word 0x80000000 - .word 0x8089 - .word 0x80000000 - .word 0x8003 - .word 0x80000000 - .word 0x8002 - .word 0x80000000 - .word 0x80 - .word 0x80000000 - .word 0x800a - .word 0x0 - .word 0x8000000a - .word 0x80000000 - .word 0x80008081 - .word 0x80000000 - .word 0x8080 - .word 0x80000000 - .word 0x80000001 - .word 0x0 - .word 0x80008008 - .word 0x80000000 -#ifndef WOLFSSL_ARMASM_NO_NEON .text .align 4 .globl BlockSha3 @@ -407,6 +355,59 @@ L_sha3_arm32_neon_begin: .size BlockSha3,.-BlockSha3 #endif /* WOLFSSL_ARMASM_NO_NEON */ #ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_sha3_arm2_rt, %object + .size L_sha3_arm2_rt, 192 + .align 4 +L_sha3_arm2_rt: + .word 0x1 + .word 0x0 + .word 0x8082 + .word 0x0 + .word 0x808a + .word 0x80000000 + .word 0x80008000 + .word 0x80000000 + .word 0x808b + .word 0x0 + .word 0x80000001 + .word 0x0 + .word 0x80008081 + .word 0x80000000 + .word 0x8009 + .word 0x80000000 + .word 0x8a + .word 0x0 + .word 0x88 + .word 0x0 + .word 0x80008009 + .word 0x0 + .word 0x8000000a + .word 0x0 + .word 0x8000808b + .word 0x0 + .word 0x8b + .word 0x80000000 + .word 0x8089 + .word 0x80000000 + .word 0x8003 + .word 0x80000000 + .word 0x8002 + .word 0x80000000 + .word 0x80 + .word 0x80000000 + .word 0x800a + .word 0x0 + .word 0x8000000a + .word 0x80000000 + .word 0x80008081 + .word 0x80000000 + .word 0x8080 + .word 0x80000000 + .word 0x80000001 + .word 0x0 + .word 0x80008008 + .word 0x80000000 .text .align 4 .globl BlockSha3 @@ -2391,6 +2392,7 @@ L_sha3_arm32_begin: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size BlockSha3,.-BlockSha3 #endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA3 */ #endif /* !__aarch64__ && __arm__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c index 6d2efa1b0b..1a54d8af3a 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c @@ -51,6 +51,8 @@ #define __asm__ __asm #define 
__volatile__ volatile #endif /* __KEIL__ */ +#ifdef WOLFSSL_SHA3 +#ifndef WOLFSSL_ARMASM_NO_NEON static const uint64_t L_sha3_arm2_neon_rt[] = { 0x0000000000000001UL, 0x0000000000008082UL, 0x800000000000808aUL, 0x8000000080008000UL, @@ -66,29 +68,12 @@ static const uint64_t L_sha3_arm2_neon_rt[] = { 0x0000000080000001UL, 0x8000000080008008UL, }; -static const uint64_t L_sha3_arm2_rt[] = { - 0x0000000000000001UL, 0x0000000000008082UL, - 0x800000000000808aUL, 0x8000000080008000UL, - 0x000000000000808bUL, 0x0000000080000001UL, - 0x8000000080008081UL, 0x8000000000008009UL, - 0x000000000000008aUL, 0x0000000000000088UL, - 0x0000000080008009UL, 0x000000008000000aUL, - 0x000000008000808bUL, 0x800000000000008bUL, - 0x8000000000008089UL, 0x8000000000008003UL, - 0x8000000000008002UL, 0x8000000000000080UL, - 0x000000000000800aUL, 0x800000008000000aUL, - 0x8000000080008081UL, 0x8000000000008080UL, - 0x0000000080000001UL, 0x8000000080008008UL, -}; - #include -#ifndef WOLFSSL_ARMASM_NO_NEON void BlockSha3(word64* state_p) { register word64* state asm ("r0") = (word64*)state_p; register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt; - register uint64_t* L_sha3_arm2_rt_c asm ("r2") = (uint64_t*)&L_sha3_arm2_rt; __asm__ __volatile__ ( "sub sp, sp, #16\n\t" @@ -348,16 +333,31 @@ void BlockSha3(word64* state_p) "vst1.8 {d20-d23}, [%[state]]!\n\t" "vst1.8 {d24}, [%[state]]\n\t" "add sp, sp, #16\n\t" - : [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c) + : [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c) : - : "memory", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc" + : "memory", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc" ); } #endif /* WOLFSSL_ARMASM_NO_NEON */ +#ifdef WOLFSSL_ARMASM_NO_NEON +static const uint64_t L_sha3_arm2_rt[] = { + 0x0000000000000001UL, 0x0000000000008082UL, + 0x800000000000808aUL, 0x8000000080008000UL, + 0x000000000000808bUL, 0x0000000080000001UL, + 0x8000000080008081UL, 0x8000000000008009UL, + 0x000000000000008aUL, 0x0000000000000088UL, + 0x0000000080008009UL, 0x000000008000000aUL, + 0x000000008000808bUL, 0x800000000000008bUL, + 0x8000000000008089UL, 0x8000000000008003UL, + 0x8000000000008002UL, 0x8000000000000080UL, + 0x000000000000800aUL, 0x800000008000000aUL, + 0x8000000080008081UL, 0x8000000000008080UL, + 0x0000000080000001UL, 0x8000000080008008UL, +}; + #include -#ifdef WOLFSSL_ARMASM_NO_NEON void BlockSha3(word64* state_p) { register word64* state asm ("r0") = (word64*)state_p; @@ -2348,6 +2348,7 @@ void BlockSha3(word64* state_p) } #endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA3 */ #endif /* !__aarch64__ && __arm__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ #endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */ diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c index c7de0a265b..b5b516705a 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha.c +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -29,7 +29,7 @@ #include -#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) +#if defined(WOLFSSL_ARMASM) #ifdef HAVE_CHACHA 
#include @@ -73,15 +73,43 @@ * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. */ -int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) +int wc_Chacha_SetIV(ChaCha* ctx, const byte* iv, word32 counter) { +#ifndef __aarch64__ + int ret = 0; +#ifdef CHACHA_AEAD_TEST + word32 i; + + printf("NONCE : "); + if (iv != NULL) { + for (i = 0; i < CHACHA_IV_BYTES; i++) { + printf("%02x", iv[i]); + } + } + printf("\n\n"); +#endif + + /* Validate parameters. */ + if ((ctx == NULL) || (iv == NULL)) { + ret = BAD_FUNC_ARG; + } + if (ret == 0) { + /* No unused bytes to XOR into input. */ + ctx->left = 0; + + /* Set counter and IV into state. */ + wc_chacha_setiv(ctx->X, iv, counter); + } + + return ret; +#else word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ #ifdef CHACHA_AEAD_TEST word32 i; printf("NONCE : "); for (i = 0; i < CHACHA_IV_BYTES; i++) { - printf("%02x", inIv[i]); + printf("%02x", iv[i]); } printf("\n\n"); #endif @@ -89,7 +117,7 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) if (ctx == NULL) return BAD_FUNC_ARG; - XMEMCPY(temp, inIv, CHACHA_IV_BYTES); + XMEMCPY(temp, iv, CHACHA_IV_BYTES); ctx->left = 0; ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ @@ -98,18 +126,54 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */ return 0; +#endif } +#ifdef __aarch64__ /* "expand 32-byte k" as unsigned 32 byte */ static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; /* "expand 16-byte k" as unsigned 16 byte */ static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; +#endif /** * Key setup. 8 word iv (nonce) */ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) { +#ifndef __aarch64__ + int ret = 0; + +#ifdef CHACHA_AEAD_TEST + printf("ChaCha key used :\n"); + if (key != NULL) { + word32 i; + for (i = 0; i < keySz; i++) { + printf("%02x", key[i]); + if ((i % 8) == 7) + printf("\n"); + } + } + printf("\n\n"); +#endif + + /* Validate parameters. */ + if ((ctx == NULL) || (key == NULL)) { + ret = BAD_FUNC_ARG; + } + else if ((keySz != (CHACHA_MAX_KEY_SZ / 2)) && + (keySz != CHACHA_MAX_KEY_SZ )) { + ret = BAD_FUNC_ARG; + } + + if (ret == 0) { + ctx->left = 0; + + wc_chacha_setkey(ctx->X, key, keySz); + } + + return ret; +#else const word32* constants; const byte* k; @@ -169,8 +233,10 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) ctx->left = 0; return 0; +#endif } +#ifndef WOLFSSL_ARMASM_NO_NEON static const word32 L_chacha20_neon_inc_first_word[] = { 0x1, 0x0, @@ -2815,7 +2881,6 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, } - /** * Encrypt a stream of bytes */ @@ -2862,40 +2927,68 @@ static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); } } +#endif /** * API to encrypt/decrypt a message of any size. */ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, - word32 msglen) + word32 len) { +#ifdef WOLFSSL_ARMASM_NO_NEON + int ret = 0; + + if ((ctx == NULL) || (output == NULL) || (input == NULL)) { + ret = BAD_FUNC_ARG; + } + + /* Handle left over bytes from last block. 
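+     * A partial keystream block from the previous call is cached in
+     * ctx->over; XOR as much of it as the new input allows before
+     * generating fresh blocks.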
*/ + if ((ret == 0) && (len > 0) && (ctx->left > 0)) { + byte* over = ((byte*)ctx->over) + CHACHA_CHUNK_BYTES - ctx->left; + word32 l = min(len, ctx->left); + + wc_chacha_use_over(over, output, input, l); + + ctx->left -= l; + input += l; + output += l; + len -= l; + } + + if ((ret == 0) && (len != 0)) { + wc_chacha_crypt_bytes(ctx, output, input, len); + } + + return ret; +#else if (ctx == NULL || output == NULL || input == NULL) return BAD_FUNC_ARG; /* handle left overs */ - if (msglen > 0 && ctx->left > 0) { + if (len > 0 && ctx->left > 0) { byte* out; word32 i; out = (byte*)ctx->over + CHACHA_CHUNK_BYTES - ctx->left; - for (i = 0; i < msglen && i < ctx->left; i++) { + for (i = 0; i < len && i < ctx->left; i++) { output[i] = (byte)(input[i] ^ out[i]); } ctx->left -= i; - msglen -= i; + len -= i; output += i; input += i; } - if (msglen == 0) { + if (len == 0) { return 0; } - wc_Chacha_encrypt_bytes(ctx, input, output, msglen); + wc_Chacha_encrypt_bytes(ctx, input, output, len); return 0; +#endif } #endif /* HAVE_CHACHA */ -#endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-poly1305.c b/wolfcrypt/src/port/arm/armv8-poly1305.c index 4d838c7036..9527bbd9d1 100644 --- a/wolfcrypt/src/port/arm/armv8-poly1305.c +++ b/wolfcrypt/src/port/arm/armv8-poly1305.c @@ -32,7 +32,6 @@ #include #ifdef WOLFSSL_ARMASM -#ifdef __aarch64__ #ifdef HAVE_POLY1305 #include @@ -49,6 +48,8 @@ #include #endif +#ifdef __aarch64__ + static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx, const unsigned char *m, size_t bytes) { @@ -1118,6 +1119,127 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) return 0; } -#endif /* HAVE_POLY1305 */ +#else +#ifdef __thumb__ +/* Process 16 bytes of message at a time. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + * @param [in] bytes Length of message in bytes. + */ +void poly1305_blocks_thumb2(Poly1305* ctx, const unsigned char* m, + size_t bytes) +{ + poly1305_blocks_thumb2_16(ctx, m, bytes, 1); +} + +/* Process 16 bytes of message. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + */ +void poly1305_block_thumb2(Poly1305* ctx, const unsigned char* m) +{ + poly1305_blocks_thumb2_16(ctx, m, POLY1305_BLOCK_SIZE, 1); +} +#else +/* Process 16 bytes of message at a time. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + * @param [in] bytes Length of message in bytes. + */ +void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes) +{ + poly1305_blocks_arm32_16(ctx, m, bytes, 1); +} + +/* Process 16 bytes of message. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + */ +void poly1305_block_arm32(Poly1305* ctx, const unsigned char* m) +{ + poly1305_blocks_arm32_16(ctx, m, POLY1305_BLOCK_SIZE, 1); +} +#endif + +/* Set the key for the Poly1305 operation. + * + * @param [in] ctx Poly1305 context. + * @param [in] key Key data to use. + * @param [in] keySz Size of key in bytes. Must be 32. + * @return 0 on success. + * @return BAD_FUNC_ARG when ctx or key is NULL or keySz is not 32. + */ +int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) +{ + int ret = 0; + +#ifdef CHACHA_AEAD_TEST + word32 k; + printf("Poly key used:\n"); + if (key != NULL) { + for (k = 0; k < keySz; k++) { + printf("%02x", key[k]); + if ((k+1) % 8 == 0) + printf("\n"); + } + } + printf("\n"); +#endif + + /* Validate parameters. 
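+     * The key must be exactly 32 bytes: 16 bytes of r followed by
+     * 16 bytes of s (the pad added at finalization).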
*/ + if ((ctx == NULL) || (key == NULL) || (keySz != 32)) { + ret = BAD_FUNC_ARG; + } + + if (ret == 0) { + poly1305_set_key(ctx, key); + } + + return ret; +} + +/* Finalize the Poly1305 operation calculating the MAC. + * + * @param [in] ctx Poly1305 context. + * @param [in] mac Buffer to hold the MAC. Myst be at least 16 bytes long. + * @return 0 on success. + * @return BAD_FUNC_ARG when ctx or mac is NULL. + */ +int wc_Poly1305Final(Poly1305* ctx, byte* mac) +{ + int ret = 0; + + /* Validate parameters. */ + if ((ctx == NULL) || (mac == NULL)) { + ret = BAD_FUNC_ARG; + } + + /* Process the remaining partial block - last block. */ + if (ret == 0) { + if (ctx->leftover) { + size_t i = ctx->leftover; + ctx->buffer[i++] = 1; + for (; i < POLY1305_BLOCK_SIZE; i++) { + ctx->buffer[i] = 0; + } + #ifdef __thumb__ + poly1305_blocks_thumb2_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, + 0); + #else + poly1305_blocks_arm32_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0); + #endif + } + + poly1305_final(ctx, mac); + } + + return ret; +} + #endif /* __aarch64__ */ +#endif /* HAVE_POLY1305 */ #endif /* WOLFSSL_ARMASM */ diff --git a/wolfssl/wolfcrypt/chacha.h b/wolfssl/wolfcrypt/chacha.h index 42e71aee57..db4e5dd664 100644 --- a/wolfssl/wolfcrypt/chacha.h +++ b/wolfssl/wolfcrypt/chacha.h @@ -107,12 +107,18 @@ WOLFSSL_API int wc_XChacha_SetKey(ChaCha *ctx, const byte *key, word32 keySz, word32 counter); #endif -#if defined(WOLFSSL_ARMASM) && defined(__thumb__) +#if defined(WOLFSSL_ARMASM) + +#ifndef __aarch64__ void wc_chacha_setiv(word32* x, const byte* iv, word32 counter); void wc_chacha_setkey(word32* x, const byte* key, word32 keySz); +#endif + +#if defined(WOLFSSL_ARMASM_NO_NEON) || defined(__thumb__) void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len); void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len); +#endif #endif diff --git a/wolfssl/wolfcrypt/poly1305.h b/wolfssl/wolfcrypt/poly1305.h index bcc48a6298..70ed1efa83 100644 --- a/wolfssl/wolfcrypt/poly1305.h +++ b/wolfssl/wolfcrypt/poly1305.h @@ -98,7 +98,7 @@ typedef struct Poly1305 { word64 leftover; unsigned char buffer[POLY1305_BLOCK_SIZE]; unsigned char finished; -#elif defined(WOLFSSL_ARMASM) && defined(__thumb__) +#elif defined(WOLFSSL_ARMASM) word32 r[4]; word32 h[5]; word32 pad[4]; @@ -147,16 +147,16 @@ WOLFSSL_API int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz, WOLFSSL_API int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional, word32 addSz, const byte* input, word32 sz, byte* tag, word32 tagSz); -#if defined(__aarch64__ ) && defined(WOLFSSL_ARMASM) +#if defined(WOLFSSL_ARMASM) +#if defined(__aarch64__ ) #define poly1305_blocks poly1305_blocks_aarch64 #define poly1305_block poly1305_block_aarch64 void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, size_t bytes); void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m); -#endif - -#if defined(__thumb__ ) && defined(WOLFSSL_ARMASM) +#else +#if defined(__thumb__) #define poly1305_blocks poly1305_blocks_thumb2 #define poly1305_block poly1305_block_thumb2 @@ -166,9 +166,20 @@ void poly1305_block_thumb2(Poly1305* ctx, const unsigned char *m); void poly1305_blocks_thumb2_16(Poly1305* ctx, const unsigned char* m, word32 len, int notLast); +#else +#define poly1305_blocks poly1305_blocks_arm32 +#define poly1305_block poly1305_block_arm32 + +void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char *m, size_t bytes); +void poly1305_block_arm32(Poly1305* ctx, const unsigned char *m); + +void 
poly1305_blocks_arm32_16(Poly1305* ctx, const unsigned char* m, word32 len, + int notLast); +#endif void poly1305_set_key(Poly1305* ctx, const byte* key); void poly1305_final(Poly1305* ctx, byte* mac); #endif +#endif /* WOLFSSL_ARMASM */ #if defined(WOLFSSL_RISCV_ASM) #define poly1305_blocks poly1305_blocks_riscv64 From 6414cf61a7107a55d90e8b758526f57409360952 Mon Sep 17 00:00:00 2001 From: Colton Willey Date: Thu, 26 Sep 2024 13:18:06 -0700 Subject: [PATCH 06/11] Update comments for new flags in settings.h --- wolfssl/wolfcrypt/settings.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/wolfssl/wolfcrypt/settings.h b/wolfssl/wolfcrypt/settings.h index 07c4f746b3..32730d8794 100644 --- a/wolfssl/wolfcrypt/settings.h +++ b/wolfssl/wolfcrypt/settings.h @@ -3576,11 +3576,17 @@ extern void uITRON4_free(void *p) ; #define KEEP_PEER_CERT #endif +/* Always copy certificate(s) from SSL CTX to each SSL object on creation, + * if this is not defined then each SSL object shares a pointer to the + * original certificate buffer owned by the SSL CTX. */ #if defined(OPENSSL_ALL) && !defined(WOLFSSL_NO_COPY_CERT) #undef WOLFSSL_COPY_CERT #define WOLFSSL_COPY_CERT #endif +/* Always copy private key from SSL CTX to each SSL object on creation, + * if this is not defined then each SSL object shares a pointer to the + * original key buffer owned by the SSL CTX. */ #if defined(OPENSSL_ALL) && !defined(WOLFSSL_NO_COPY_KEY) #undef WOLFSSL_COPY_KEY #define WOLFSSL_COPY_KEY From 60c249960232856bdc7f19cabc9eb39a197448ba Mon Sep 17 00:00:00 2001 From: Daniel Pouzzner Date: Fri, 27 Sep 2024 17:15:17 -0500 Subject: [PATCH 07/11] wolfssl/wolfcrypt/types.h: when defining fallback do-nothing SAVE_VECTOR_REGISTERS2(), also define SAVE_VECTOR_REGISTERS2_DOES_NOTHING, and likewise for fallback CAN_SAVE_VECTOR_REGISTERS, define CAN_SAVE_VECTOR_REGISTERS_ALWAYS_TRUE; wolfcrypt/src/aes.c: * when SAVE_VECTOR_REGISTERS2_DOES_NOTHING, define do-nothing VECTOR_REGISTERS_PUSH and VECTOR_REGISTERS_POP, to mollify Coverity CONSTANT_EXPRESSION_RESULT; * in AesGcmDecryptUpdate_aesni(), omit " && (c != NULL)" clause from computation of endA argument to AesGcmAadUpdate_aesni(), to mollify Coverity FORWARD_NULL (impermissible nullness is already checked and BAD_FUNC_ARGed by the sole caller, wc_AesGcmDecryptUpdate()); wolfcrypt/src/misc.c: add readUnalignedWord64(), writeUnalignedWord64(), readUnalignedWords64(), and writeUnalignedWords64(), for safe word64 access to possibly-unaligned data; wolfcrypt/src/wc_kyber_poly.c: use readUnalignedWords64() and readUnalignedWord64() to mitigate sanitizer-reported "load of misaligned address". 
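
For reference, the new helpers rely on the standard memcpy idiom for
unaligned access. A minimal standalone sketch of the technique (with
illustrative names, not the wolfSSL API):

    #include <stdint.h>
    #include <string.h>

    /* Read a 64-bit value from a possibly unaligned address.
     * Dereferencing a misaligned uint64_t* is undefined behaviour;
     * memcpy is well-defined, and compilers typically lower it to a
     * single load on targets that permit unaligned access. */
    static inline uint64_t load_u64_unaligned(const unsigned char *p)
    {
        uint64_t v;
        memcpy(&v, p, sizeof(v));
        return v;
    }

    /* The store direction is symmetric. */
    static inline void store_u64_unaligned(unsigned char *p, uint64_t v)
    {
        memcpy(p, &v, sizeof(v));
    }

The in-tree versions additionally test the pointer's alignment first and
take a direct word64 access on the aligned path.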
--- wolfcrypt/src/aes.c | 13 +++++- wolfcrypt/src/misc.c | 46 ++++++++++++++++++ wolfcrypt/src/wc_kyber_poly.c | 88 ++++++++++++++++------------------- wolfssl/wolfcrypt/misc.h | 8 ++++ wolfssl/wolfcrypt/types.h | 2 + 5 files changed, 107 insertions(+), 50 deletions(-) diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 4c9a8d1811..e76f66f135 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -4759,7 +4759,7 @@ int wc_AesSetIV(Aes* aes, const byte* iv) #ifdef WC_C_DYNAMIC_FALLBACK -#define VECTOR_REGISTERS_PUSH { \ +#define VECTOR_REGISTERS_PUSH { \ int orig_use_aesni = aes->use_aesni; \ if (aes->use_aesni && (SAVE_VECTOR_REGISTERS2() != 0)) { \ aes->use_aesni = 0; \ @@ -4774,6 +4774,15 @@ int wc_AesSetIV(Aes* aes, const byte* iv) } \ WC_DO_NOTHING +#elif defined(SAVE_VECTOR_REGISTERS2_DOES_NOTHING) + +#define VECTOR_REGISTERS_PUSH { \ + WC_DO_NOTHING + +#define VECTOR_REGISTERS_POP \ + } \ + WC_DO_NOTHING + #else #define VECTOR_REGISTERS_PUSH { \ @@ -9796,7 +9805,7 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni( ASSERT_SAVED_VECTOR_REGISTERS(); /* Hash in A, the Authentication Data */ - ret = AesGcmAadUpdate_aesni(aes, a, aSz, (cSz > 0) && (c != NULL)); + ret = AesGcmAadUpdate_aesni(aes, a, aSz, cSz > 0); if (ret != 0) return ret; diff --git a/wolfcrypt/src/misc.c b/wolfcrypt/src/misc.c index 7a9bcb02c9..e4b53d91f1 100644 --- a/wolfcrypt/src/misc.c +++ b/wolfcrypt/src/misc.c @@ -211,6 +211,52 @@ WC_MISC_STATIC WC_INLINE void ByteReverseWords(word32* out, const word32* in, #if defined(WORD64_AVAILABLE) && !defined(WOLFSSL_NO_WORD64_OPS) +WC_MISC_STATIC WC_INLINE word64 readUnalignedWord64(const byte *in) +{ + if (((wc_ptr_t)in & (wc_ptr_t)(sizeof(word64) - 1U)) == (wc_ptr_t)0) + return *(word64 *)in; + else { + word64 out; + XMEMCPY(&out, in, sizeof(word64)); + return out; + } +} + +WC_MISC_STATIC WC_INLINE word64 writeUnalignedWord64(void *out, word64 in) +{ + if (((wc_ptr_t)out & (wc_ptr_t)(sizeof(word64) - 1U)) == (wc_ptr_t)0) + *(word64 *)out = in; + else { + XMEMCPY(out, &in, sizeof(word64)); + } + return in; +} + +WC_MISC_STATIC WC_INLINE void readUnalignedWords64(word64 *out, const byte *in, + size_t count) +{ + if (((wc_ptr_t)in & (wc_ptr_t)(sizeof(word64) - 1U)) == (wc_ptr_t)0) { + const word64 *in_word64 = (const word64 *)in; + while (count-- > 0) + *out++ = *in_word64++; + } + else { + XMEMCPY(out, in, count * sizeof(word64)); + } +} + +WC_MISC_STATIC WC_INLINE void writeUnalignedWords64(byte *out, const word64 *in, + size_t count) +{ + if (((wc_ptr_t)out & (wc_ptr_t)(sizeof(word64) - 1U)) == (wc_ptr_t)0) { + word64 *out_word64 = (word64 *)out; + while (count-- > 0) + *out_word64++ = *in++; + } + else { + XMEMCPY(out, in, count * sizeof(word64)); + } +} WC_MISC_STATIC WC_INLINE word64 rotlFixed64(word64 x, word64 y) { diff --git a/wolfcrypt/src/wc_kyber_poly.c b/wolfcrypt/src/wc_kyber_poly.c index 492d159a8f..4514ad3179 100644 --- a/wolfcrypt/src/wc_kyber_poly.c +++ b/wolfcrypt/src/wc_kyber_poly.c @@ -67,6 +67,13 @@ #ifdef WOLFSSL_WC_KYBER +#ifdef NO_INLINE + #include +#else + #define WOLFSSL_MISC_INCLUDED + #include +#endif + /* Declared in wc_kyber.c to stop compiler optimizer from simplifying. 
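  * Reading a volatile value that the optimizer cannot see through keeps
  * the constant-time code that uses it from being simplified into
  * data-dependent branches.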
*/ extern volatile sword16 kyber_opt_blocker; @@ -1560,14 +1567,11 @@ static int kyber_gen_matrix_k3_avx2(sword16* a, byte* seed, int transposed) a += 4 * KYBER_N; } - state[0] = ((word64*)seed)[0]; - state[1] = ((word64*)seed)[1]; - state[2] = ((word64*)seed)[2]; - state[3] = ((word64*)seed)[3]; + readUnalignedWords64(state, seed, 4); /* Transposed value same as not. */ state[4] = 0x1f0000 + (2 << 8) + 2; XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); - state[20] = 0x8000000000000000UL; + state[20] = W64LIT(0x8000000000000000); for (i = 0; i < GEN_MATRIX_SIZE; i += SHA3_128_BYTES) { if (IS_INTEL_BMI2(cpuid_flags)) { sha3_block_bmi2(state); @@ -1748,14 +1752,11 @@ static int kyber_gen_matrix_k2_aarch64(sword16* a, byte* seed, int transposed) a += 3 * KYBER_N; - state[0] = ((word64*)seed)[0]; - state[1] = ((word64*)seed)[1]; - state[2] = ((word64*)seed)[2]; - state[3] = ((word64*)seed)[3]; + readUnalignedWords64(state, seed, 4); /* Transposed value same as not. */ state[4] = 0x1f0000 + (1 << 8) + 1; XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); - state[20] = 0x8000000000000000UL; + state[20] = W64LIT(0x8000000000000000); BlockSha3(state); p = (byte*)state; ctr0 = kyber_rej_uniform_neon(a, KYBER_N, p, XOF_BLOCK_SIZE); @@ -1899,14 +1900,11 @@ static int kyber_gen_matrix_k4_aarch64(sword16* a, byte* seed, int transposed) a += 3 * KYBER_N; } - state[0] = ((word64*)seed)[0]; - state[1] = ((word64*)seed)[1]; - state[2] = ((word64*)seed)[2]; - state[3] = ((word64*)seed)[3]; + readUnalignedWords64(state, seed, 4); /* Transposed value same as not. */ state[4] = 0x1f0000 + (3 << 8) + 3; XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); - state[20] = 0x8000000000000000UL; + state[20] = W64LIT(0x8000000000000000); BlockSha3(state); p = (byte*)state; ctr0 = kyber_rej_uniform_neon(a, KYBER_N, p, XOF_BLOCK_SIZE); @@ -2047,18 +2045,15 @@ static int kyber_prf(wc_Shake* shake256, byte* out, unsigned int outLen, const byte* key) { #ifdef USE_INTEL_SPEEDUP - int i; word64 state[25]; (void)shake256; - for (i = 0; i < KYBER_SYM_SZ / 8; i++) { - state[i] = ((word64*)key)[i]; - } + readUnalignedWords64(state, key, KYBER_SYM_SZ / sizeof(word64)); state[KYBER_SYM_SZ / 8] = 0x1f00 | key[KYBER_SYM_SZ]; XMEMSET(state + KYBER_SYM_SZ / 8 + 1, 0, (25 - KYBER_SYM_SZ / 8 - 1) * sizeof(word64)); - state[WC_SHA3_256_COUNT - 1] = 0x8000000000000000UL; + state[WC_SHA3_256_COUNT - 1] = W64LIT(0x8000000000000000); if (IS_INTEL_BMI2(cpuid_flags)) { sha3_block_bmi2(state); @@ -2098,15 +2093,12 @@ static int kyber_prf(wc_Shake* shake256, byte* out, unsigned int outLen, int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen) { word64 state[25]; - int i; - int len64 = seedLen / 8; + word32 len64 = seedLen / 8; - for (i = 0; i < len64; i++) { - state[i] = ((word64*)seed)[i]; - } + readUnalignedWords64(state, seed, len64); state[len64] = 0x1f; XMEMSET(state + len64 + 1, 0, (25 - len64 - 1) * sizeof(word64)); - state[WC_SHA3_256_COUNT - 1] = 0x8000000000000000UL; + state[WC_SHA3_256_COUNT - 1] = W64LIT(0x8000000000000000); if (IS_INTEL_BMI2(cpuid_flags)) { sha3_block_bmi2(state); @@ -2136,15 +2128,12 @@ int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen) int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen) { word64 state[25]; - int i; - int len64 = seedLen / 8; + word32 len64 = seedLen / 8; - for (i = 0; i < len64; i++) { - state[i] = ((word64*)seed)[i]; - } + readUnalignedWords64(state, seed, len64); state[len64] = 0x1f; XMEMSET(state + len64 + 1, 0, (25 - len64 - 1) * sizeof(word64)); - 
state[WC_SHA3_256_COUNT - 1] = 0x8000000000000000UL; + state[WC_SHA3_256_COUNT - 1] = W64LIT(0x8000000000000000); BlockSha3(state); XMEMCPY(out, state, outLen); @@ -2199,10 +2188,11 @@ static unsigned int kyber_rej_uniform_c(sword16* p, unsigned int len, i = 0; for (j = 0; j < minJ; j += 6) { /* Use 48 bits (6 bytes) as four 12-bit integers. */ - sword16 v0 = (*(word64*)r) & 0xfff; - sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; - sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; - sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + word64 r_word = readUnalignedWord64(r); + sword16 v0 = r_word & 0xfff; + sword16 v1 = (r_word >> 12) & 0xfff; + sword16 v2 = (r_word >> 24) & 0xfff; + sword16 v3 = (r_word >> 36) & 0xfff; p[i] = v0 & (0 - (v0 < KYBER_Q)); i += v0 < KYBER_Q; @@ -2219,10 +2209,11 @@ static unsigned int kyber_rej_uniform_c(sword16* p, unsigned int len, if (j < rLen) { for (; (i + 4 < len) && (j < rLen); j += 6) { /* Use 48 bits (6 bytes) as four 12-bit integers. */ - sword16 v0 = (*(word64*)r) & 0xfff; - sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; - sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; - sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + word64 r_word = readUnalignedWord64(r); + sword16 v0 = r_word & 0xfff; + sword16 v1 = (r_word >> 12) & 0xfff; + sword16 v2 = (r_word >> 24) & 0xfff; + sword16 v3 = (r_word >> 36) & 0xfff; p[i] = v0; i += v0 < KYBER_Q; @@ -2238,10 +2229,11 @@ static unsigned int kyber_rej_uniform_c(sword16* p, unsigned int len, } for (; (i < len) && (j < rLen); j += 6) { /* Use 48 bits (6 bytes) as four 12-bit integers. */ - sword16 v0 = (*(word64*)r) & 0xfff; - sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; - sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; - sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + word64 r_word = readUnalignedWord64(r); + sword16 v0 = r_word & 0xfff; + sword16 v1 = (r_word >> 12) & 0xfff; + sword16 v2 = (r_word >> 24) & 0xfff; + sword16 v3 = (r_word >> 36) & 0xfff; /* Reject first 12-bit integer if greater than or equal to q. */ if (v0 < KYBER_Q) { @@ -2511,9 +2503,9 @@ static void kyber_cbd_eta2(sword16* p, const byte* r) #endif /* Take the next 8 bytes, little endian, as a 64 bit value. */ #ifdef BIG_ENDIAN_ORDER - word64 t = ByteReverseWord64(*(word64*)r); + word64 t = ByteReverseWord64(readUnalignedWord64(r)); #else - word64 t = *(word64*)r; + word64 t = readUnalignedWord64(r); #endif word64 d; /* Add second bits to first. */ @@ -3023,7 +3015,7 @@ static void kyber_get_noise_eta3_aarch64(byte* rand, byte* seed, byte o) state[3] = ((word64*)seed)[3]; state[4] = 0x1f00 + o; XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); - state[16] = 0x8000000000000000UL; + state[16] = W64LIT(0x8000000000000000); BlockSha3(state); XMEMCPY(rand , state, SHA3_256_BYTES); BlockSha3(state); @@ -3083,7 +3075,7 @@ static void kyber_get_noise_eta2_aarch64(byte* rand, byte* seed, byte o) /* Transposed value same as not. 
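      * The low byte of this word is the block nonce o and the next byte
      * is the SHAKE-256 padding/domain-separation value 0x1f.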
*/ state[4] = 0x1f00 + o; XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); - state[16] = 0x8000000000000000UL; + state[16] = W64LIT(0x8000000000000000); BlockSha3(state); } diff --git a/wolfssl/wolfcrypt/misc.h b/wolfssl/wolfcrypt/misc.h index cc068db441..9acc31b121 100644 --- a/wolfssl/wolfcrypt/misc.h +++ b/wolfssl/wolfcrypt/misc.h @@ -76,6 +76,14 @@ int ConstantCompare(const byte* a, const byte* b, int length); #ifdef WORD64_AVAILABLE WOLFSSL_LOCAL +word64 readUnalignedWord64(const byte *in); +WOLFSSL_LOCAL +word64 writeUnalignedWord64(void *out, word64 in); +WOLFSSL_LOCAL +void readUnalignedWords64(word64 *out, const byte *in, size_t count); +WOLFSSL_LOCAL +void writeUnalignedWords64(byte *out, const word64 *in, size_t count); +WOLFSSL_LOCAL word64 rotlFixed64(word64 x, word64 y); WOLFSSL_LOCAL word64 rotrFixed64(word64 x, word64 y); diff --git a/wolfssl/wolfcrypt/types.h b/wolfssl/wolfcrypt/types.h index 6ff0736220..2177722977 100644 --- a/wolfssl/wolfcrypt/types.h +++ b/wolfssl/wolfcrypt/types.h @@ -1729,9 +1729,11 @@ typedef struct w64wrapper { #endif #ifndef SAVE_VECTOR_REGISTERS2 #define SAVE_VECTOR_REGISTERS2() 0 + #define SAVE_VECTOR_REGISTERS2_DOES_NOTHING #endif #ifndef CAN_SAVE_VECTOR_REGISTERS #define CAN_SAVE_VECTOR_REGISTERS() 1 + #define CAN_SAVE_VECTOR_REGISTERS_ALWAYS_TRUE #endif #ifndef WC_DEBUG_SET_VECTOR_REGISTERS_RETVAL #define WC_DEBUG_SET_VECTOR_REGISTERS_RETVAL(x) WC_DO_NOTHING From e4301bc5547b5bb6cc65d467859b79f8344d23eb Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 26 Sep 2024 22:15:46 +1000 Subject: [PATCH 08/11] ARM32 generated files: fix line lengths Generated ARM32 assembly files no longer have lines with more than 80 characters. --- src/include.am | 6 - wolfcrypt/src/port/arm/armv8-32-aes-asm.S | 3 +- wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c | 190 +++++++++++------ wolfcrypt/src/port/arm/armv8-32-chacha-asm.S | 3 +- .../src/port/arm/armv8-32-chacha-asm_c.c | 39 ++-- wolfcrypt/src/port/arm/armv8-32-curve25519.S | 3 +- .../src/port/arm/armv8-32-curve25519_c.c | 201 ++++++++++-------- .../src/port/arm/armv8-32-poly1305-asm.S | 3 +- .../src/port/arm/armv8-32-poly1305-asm_c.c | 29 ++- wolfcrypt/src/port/arm/armv8-32-sha256-asm.S | 3 +- .../src/port/arm/armv8-32-sha256-asm_c.c | 26 ++- wolfcrypt/src/port/arm/armv8-32-sha3-asm.S | 3 +- wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c | 24 ++- wolfcrypt/src/port/arm/armv8-32-sha512-asm.S | 3 +- .../src/port/arm/armv8-32-sha512-asm_c.c | 26 ++- wolfcrypt/src/port/arm/armv8-chacha.c | 2 +- 16 files changed, 346 insertions(+), 218 deletions(-) diff --git a/src/include.am b/src/include.am index dbda409a2f..fa182f6ad9 100644 --- a/src/include.am +++ b/src/include.am @@ -164,13 +164,11 @@ if BUILD_ARMASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c endif BUILD_ARMASM if BUILD_ARMASM_NEON -if !BUILD_ARMASM_CRYPTO if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S endif !BUILD_ARMASM_INLINE -endif !BUILD_ARMASM_CRYPTO else if BUILD_ARMASM if BUILD_ARMASM_INLINE @@ -336,13 +334,11 @@ if BUILD_ARMASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c endif BUILD_ARMASM if BUILD_ARMASM_NEON -if !BUILD_ARMASM_CRYPTO if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S endif 
!BUILD_ARMASM_INLINE -endif !BUILD_ARMASM_CRYPTO else if BUILD_ARMASM if BUILD_ARMASM_INLINE @@ -701,7 +697,6 @@ if BUILD_ARMASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c endif BUILD_ARMASM if BUILD_ARMASM_NEON -if !BUILD_ARMASM_CRYPTO if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-aes-asm_c.c @@ -709,7 +704,6 @@ else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-aes-asm.S endif !BUILD_ARMASM_INLINE -endif !BUILD_ARMASM_CRYPTO else if BUILD_ARMASM if BUILD_ARMASM_INLINE diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S index 345f19408e..553acadc29 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./aes/aes.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.S + * ruby ./aes/aes.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.S */ #ifdef HAVE_CONFIG_H diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c index f8ba89ac09..c21fbea524 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./aes/aes.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.c + * ruby ./aes/aes.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.c */ #ifdef HAVE_CONFIG_H @@ -123,7 +124,9 @@ static const uint32_t L_AES_ARM32_td_data[] = { }; #endif /* HAVE_AES_DECRYPT */ -#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \ + defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) static const uint32_t L_AES_ARM32_te_data[] = { 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5, @@ -191,15 +194,19 @@ static const uint32_t L_AES_ARM32_te_data[] = { 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616, }; -#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || + * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_DECRYPT static const uint32_t* L_AES_ARM32_td = L_AES_ARM32_td_data; #endif /* HAVE_AES_DECRYPT */ -#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \ + defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) static const uint32_t* L_AES_ARM32_te = L_AES_ARM32_te_data; -#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || + * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_DECRYPT -void AES_invert_key(unsigned char* ks, word32 rounds); +void 
AES_invert_key(unsigned char* ks_p, word32 rounds_p); void AES_invert_key(unsigned char* ks_p, word32 rounds_p) { register unsigned char* ks asm ("r0") = (unsigned char*)ks_p; @@ -401,9 +408,12 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) "str r8, [%[ks]], #4\n\t" "subs r11, r11, #1\n\t" "bne L_AES_invert_key_mix_loop_%=\n\t" - : [ks] "+r" (ks), [rounds] "+r" (rounds), [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c), [L_AES_ARM32_td] "+r" (L_AES_ARM32_td_c) + : [ks] "+r" (ks), [rounds] "+r" (rounds), + [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c), + [L_AES_ARM32_td] "+r" (L_AES_ARM32_td_c) : - : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", + "r10", "r11" ); } @@ -411,17 +421,20 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) static const uint32_t L_AES_ARM32_rcon[] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1b000000, 0x36000000, + 0x1b000000, 0x36000000 }; -void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks); -void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char* ks_p) +void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, + unsigned char* ks_p); +void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, + unsigned char* ks_p) { register const unsigned char* key asm ("r0") = (const unsigned char*)key_p; register word32 len asm ("r1") = (word32)len_p; register unsigned char* ks asm ("r2") = (unsigned char*)ks_p; register uint32_t* L_AES_ARM32_te_c asm ("r3") = (uint32_t*)L_AES_ARM32_te; - register uint32_t* L_AES_ARM32_rcon_c asm ("r4") = (uint32_t*)&L_AES_ARM32_rcon; + register uint32_t* L_AES_ARM32_rcon_c asm ("r4") = + (uint32_t*)&L_AES_ARM32_rcon; __asm__ __volatile__ ( "mov r8, %[L_AES_ARM32_te]\n\t" @@ -922,14 +935,18 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "bne L_AES_set_encrypt_key_loop_128_%=\n\t" "\n" "L_AES_set_encrypt_key_end_%=: \n\t" - : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c), [L_AES_ARM32_rcon] "+r" (L_AES_ARM32_rcon_c) + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), + [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c), + [L_AES_ARM32_rcon] "+r" (L_AES_ARM32_rcon_c) : - : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "cc" + : "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8" ); } -void AES_encrypt_block(const uint32_t* te, int nr, int len, const uint32_t* ks); -void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t* ks_p) +void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, + const uint32_t* ks_p); +void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, + const uint32_t* ks_p) { register const uint32_t* te asm ("r0") = (const uint32_t*)te_p; register int nr asm ("r1") = (int)nr_p; @@ -1573,23 +1590,27 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t "eor r5, r5, r9\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" - : [te] "+r" (te), [nr] "+r" (nr), [len] "+r" (len), [ks] "+r" (ks) + : [te] "+r" (te), [nr] "+r" (nr), [len] "+r" (len), [ks] "+r" (ks) : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } -#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) static 
const uint32_t* L_AES_ARM32_te_ecb = L_AES_ARM32_te_data; -void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr); -void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p) +void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p); +void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p) { register const unsigned char* in asm ("r0") = (const unsigned char*)in_p; register unsigned char* out asm ("r1") = (unsigned char*)out_p; register unsigned long len asm ("r2") = (unsigned long)len_p; register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p; register int nr asm ("r4") = (int)nr_p; - register uint32_t* L_AES_ARM32_te_ecb_c asm ("r5") = (uint32_t*)L_AES_ARM32_te_ecb; + register uint32_t* L_AES_ARM32_te_ecb_c asm ("r5") = + (uint32_t*)L_AES_ARM32_te_ecb; __asm__ __volatile__ ( "mov lr, %[in]\n\t" @@ -1822,17 +1843,23 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "\n" "L_AES_ECB_encrypt_end_%=: \n\t" "pop {%[ks]}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_ARM32_te_ecb] "+r" (L_AES_ARM32_te_ecb_c) + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), + [nr] "+r" (nr), [L_AES_ARM32_te_ecb] "+r" (L_AES_ARM32_te_ecb_c) : - : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" ); } -#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || + * WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_CBC static const uint32_t* L_AES_ARM32_te_cbc = L_AES_ARM32_te_data; -void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); -void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p) +void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p, + unsigned char* iv_p); +void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p, + unsigned char* iv_p) { register const unsigned char* in asm ("r0") = (const unsigned char*)in_p; register unsigned char* out asm ("r1") = (unsigned char*)out_p; @@ -1840,7 +1867,8 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p; register int nr asm ("r4") = (int)nr_p; register unsigned char* iv asm ("r5") = (unsigned char*)iv_p; - register uint32_t* L_AES_ARM32_te_cbc_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_cbc; + register uint32_t* L_AES_ARM32_te_cbc_c asm ("r6") = + (uint32_t*)L_AES_ARM32_te_cbc; __asm__ __volatile__ ( "mov r8, r4\n\t" @@ -2088,17 +2116,23 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "L_AES_CBC_encrypt_end_%=: \n\t" "pop {%[ks], r9}\n\t" "stm r9, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_ARM32_te_cbc] "+r" (L_AES_ARM32_te_cbc_c) + : [in] "+r" (in), [out] "+r" 
(out), [len] "+r" (len), [ks] "+r" (ks), + [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_ARM32_te_cbc] "+r" (L_AES_ARM32_te_cbc_c) : - : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } #endif /* HAVE_AES_CBC */ #ifdef WOLFSSL_AES_COUNTER static const uint32_t* L_AES_ARM32_te_ctr = L_AES_ARM32_te_data; -void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); -void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p) +void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p, + unsigned char* ctr_p); +void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p, + unsigned char* ctr_p) { register const unsigned char* in asm ("r0") = (const unsigned char*)in_p; register unsigned char* out asm ("r1") = (unsigned char*)out_p; @@ -2106,7 +2140,8 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p; register int nr asm ("r4") = (int)nr_p; register unsigned char* ctr asm ("r5") = (unsigned char*)ctr_p; - register uint32_t* L_AES_ARM32_te_ctr_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_ctr; + register uint32_t* L_AES_ARM32_te_ctr_c asm ("r6") = + (uint32_t*)L_AES_ARM32_te_ctr; __asm__ __volatile__ ( "mov r12, r4\n\t" @@ -2356,16 +2391,19 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "rev r7, r7\n\t" #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm r8, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_ARM32_te_ctr] "+r" (L_AES_ARM32_te_ctr_c) + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), + [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_ARM32_te_ctr] "+r" (L_AES_ARM32_te_ctr_c) : - : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } #endif /* WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_DECRYPT -#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC) -void AES_decrypt_block(const uint32_t* td, int nr, const uint8_t* td4); +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_CBC) +void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p); void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p) { register const uint32_t* td asm ("r0") = (const uint32_t*)td_p; @@ -3009,9 +3047,9 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p) "eor r5, r5, r9\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" - : [td] "+r" (td), [nr] "+r" (nr), [td4] "+r" (td4) + : [td] "+r" (td), [nr] "+r" (nr), [td4] "+r" (td4) : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -3052,16 +3090,20 @@ static const unsigned char L_AES_ARM32_td4[] = { }; #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) -void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr); -void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p) +void AES_ECB_decrypt(const 
unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p); +void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p) { register const unsigned char* in asm ("r0") = (const unsigned char*)in_p; register unsigned char* out asm ("r1") = (unsigned char*)out_p; register unsigned long len asm ("r2") = (unsigned long)len_p; register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p; register int nr asm ("r4") = (int)nr_p; - register uint32_t* L_AES_ARM32_td_ecb_c asm ("r5") = (uint32_t*)L_AES_ARM32_td_ecb; - register unsigned char* L_AES_ARM32_td4_c asm ("r6") = (unsigned char*)&L_AES_ARM32_td4; + register uint32_t* L_AES_ARM32_td_ecb_c asm ("r5") = + (uint32_t*)L_AES_ARM32_td_ecb; + register unsigned char* L_AES_ARM32_td4_c asm ("r6") = + (unsigned char*)&L_AES_ARM32_td4; __asm__ __volatile__ ( "mov r8, r4\n\t" @@ -3291,16 +3333,22 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bne L_AES_ECB_decrypt_loop_block_128_%=\n\t" "\n" "L_AES_ECB_decrypt_end_%=: \n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c), [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c) + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), + [nr] "+r" (nr), [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c), + [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c) : - : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_CBC -void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); -void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p) +void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p, + unsigned char* iv_p); +void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p, + unsigned char* iv_p) { register const unsigned char* in asm ("r0") = (const unsigned char*)in_p; register unsigned char* out asm ("r1") = (unsigned char*)out_p; @@ -3308,8 +3356,10 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p; register int nr asm ("r4") = (int)nr_p; register unsigned char* iv asm ("r5") = (unsigned char*)iv_p; - register uint32_t* L_AES_ARM32_td_ecb_c asm ("r6") = (uint32_t*)L_AES_ARM32_td_ecb; - register unsigned char* L_AES_ARM32_td4_c asm ("r7") = (unsigned char*)&L_AES_ARM32_td4; + register uint32_t* L_AES_ARM32_td_ecb_c asm ("r6") = + (uint32_t*)L_AES_ARM32_td_ecb; + register unsigned char* L_AES_ARM32_td4_c asm ("r7") = + (unsigned char*)&L_AES_ARM32_td4; __asm__ __volatile__ ( "mov r8, r4\n\t" @@ -3923,9 +3973,12 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "\n" "L_AES_CBC_decrypt_end_%=: \n\t" "pop {%[ks]-r4}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c), [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c) + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] 
"+r" (ks), + [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c), + [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c) : - : "memory", "r12", "lr", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r8", "r9", "r10", "r11" ); } @@ -3940,14 +3993,18 @@ static const uint32_t L_GCM_gmult_len_r[] = { 0x91800000, 0x8da00000, 0xa9c00000, 0xb5e00000, }; -void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned char* data, unsigned long len); -void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, const unsigned char* data_p, unsigned long len_p) +void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, + const unsigned char* data_p, unsigned long len_p); +void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, + const unsigned char* data_p, unsigned long len_p) { register unsigned char* x asm ("r0") = (unsigned char*)x_p; register const unsigned char** m asm ("r1") = (const unsigned char**)m_p; - register const unsigned char* data asm ("r2") = (const unsigned char*)data_p; + register const unsigned char* data asm ("r2") = + (const unsigned char*)data_p; register unsigned long len asm ("r3") = (unsigned long)len_p; - register uint32_t* L_GCM_gmult_len_r_c asm ("r4") = (uint32_t*)&L_GCM_gmult_len_r; + register uint32_t* L_GCM_gmult_len_r_c asm ("r4") = + (uint32_t*)&L_GCM_gmult_len_r; __asm__ __volatile__ ( "mov lr, %[L_GCM_gmult_len_r]\n\t" @@ -4521,15 +4578,21 @@ void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, const unsigned "subs %[len], %[len], #16\n\t" "add %[data], %[data], #16\n\t" "bne L_GCM_gmult_len_start_block_%=\n\t" - : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), + [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) : - : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", + "r11" ); } static const uint32_t* L_AES_ARM32_te_gcm = L_AES_ARM32_te_data; -void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); -void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p) +void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p, + unsigned char* ctr_p); +void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, + unsigned long len_p, const unsigned char* ks_p, int nr_p, + unsigned char* ctr_p) { register const unsigned char* in asm ("r0") = (const unsigned char*)in_p; register unsigned char* out asm ("r1") = (unsigned char*)out_p; @@ -4537,7 +4600,8 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p; register int nr asm ("r4") = (int)nr_p; register unsigned char* ctr asm ("r5") = (unsigned char*)ctr_p; - register uint32_t* L_AES_ARM32_te_gcm_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_gcm; + register uint32_t* L_AES_ARM32_te_gcm_c asm ("r6") = + (uint32_t*)L_AES_ARM32_te_gcm; __asm__ __volatile__ ( "mov r12, r4\n\t" @@ -4778,9 +4842,11 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "rev r7, r7\n\t" #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm r8, {r4, r5, r6, 
r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_ARM32_te_gcm] "+r" (L_AES_ARM32_te_gcm_c) + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), + [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_ARM32_te_gcm] "+r" (L_AES_ARM32_te_gcm_c) : - : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S b/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S index 77ec219081..b19bf515c2 100644 --- a/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./chacha/chacha.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S + * ruby ./chacha/chacha.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S */ #ifdef HAVE_CONFIG_H diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c index 8c80fc4ad9..201cf2ee31 100644 --- a/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./chacha/chacha.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.c + * ruby ./chacha/chacha.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.c */ #ifdef HAVE_CONFIG_H @@ -72,9 +73,9 @@ void wc_chacha_setiv(word32* x_p, const byte* iv_p, word32 counter_p) "rev lr, lr\n\t" #endif /* BIG_ENDIAN_ORDER */ "stm r3, {r4, r12, lr}\n\t" - : [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter) + : [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter) : - : "memory", "r3", "r12", "lr", "r4", "cc" + : "memory", "cc", "r3", "r12", "lr", "r4" ); } @@ -88,7 +89,8 @@ void wc_chacha_setkey(word32* x_p, const byte* key_p, word32 keySz_p) register word32* x asm ("r0") = (word32*)x_p; register const byte* key asm ("r1") = (const byte*)key_p; register word32 keySz asm ("r2") = (word32)keySz_p; - register uint32_t* L_chacha_arm32_constants_c asm ("r3") = (uint32_t*)&L_chacha_arm32_constants; + register uint32_t* L_chacha_arm32_constants_c asm ("r3") = + (uint32_t*)&L_chacha_arm32_constants; __asm__ __volatile__ ( "subs %[keySz], %[keySz], #16\n\t" @@ -119,14 +121,16 @@ void wc_chacha_setkey(word32* x_p, const byte* key_p, word32 keySz_p) "\n" "L_chacha_arm32_setkey_same_keyb_ytes_%=: \n\t" "stm %[x], {r4, r5, r12, lr}\n\t" - : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz), [L_chacha_arm32_constants] "+r" (L_chacha_arm32_constants_c) + : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz), + [L_chacha_arm32_constants] "+r" (L_chacha_arm32_constants_c) : - : "memory", "r12", "lr", "r4", "r5", "cc" + : "memory", "cc", "r12", "lr", "r4", "r5" ); } #ifdef WOLFSSL_ARMASM_NO_NEON -void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, word32 len_p) +void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, + word32 len_p) { register ChaCha* ctx asm ("r0") = (ChaCha*)ctx_p; register byte* c asm ("r1") = (byte*)c_p; @@ -176,7 +180,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, word32 len "strd r6, r7, [sp, #24]\n\t" #endif /* Load x[0]..x[12] into registers. 
*/ - "ldm lr, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "ldm lr, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12}\n\t" /* 10x 2 full rounds to perform. */ "mov lr, #10\n\t" "str lr, [sp, #48]\n\t" @@ -414,9 +418,9 @@ void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, word32 len /* Store in over field of ChaCha. */ "ldr lr, [sp, #32]\n\t" "add r12, lr, #0x44\n\t" - "stm r12!, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" - "ldm sp, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" - "stm r12, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" + "stm r12!, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + "ldm sp, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + "stm r12, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr %[m], [sp, #40]\n\t" "ldr %[len], [sp, #44]\n\t" @@ -482,13 +486,15 @@ void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, word32 len "\n" "L_chacha_arm32_crypt_done_%=: \n\t" "add sp, sp, #52\n\t" - : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len) + : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len) : - : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", + "r10", "r11" ); } -void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p, word32 len_p) +void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p, + word32 len_p) { register byte* over asm ("r0") = (byte*)over_p; register byte* output asm ("r1") = (byte*)output_p; @@ -553,9 +559,10 @@ void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p, word3 "b L_chacha_arm32_over_byte_loop_%=\n\t" "\n" "L_chacha_arm32_over_done_%=: \n\t" - : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input), [len] "+r" (len) + : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input), + [len] "+r" (len) : - : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9" ); } diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S index 69cb22e4e4..bf8daeec0e 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S + * ruby ./x25519/x25519.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S */ #ifdef HAVE_CONFIG_H diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c index 09ef2eb439..d00916ec66 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c + * ruby ./x25519/x25519.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c */ #ifdef HAVE_CONFIG_H @@ -282,7 +283,7 @@ void fe_add_sub_op() /* Done Add-Sub */ : : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -324,7 +325,7 @@ void fe_sub_op() /* Done Sub */ : : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -336,9 +337,10 @@ void fe_sub(fe r_p, const fe a_p, const fe b_p) __asm__ __volatile__ ( "bl fe_sub_op\n\t" - : [r] "+r" (r), [a] "+r" 
(a), [b] "+r" (b) + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -381,7 +383,7 @@ void fe_add_op() /* Done Add */ : : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -393,9 +395,10 @@ void fe_add(fe r_p, const fe a_p, const fe b_p) __asm__ __volatile__ ( "bl fe_add_op\n\t" - : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -427,9 +430,9 @@ void fe_frombytes(fe out_p, const unsigned char* in_p) "str r7, [%[out], #20]\n\t" "str r8, [%[out], #24]\n\t" "str r9, [%[out], #28]\n\t" - : [out] "+r" (out), [in] "+r" (in) + : [out] "+r" (out), [in] "+r" (in) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -471,9 +474,9 @@ void fe_tobytes(unsigned char* out_p, const fe n_p) "str r7, [%[out], #20]\n\t" "str r8, [%[out], #24]\n\t" "str r9, [%[out], #28]\n\t" - : [out] "+r" (out), [n] "+r" (n) + : [out] "+r" (out), [n] "+r" (n) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12" ); } @@ -494,7 +497,7 @@ void fe_1(fe n_p) "stm %[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" : [n] "+r" (n) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -515,7 +518,7 @@ void fe_0(fe n_p) "stm %[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" : [n] "+r" (n) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -574,9 +577,9 @@ void fe_copy(fe r_p, const fe a_p) #else "strd r4, r5, [%[r], #24]\n\t" #endif - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r2", "r3", "r4", "r5", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5" ); } @@ -601,9 +604,9 @@ void fe_neg(fe r_p, const fe a_p) "sbcs r4, lr, r4\n\t" "sbc r5, r12, r5\n\t" "stm %[r]!, {r2, r3, r4, r5}\n\t" - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r2", "r3", "r4", "r5", "r12", "lr", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r12", "lr" ); } @@ -645,7 +648,8 @@ int fe_isnonzero(const fe a_p) "orr %[a], r2, r4\n\t" : [a] "+r" (a) : - : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "cc" + : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", + "r12" ); return (uint32_t)(size_t)a; } @@ -671,7 +675,7 @@ int fe_isnegative(const fe a_p) "eor %[a], %[a], r1\n\t" : [a] "+r" (a) : - : "memory", "r1", "r2", "r3", "r4", "r5", "cc" + : "memory", "cc", "r1", "r2", "r3", "r4", "r5" ); return (uint32_t)(size_t)a; } @@ -2405,9 +2409,10 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) #else "strd r8, r9, [%[r], #88]\n\t" #endif - : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r3", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r3", "r10", + "r11", "r12", "lr" ); } @@ -2525,9 +2530,10 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) 
"and r7, r7, lr\n\t" "stm %[r]!, {r4, r5, r6, r7}\n\t" "sub %[base], %[base], %[b]\n\t" - : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -2914,7 +2920,7 @@ void fe_mul_op() "add sp, sp, #40\n\t" : : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -3057,7 +3063,7 @@ void fe_mul_op() "add sp, sp, #16\n\t" : : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -3070,9 +3076,10 @@ void fe_mul(fe r_p, const fe a_p, const fe b_p) __asm__ __volatile__ ( "bl fe_mul_op\n\t" - : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -3349,7 +3356,7 @@ void fe_sq_op() "add sp, sp, #0x44\n\t" : : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -3478,7 +3485,7 @@ void fe_sq_op() "stm lr, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" : : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -3490,9 +3497,10 @@ void fe_sq(fe r_p, const fe a_p) __asm__ __volatile__ ( "bl fe_sq_op\n\t" - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "r11", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", + "lr", "r10", "r11" ); } @@ -3562,9 +3570,10 @@ void fe_mul121666(fe r_p, fe a_p) "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" "stm %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", + "lr", "r10" ); } @@ -3620,9 +3629,10 @@ void fe_mul121666(fe r_p, fe a_p) "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" "stm %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", + "lr", "r10" ); } @@ -4010,9 +4020,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "bl fe_mul_op\n\t" "mov r0, #0\n\t" "add sp, sp, #0xbc\n\t" - : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "lr", "cc" + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", + "r3", "r12", "lr" ); return (uint32_t)(size_t)r; } @@ -4323,9 +4334,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" "mov r0, #0\n\t" "add sp, sp, #0xc0\n\t" - : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "lr", "cc" + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", + "r3", "r12", "lr" ); return (uint32_t)(size_t)r; } @@ -4497,9 +4509,10 @@ void fe_invert(fe r_p, const fe a_p) "ldr %[a], [sp, #132]\n\t" "ldr %[r], [sp, #128]\n\t" "add sp, sp, #0x88\n\t" - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", 
"lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", + "r9", "r10", "r11" ); } @@ -4817,9 +4830,9 @@ void fe_sq2(fe r_p, const fe a_p) "ldr r0, [sp, #64]\n\t" "stm r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" "add sp, sp, #0x44\n\t" - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -4996,9 +5009,9 @@ void fe_sq2(fe r_p, const fe a_p) "stm r12, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" "mov r0, r12\n\t" "mov r1, lr\n\t" - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "lr", "cc" + : "memory", "cc", "lr" ); } @@ -5167,9 +5180,10 @@ void fe_pow22523(fe r_p, const fe a_p) "ldr %[a], [sp, #100]\n\t" "ldr %[r], [sp, #96]\n\t" "add sp, sp, #0x68\n\t" - : [r] "+r" (r), [a] "+r" (a) + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", + "r9", "r10", "r11" ); } @@ -5197,9 +5211,10 @@ void ge_p1p1_to_p2(ge_p2 * r_p, const ge_p1p1 * p_p) "add r0, r0, #0x40\n\t" "bl fe_mul_op\n\t" "add sp, sp, #8\n\t" - : [r] "+r" (r), [p] "+r" (p) + : [r] "+r" (r), [p] "+r" (p) : - : "memory", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", + "r9", "r10", "r11" ); } @@ -5232,9 +5247,10 @@ void ge_p1p1_to_p3(ge_p3 * r_p, const ge_p1p1 * p_p) "add r0, r0, #0x60\n\t" "bl fe_mul_op\n\t" "add sp, sp, #8\n\t" - : [r] "+r" (r), [p] "+r" (p) + : [r] "+r" (r), [p] "+r" (p) : - : "memory", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", + "r9", "r10", "r11" ); } @@ -5279,9 +5295,10 @@ void ge_p2_dbl(ge_p1p1 * r_p, const ge_p2 * p_p) "mov r1, r0\n\t" "bl fe_sub_op\n\t" "add sp, sp, #8\n\t" - : [r] "+r" (r), [p] "+r" (p) + : [r] "+r" (r), [p] "+r" (p) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -5365,9 +5382,10 @@ void ge_madd(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p) "add r1, r0, #32\n\t" "bl fe_add_sub_op\n\t" "add sp, sp, #12\n\t" - : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -5452,9 +5470,10 @@ void ge_msub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p) "add r0, r0, #32\n\t" "bl fe_add_sub_op\n\t" "add sp, sp, #12\n\t" - : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -5539,9 +5558,10 @@ void ge_add(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p) "add r0, r0, #32\n\t" "bl fe_add_sub_op\n\t" "add sp, sp, #44\n\t" - : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r3", "r4", "r5", "r6", "r7", 
"r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -5626,9 +5646,10 @@ void ge_sub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p) "add r0, r0, #0x40\n\t" "bl fe_add_sub_op\n\t" "add sp, sp, #44\n\t" - : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "lr" ); } @@ -6408,7 +6429,8 @@ void sc_reduce(byte* s_p) "add sp, sp, #56\n\t" : [s] "+r" (s) : - : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", + "r10", "r11", "r12", "lr" ); } @@ -7059,7 +7081,8 @@ void sc_reduce(byte* s_p) "add sp, sp, #56\n\t" : [s] "+r" (s) : - : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", + "r10", "r11", "r12", "lr" ); } @@ -7076,7 +7099,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) __asm__ __volatile__ ( "sub sp, sp, #0x50\n\t" "add lr, sp, #0x44\n\t" - "stm lr, {%[s], %[a], %[c]}\n\t" + "stm lr, {r0, r1, r3}\n\t" "mov %[s], #0\n\t" "ldr r12, [%[a]]\n\t" /* A[0] * B[0] */ @@ -7402,24 +7425,24 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "adc r10, %[s], #0\n\t" "umlal r9, r10, r12, lr\n\t" "add lr, sp, #32\n\t" - "stm lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" "mov %[s], sp\n\t" /* Add c to a * b */ "ldr lr, [sp, #76]\n\t" - "ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" - "ldm lr!, {%[a], r10, r11, r12}\n\t" + "ldm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ldm lr!, {r1, r10, r11, r12}\n\t" "adds %[b], %[b], %[a]\n\t" "adcs %[c], %[c], r10\n\t" "adcs r4, r4, r11\n\t" "adcs r5, r5, r12\n\t" - "ldm lr!, {%[a], r10, r11, r12}\n\t" + "ldm lr!, {r1, r10, r11, r12}\n\t" "adcs r6, r6, %[a]\n\t" "adcs r7, r7, r10\n\t" "adcs r8, r8, r11\n\t" "adcs r9, r9, r12\n\t" "mov %[a], r9\n\t" - "stm %[s]!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" - "ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "stm %[s]!, {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ldm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" "adcs %[b], %[b], #0\n\t" "adcs %[c], %[c], #0\n\t" "adcs r4, r4, #0\n\t" @@ -7918,7 +7941,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r12, sp\n\t" /* Load bits 252-376 */ "add r12, r12, #28\n\t" - "ldm r12, {%[a], %[b], %[c], r4, r5}\n\t" + "ldm r12, {r1, r2, r3, r4, r5}\n\t" "lsl r5, r5, #4\n\t" "orr r5, r5, r4, lsr #28\n\t" "lsl r4, r4, #4\n\t" @@ -8097,7 +8120,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "sbcs r9, r9, r5\n\t" "sbc %[a], %[a], %[a]\n\t" "sub %[s], %[s], #16\n\t" - "ldm %[s], {%[b], %[c], r4, r5}\n\t" + "ldm %[s], {r2, r3, r4, r5}\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r10, #0x5c\n\t" "lsl r10, r10, #8\n\t" @@ -8199,9 +8222,10 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "str r8, [%[s], #24]\n\t" "str r9, [%[s], #28]\n\t" "add sp, sp, #0x50\n\t" - : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) + : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r4", "r5", "r6", "r7", 
"r8", "r9", "r10", "r11", + "r12", "lr" ); } @@ -8216,9 +8240,9 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) __asm__ __volatile__ ( "sub sp, sp, #0x50\n\t" "add lr, sp, #0x44\n\t" - "stm lr, {%[s], %[a], %[c]}\n\t" + "stm lr, {r0, r1, r3}\n\t" "mov lr, %[b]\n\t" - "ldm %[a], {%[s], %[a], %[b], %[c]}\n\t" + "ldm %[a], {r0, r1, r2, r3}\n\t" "ldm lr!, {r4, r5, r6}\n\t" "umull r10, r11, %[s], r4\n\t" "umull r12, r7, %[a], r4\n\t" @@ -8263,7 +8287,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "umaal r4, r6, %[b], r7\n\t" "sub lr, lr, #16\n\t" "umaal r5, r6, %[c], r7\n\t" - "ldm %[s], {%[s], %[a], %[b], %[c]}\n\t" + "ldm %[s], {r0, r1, r2, r3}\n\t" "str r6, [sp, #64]\n\t" "ldm lr!, {r6}\n\t" "mov r7, #0\n\t" @@ -8315,24 +8339,24 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "umaal r9, r10, %[c], lr\n\t" "mov %[c], r12\n\t" "add lr, sp, #32\n\t" - "stm lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" "mov %[s], sp\n\t" /* Add c to a * b */ "ldr lr, [sp, #76]\n\t" - "ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" - "ldm lr!, {%[a], r10, r11, r12}\n\t" + "ldm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ldm lr!, {r1, r10, r11, r12}\n\t" "adds %[b], %[b], %[a]\n\t" "adcs %[c], %[c], r10\n\t" "adcs r4, r4, r11\n\t" "adcs r5, r5, r12\n\t" - "ldm lr!, {%[a], r10, r11, r12}\n\t" + "ldm lr!, {r1, r10, r11, r12}\n\t" "adcs r6, r6, %[a]\n\t" "adcs r7, r7, r10\n\t" "adcs r8, r8, r11\n\t" "adcs r9, r9, r12\n\t" "mov %[a], r9\n\t" - "stm %[s]!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" - "ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "stm %[s]!, {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ldm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" "adcs %[b], %[b], #0\n\t" "adcs %[c], %[c], #0\n\t" "adcs r4, r4, #0\n\t" @@ -8738,7 +8762,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r12, sp\n\t" /* Load bits 252-376 */ "add r12, r12, #28\n\t" - "ldm r12, {%[a], %[b], %[c], r4, r5}\n\t" + "ldm r12, {r1, r2, r3, r4, r5}\n\t" "lsl r5, r5, #4\n\t" "orr r5, r5, r4, lsr #28\n\t" "lsl r4, r4, #4\n\t" @@ -8881,7 +8905,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "sbcs r9, r9, r5\n\t" "sbc %[a], %[a], %[a]\n\t" "sub %[s], %[s], #16\n\t" - "ldm %[s], {%[b], %[c], r4, r5}\n\t" + "ldm %[s], {r2, r3, r4, r5}\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r10, #0x5c\n\t" "lsl r10, r10, #8\n\t" @@ -8983,9 +9007,10 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "str r8, [%[s], #24]\n\t" "str r9, [%[s], #28]\n\t" "add sp, sp, #0x50\n\t" - : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) + : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", + "r12", "lr" ); } diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S index ffbd7b2705..d7225828fa 100644 --- a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./poly1305/poly1305.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S + * ruby ./poly1305/poly1305.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S */ #ifdef 
HAVE_CONFIG_H diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c index 2871293570..da604101b7 100644 --- a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./poly1305/poly1305.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.c + * ruby ./poly1305/poly1305.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.c */ #ifdef HAVE_CONFIG_H @@ -54,7 +55,8 @@ #ifdef HAVE_POLY1305 #include -void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, int notLast_p) +void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, + int notLast_p) { register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; register const byte* m asm ("r1") = (const byte*)m_p; @@ -66,7 +68,7 @@ void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, in "cmp %[len], #0\n\t" "beq L_poly1305_arm32_16_done_%=\n\t" "add lr, sp, #12\n\t" - "stm lr, {%[ctx], %[m], %[len], %[notLast]}\n\t" + "stm lr, {r0, r1, r2, r3}\n\t" /* Get h pointer */ "add lr, %[ctx], #16\n\t" "ldm lr, {r4, r5, r6, r7, r8}\n\t" @@ -187,7 +189,7 @@ void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, in "mov r12, %[ctx]\n\t" "mla r11, %[notLast], %[len], r11\n\t" #else - "ldm %[m], {%[ctx], %[m], %[len], %[notLast]}\n\t" + "ldm %[m], {r0, r1, r2, r3}\n\t" /* r[0] * h[0] */ "umull r10, r11, %[ctx], r4\n\t" /* r[1] * h[0] */ @@ -270,9 +272,11 @@ void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, in "\n" "L_poly1305_arm32_16_done_%=: \n\t" "add sp, sp, #28\n\t" - : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len), [notLast] "+r" (notLast) + : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len), + [notLast] "+r" (notLast) : - : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", + "r10", "r11" ); } @@ -284,7 +288,8 @@ void poly1305_set_key(Poly1305* ctx_p, const byte* key_p) { register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; register const byte* key asm ("r1") = (const byte*)key_p; - register uint32_t* L_poly1305_arm32_clamp_c asm ("r2") = (uint32_t*)&L_poly1305_arm32_clamp; + register uint32_t* L_poly1305_arm32_clamp_c asm ("r2") = + (uint32_t*)&L_poly1305_arm32_clamp; __asm__ __volatile__ ( /* Load mask. */ @@ -318,9 +323,10 @@ void poly1305_set_key(Poly1305* ctx_p, const byte* key_p) "stm lr, {r5, r6, r7, r8, r12}\n\t" /* Zero leftover */ "str r5, [%[ctx], #52]\n\t" - : [ctx] "+r" (ctx), [key] "+r" (key), [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c) + : [ctx] "+r" (ctx), [key] "+r" (key), + [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c) : - : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc" + : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8" ); } @@ -373,9 +379,10 @@ void poly1305_final(Poly1305* ctx_p, byte* mac_p) /* Zero out padding. 
*/ "add r9, %[ctx], #36\n\t" "stm r9, {r4, r5, r6, r7}\n\t" - : [ctx] "+r" (ctx), [mac] "+r" (mac) + : [ctx] "+r" (ctx), [mac] "+r" (mac) : - : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", + "r9" ); } diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S index 14a1ec48f5..bcbf3273a6 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./sha2/sha256.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S + * ruby ./sha2/sha256.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S */ #ifdef HAVE_CONFIG_H diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c index 391075340e..0a2e15e9be 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./sha2/sha256.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.c + * ruby ./sha2/sha256.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.c */ #ifdef HAVE_CONFIG_H @@ -74,13 +75,14 @@ static const uint32_t L_SHA256_transform_len_k[] = { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; -void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); +void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p); void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) { register wc_Sha256* sha256 asm ("r0") = (wc_Sha256*)sha256_p; register const byte* data asm ("r1") = (const byte*)data_p; register word32 len asm ("r2") = (word32)len_p; - register uint32_t* L_SHA256_transform_len_k_c asm ("r3") = (uint32_t*)&L_SHA256_transform_len_k; + register uint32_t* L_SHA256_transform_len_k_c asm ("r3") = + (uint32_t*)&L_SHA256_transform_len_k; __asm__ __volatile__ ( "sub sp, sp, #0xc0\n\t" @@ -1732,9 +1734,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "add %[data], %[data], #0x40\n\t" "bne L_SHA256_transform_len_begin_%=\n\t" "add sp, sp, #0xc0\n\t" - : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), + [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc" + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", + "r12" ); } @@ -1761,13 +1765,14 @@ static const uint32_t L_SHA256_transform_neon_len_k[] = { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; -void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); +void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p); void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) { register wc_Sha256* sha256 asm ("r0") = (wc_Sha256*)sha256_p; register const byte* data asm ("r1") = (const byte*)data_p; register word32 len asm ("r2") = (word32)len_p; - register uint32_t* L_SHA256_transform_neon_len_k_c asm ("r3") = (uint32_t*)&L_SHA256_transform_neon_len_k; + register uint32_t* L_SHA256_transform_neon_len_k_c asm ("r3") = + (uint32_t*)&L_SHA256_transform_neon_len_k; __asm__ __volatile__ ( "sub sp, 
sp, #24\n\t" @@ -2794,9 +2799,12 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "str r10, [sp, #8]\n\t" "bne L_SHA256_transform_neon_len_begin_%=\n\t" "add sp, sp, #24\n\t" - : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), [L_SHA256_transform_neon_len_k] "+r" (L_SHA256_transform_neon_len_k_c) + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), + [L_SHA256_transform_neon_len_k] "+r" (L_SHA256_transform_neon_len_k_c) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "cc" + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", + "r10", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", + "d10", "d11" ); } diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S index 6077a88b3e..7d2c60a89e 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./sha3/sha3.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S + * ruby ./sha3/sha3.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S */ #ifdef HAVE_CONFIG_H diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c index 1a54d8af3a..832aac1cb4 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./sha3/sha3.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.c + * ruby ./sha3/sha3.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.c */ #ifdef HAVE_CONFIG_H @@ -73,7 +74,8 @@ static const uint64_t L_sha3_arm2_neon_rt[] = { void BlockSha3(word64* state_p) { register word64* state asm ("r0") = (word64*)state_p; - register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt; + register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = + (uint64_t*)&L_sha3_arm2_neon_rt; __asm__ __volatile__ ( "sub sp, sp, #16\n\t" @@ -333,9 +335,13 @@ void BlockSha3(word64* state_p) "vst1.8 {d20-d23}, [%[state]]!\n\t" "vst1.8 {d24}, [%[state]]\n\t" "add sp, sp, #16\n\t" - : [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c) + : [state] "+r" (state), + [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c) : - : "memory", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc" + : "memory", "cc", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", + "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", + "d26", "d27", "d28", "d29", "d30", "d31" ); } @@ -361,12 +367,11 @@ static const uint64_t L_sha3_arm2_rt[] = { void BlockSha3(word64* state_p) { register word64* state asm ("r0") = (word64*)state_p; - register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt; - register uint64_t* L_sha3_arm2_rt_c asm ("r2") = (uint64_t*)&L_sha3_arm2_rt; + register uint64_t* L_sha3_arm2_rt_c asm ("r1") = + (uint64_t*)&L_sha3_arm2_rt; __asm__ __volatile__ ( "sub sp, sp, #0xcc\n\t" - "mov r1, %[L_sha3_arm2_rt]\n\t" "mov r2, #12\n\t" "\n" "L_sha3_arm32_begin_%=: \n\t" @@ -2341,9 +2346,10 @@ void 
BlockSha3(word64* state_p) "subs r2, r2, #1\n\t" "bne L_sha3_arm32_begin_%=\n\t" "add sp, sp, #0xcc\n\t" - : [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c) + : [state] "+r" (state), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c) : - : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", + "r9", "r10", "r11" ); } diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S index 4dbfeafad9..1df40cfc86 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S + * ruby ./sha2/sha512.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S */ #ifdef HAVE_CONFIG_H diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c index b59668d12a..eaaa6c7e87 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c @@ -21,7 +21,8 @@ /* Generated using (from wolfssl): * cd ../scripts - * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c + * ruby ./sha2/sha512.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c */ #ifdef HAVE_CONFIG_H @@ -98,13 +99,14 @@ static const uint64_t L_SHA512_transform_len_k[] = { 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL, }; -void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len); +void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p); void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) { register wc_Sha512* sha512 asm ("r0") = (wc_Sha512*)sha512_p; register const byte* data asm ("r1") = (const byte*)data_p; register word32 len asm ("r2") = (word32)len_p; - register uint64_t* L_SHA512_transform_len_k_c asm ("r3") = (uint64_t*)&L_SHA512_transform_len_k; + register uint64_t* L_SHA512_transform_len_k_c asm ("r3") = + (uint64_t*)&L_SHA512_transform_len_k; __asm__ __volatile__ ( "sub sp, sp, #0xc0\n\t" @@ -7601,9 +7603,11 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "bne L_SHA512_transform_len_begin_%=\n\t" "eor r0, r0, r0\n\t" "add sp, sp, #0xc0\n\t" - : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), + [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc" + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", + "r12" ); } @@ -7654,13 +7658,14 @@ static const uint64_t L_SHA512_transform_neon_len_k[] = { 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL, }; -void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len); +void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p); void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) { register wc_Sha512* sha512 asm ("r0") = (wc_Sha512*)sha512_p; register const byte* data asm ("r1") = (const byte*)data_p; register word32 len asm ("r2") = (word32)len_p; - register uint64_t* L_SHA512_transform_neon_len_k_c asm ("r3") = 
(uint64_t*)&L_SHA512_transform_neon_len_k; + register uint64_t* L_SHA512_transform_neon_len_k_c asm ("r3") = + (uint64_t*)&L_SHA512_transform_neon_len_k; __asm__ __volatile__ ( /* Load digest into working vars */ @@ -9151,9 +9156,12 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "subs %[len], %[len], #0x80\n\t" "sub r3, r3, #0x280\n\t" "bne L_SHA512_transform_neon_len_begin_%=\n\t" - : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), [L_SHA512_transform_neon_len_k] "+r" (L_SHA512_transform_neon_len_k_c) + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), + [L_SHA512_transform_neon_len_k] "+r" (L_SHA512_transform_neon_len_k_c) : - : "memory", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc" + : "memory", "cc", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" ); } diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c index b5b516705a..5b1fd5baa4 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha.c +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -166,7 +166,7 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) ret = BAD_FUNC_ARG; } - if (ret == 0) { + if (ret == 0) { ctx->left = 0; wc_chacha_setkey(ctx->X, key, keySz); From bb67069e4a031aa9d45179286a1f578b7b4a88ed Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Mon, 30 Sep 2024 22:05:26 +1000 Subject: [PATCH 09/11] Kyber original: fix to work Encapsulate the message (hash of rand) for original. Final of FIPS 203 uses rand. --- wolfcrypt/src/wc_kyber.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wolfcrypt/src/wc_kyber.c b/wolfcrypt/src/wc_kyber.c index 8e56bcc0e0..a53091c61f 100644 --- a/wolfcrypt/src/wc_kyber.c +++ b/wolfcrypt/src/wc_kyber.c @@ -630,7 +630,11 @@ int wc_KyberKey_EncapsulateWithRandom(KyberKey* key, unsigned char* ct, if (ret == 0) { /* Encapsulate the message using the key and the seed (coins). */ +#ifdef WOLFSSL_KYBER_ORIGINAL + ret = kyberkey_encapsulate(key, msg, kr + KYBER_SYM_SZ, ct); +#else ret = kyberkey_encapsulate(key, rand, kr + KYBER_SYM_SZ, ct); +#endif } #ifdef WOLFSSL_KYBER_ORIGINAL From 65853a41b9f3f28ffd367331279419767bf64c6b Mon Sep 17 00:00:00 2001 From: Daniel Pouzzner Date: Mon, 30 Sep 2024 23:19:49 -0500 Subject: [PATCH 10/11] fixes, coddling, and suppressions for clang-tidy complaints: examples/pem/pem.c: fix stdio stream leaks. src/ssl.c and src/ssl_load.c: suppress concurrency-mt-unsafe around getenv(). getenv() is threadsafe as long as no threads putenv() or setenv(). wolfssl/openssl/asn1.h: add parentheses to fix bugprone-macro-parentheses in ASN1_EX_TEMPLATE_TYPE(), and suppress misfiring bugprone-macro-parentheses around IMPLEMENT_ASN1_FUNCTIONS(). --- examples/pem/pem.c | 7 +++++++ src/ssl.c | 6 +++++- src/ssl_load.c | 8 +++++--- wolfssl/openssl/asn1.h | 8 ++++---- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/examples/pem/pem.c b/examples/pem/pem.c index a58314d6d1..8d2659d401 100644 --- a/examples/pem/pem.c +++ b/examples/pem/pem.c @@ -1024,6 +1024,13 @@ int main(int argc, char* argv[]) if (ret < 0) { fprintf(stderr, "%s\n", wc_GetErrorString(ret)); } + + if (in_file != stdin) + (void)fclose(in_file); + + if (out_file != stdout) + (void)fclose(out_file); + return (ret == 0) ? 
0 : 1; } diff --git a/src/ssl.c b/src/ssl.c index e2e040bfc2..2b47d3c6e0 100644 --- a/src/ssl.c +++ b/src/ssl.c @@ -23984,7 +23984,7 @@ int wolfSSL_RAND_seed(const void* seed, int len) */ const char* wolfSSL_RAND_file_name(char* fname, unsigned long len) { -#if !defined(NO_FILESYSTEM) && defined(XGETENV) +#if !defined(NO_FILESYSTEM) && defined(XGETENV) && !defined(NO_GETENV) char* rt; WOLFSSL_ENTER("wolfSSL_RAND_file_name"); @@ -23995,6 +23995,7 @@ const char* wolfSSL_RAND_file_name(char* fname, unsigned long len) XMEMSET(fname, 0, len); +/* // NOLINTBEGIN(concurrency-mt-unsafe) */ if ((rt = XGETENV("RANDFILE")) != NULL) { if (len > XSTRLEN(rt)) { XMEMCPY(fname, rt, XSTRLEN(rt)); @@ -24004,6 +24005,7 @@ const char* wolfSSL_RAND_file_name(char* fname, unsigned long len) rt = NULL; } } +/* // NOLINTEND(concurrency-mt-unsafe) */ /* $RANDFILE was not set or is too large, check $HOME */ if (rt == NULL) { @@ -24011,6 +24013,7 @@ const char* wolfSSL_RAND_file_name(char* fname, unsigned long len) WOLFSSL_MSG("Environment variable RANDFILE not set"); +/* // NOLINTBEGIN(concurrency-mt-unsafe) */ if ((rt = XGETENV("HOME")) == NULL) { #ifdef XALTHOMEVARNAME if ((rt = XGETENV(XALTHOMEVARNAME)) == NULL) { @@ -24023,6 +24026,7 @@ const char* wolfSSL_RAND_file_name(char* fname, unsigned long len) return NULL; #endif } +/* // NOLINTEND(concurrency-mt-unsafe) */ if (len > XSTRLEN(rt) + XSTRLEN(ap)) { fname[0] = '\0'; diff --git a/src/ssl_load.c b/src/ssl_load.c index f20de2c34d..0361edbdf5 100644 --- a/src/ssl_load.c +++ b/src/ssl_load.c @@ -5099,7 +5099,7 @@ int wolfSSL_CTX_use_RSAPrivateKey(WOLFSSL_CTX* ctx, WOLFSSL_RSA* rsa) int wolfSSL_CTX_set_default_verify_paths(WOLFSSL_CTX* ctx) { int ret; -#ifdef XGETENV +#if defined(XGETENV) && !defined(NO_GETENV) char* certDir = NULL; char* certFile = NULL; word32 flags = 0; @@ -5109,7 +5109,8 @@ int wolfSSL_CTX_set_default_verify_paths(WOLFSSL_CTX* ctx) WOLFSSL_ENTER("wolfSSL_CTX_set_default_verify_paths"); -#ifdef XGETENV +#if defined(XGETENV) && !defined(NO_GETENV) + /* // NOLINTBEGIN(concurrency-mt-unsafe) */ certDir = wc_strdup_ex(XGETENV("SSL_CERT_DIR"), DYNAMIC_TYPE_TMP_BUFFER); certFile = wc_strdup_ex(XGETENV("SSL_CERT_FILE"), DYNAMIC_TYPE_TMP_BUFFER); flags = WOLFSSL_LOAD_FLAG_PEM_CA_ONLY; @@ -5133,6 +5134,7 @@ int wolfSSL_CTX_set_default_verify_paths(WOLFSSL_CTX* ctx) ret = 0; } } + /* // NOLINTEND(concurrency-mt-unsafe) */ else #endif @@ -5157,7 +5159,7 @@ int wolfSSL_CTX_set_default_verify_paths(WOLFSSL_CTX* ctx) #endif } -#ifdef XGETENV +#if defined(XGETENV) && !defined(NO_GETENV) XFREE(certFile, NULL, DYNAMIC_TYPE_TMP_BUFFER); XFREE(certDir, NULL, DYNAMIC_TYPE_TMP_BUFFER); #endif diff --git a/wolfssl/openssl/asn1.h b/wolfssl/openssl/asn1.h index 9ae07986fb..5fbb726c5c 100644 --- a/wolfssl/openssl/asn1.h +++ b/wolfssl/openssl/asn1.h @@ -270,8 +270,8 @@ typedef struct WOLFSSL_ASN1_ITEM WOLFSSL_ASN1_ITEM; (WolfsslAsn1FreeCb)member_type##_free, \ (WolfsslAsn1i2dCb)i2d_##member_type, \ (WolfsslAsn1d2iCb)d2i_##member_type, \ - 0, flags & ASN1_TFLG_TAG_MASK ? tag : -1, 0, \ - !!(flags & ASN1_TFLG_EXPLICIT), TRUE } + 0, (flags) & ASN1_TFLG_TAG_MASK ? 
(tag) : -1, 0, \ + !!((flags) & ASN1_TFLG_EXPLICIT), TRUE } WOLFSSL_API void *wolfSSL_ASN1_item_new(const WOLFSSL_ASN1_ITEM *tpl); WOLFSSL_API void wolfSSL_ASN1_item_free(void *obj, @@ -282,7 +282,7 @@ WOLFSSL_API void* wolfSSL_ASN1_item_d2i(void** dst, const byte **src, long len, const WOLFSSL_ASN1_ITEM* item); /* Need function declaration otherwise compiler complains */ -/* // NOLINTBEGIN(readability-named-parameter) */ +/* // NOLINTBEGIN(readability-named-parameter,bugprone-macro-parentheses) */ #define IMPLEMENT_ASN1_FUNCTIONS(type) \ type *type##_new(void); \ type *type##_new(void){ \ @@ -303,7 +303,7 @@ WOLFSSL_API void* wolfSSL_ASN1_item_d2i(void** dst, const byte **src, long len, return (type*)wolfSSL_ASN1_item_d2i((void**)dst, src, len, \ &type##_template_data); \ } -/* // NOLINTEND(readability-named-parameter) */ +/* // NOLINTEND(readability-named-parameter,bugprone-macro-parentheses) */ #endif /* OPENSSL_ALL */ From 75a676bc7ee2c6f88f107348c695e15f21e67385 Mon Sep 17 00:00:00 2001 From: gojimmypi Date: Tue, 1 Oct 2024 03:19:31 -0700 Subject: [PATCH 11/11] Espressif _thread_local_start and _thread_local_end fix --- wolfcrypt/src/port/Espressif/esp_sdk_mem_lib.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/wolfcrypt/src/port/Espressif/esp_sdk_mem_lib.c b/wolfcrypt/src/port/Espressif/esp_sdk_mem_lib.c index 7cea73bda6..443438f70a 100644 --- a/wolfcrypt/src/port/Espressif/esp_sdk_mem_lib.c +++ b/wolfcrypt/src/port/Espressif/esp_sdk_mem_lib.c @@ -93,8 +93,11 @@ extern wc_ptr_t _heap_start[]; extern wc_ptr_t _heap_end[]; extern wc_ptr_t _rtc_data_start[]; extern wc_ptr_t _rtc_data_end[]; -extern void* _thread_local_start; -extern void* _thread_local_end; + +#if defined(CONFIG_IDF_TARGET_ARCH_XTENSA) && CONFIG_IDF_TARGET_ARCH_XTENSA == 1 + extern void* _thread_local_start; + extern void* _thread_local_end; +#endif /* See https://github.com/esp8266/esp8266-wiki/wiki/Memory-Map */ #define MEM_MAP_IO_START ((void*)(0x3FF00000)) @@ -186,7 +189,9 @@ int sdk_init_meminfo(void) { sdk_log_meminfo(SDK_MEMORY_SEGMENT_COUNT, NULL, NULL); /* print header */ sdk_log_meminfo(mem_map_io, MEM_MAP_IO_START, MEM_MAP_IO_END); +#if defined(CONFIG_IDF_TARGET_ARCH_XTENSA) && CONFIG_IDF_TARGET_ARCH_XTENSA == 1 sdk_log_meminfo(thread_local, _thread_local_start, _thread_local_end); +#endif sdk_log_meminfo(data, _data_start, _data_end); sdk_log_meminfo(user_data_ram, USER_DATA_START, USER_DATA_END); sdk_log_meminfo(bss, _bss_start, _bss_end);
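
A note on the pattern in the last patch: _thread_local_start and _thread_local_end are defined by the linker script, not by any C translation unit, so referencing them only links on targets whose linker script provides them, which is why the patch guards them behind CONFIG_IDF_TARGET_ARCH_XTENSA. The following is a minimal sketch of how such linker symbols are conventionally consumed, written in the incomplete-array style of the neighboring _heap_start[]/_heap_end[] declarations; the helper name and output format are illustrative assumptions, not wolfSSL API.

#include <stdio.h>

/* Provided by the linker script; declaring the symbols as incomplete
 * arrays makes the symbol address itself the usable value. */
extern char _thread_local_start[];
extern char _thread_local_end[];

/* Illustrative helper (hypothetical, not part of the patch). */
void log_tls_segment(void)
{
    /* The symbol addresses mark the segment bounds; their difference
     * is the segment size in bytes. */
    printf(".tls: %p..%p (%ld bytes)\n",
           (void *)_thread_local_start, (void *)_thread_local_end,
           (long)(_thread_local_end - _thread_local_start));
}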