From 24bb2b7fab93ac218f9e68abe07d8cec2bc85648 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 10 Dec 2024 12:47:03 +1000 Subject: [PATCH] Aarch64: make code compile when no hardware crypto avail Detects availability of instructions for Aarch64. --- .wolfssl_known_macro_extras | 2 + configure.ac | 11 +- wolfcrypt/benchmark/benchmark.c | 46 + wolfcrypt/src/aes.c | 193 ++- wolfcrypt/src/cpuid.c | 205 +++- wolfcrypt/src/port/arm/armv8-aes.c | 1600 +++++++++++++------------ wolfcrypt/src/port/arm/armv8-sha256.c | 209 +++- wolfssl/wolfcrypt/aes.h | 62 +- wolfssl/wolfcrypt/cpuid.h | 25 + 9 files changed, 1596 insertions(+), 757 deletions(-) diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index 6ec0d4cade..1394b64d14 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -548,6 +548,7 @@ WOLFCRYPT_FIPS_CORE_DYNAMIC_HASH_VALUE WOLFSENTRY_H WOLFSENTRY_NO_JSON WOLFSSL_32BIT_MILLI_TIME +WOLFSSL_AARCH64_PRIVILEGE_MODE WOLFSSL_AESNI_BY4 WOLFSSL_AESNI_BY6 WOLFSSL_AFTER_DATE_CLOCK_SKEW @@ -906,6 +907,7 @@ __MINGW32__ __MINGW64_VERSION_MAJOR __MINGW64__ __MWERKS__ +__OpenBSD__ __PIE__ __POWERPC__ __PPC__ diff --git a/configure.ac b/configure.ac index ff9b61de20..1216da9eeb 100644 --- a/configure.ac +++ b/configure.ac @@ -2950,6 +2950,7 @@ then fi +ENABLED_ARMASM_CRYPTO="unknown" ENABLED_ARMASM_INLINE="no" ENABLED_ARMASM_SHA3="no" ENABLED_ARMASM_CRYPTO_SM4="no" @@ -2971,6 +2972,9 @@ then inline) ENABLED_ARMASM_INLINE=yes ;; + no-crypto) + ENABLED_ARMASM_CRYPTO=no + ;; sha512-crypto | sha3-crypto) case $host_cpu in *aarch64*) @@ -3046,7 +3050,9 @@ then esac # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" - ENABLED_ARMASM_CRYPTO=yes + if test "$ENABLED_ARMASM_CRYPTO" = "unknown"; then + ENABLED_ARMASM_CRYPTO=yes + fi ENABLED_ARMASM_NEON=yes ENABLED_ARM_64=yes @@ -3147,6 +3153,9 @@ fi if test "$ENABLED_ARMASM_SM4" = "yes"; then AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_CRYPTO_SM4" fi +if test "$ENABLED_ARMASM_CRYPTO" = "unknown"; then + ENABLED_ARMASM_CRYPTO=no +fi if test "$ENABLED_ARMASM_CRYPTO" = "no"; then AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_NO_HW_CRYPTO" fi diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 21b6ff9272..8fb71da2ce 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -229,6 +229,8 @@ #include #endif +#include + #ifdef USE_FLAT_BENCHMARK_H #include "benchmark.h" #else @@ -3939,6 +3941,46 @@ static void* benchmarks_do(void* args) return NULL; } +#if defined(HAVE_CPUID) && defined(WOLFSSL_TEST_STATIC_BUILD) +static void print_cpu_features(void) +{ + word32 cpuid_flags = cpuid_get_flags(); + + printf("CPU: "); +#ifdef HAVE_CPUID_INTEL + printf("Intel"); +#ifdef WOLFSSL_X86_64_BUILD + printf(" x86_64"); +#else + printf(" x86"); +#endif + printf(" -"); + if (IS_INTEL_AVX1(cpuid_flags)) printf(" avx1"); + if (IS_INTEL_AVX2(cpuid_flags)) printf(" avx2"); + if (IS_INTEL_RDRAND(cpuid_flags)) printf(" rdrand"); + if (IS_INTEL_RDSEED(cpuid_flags)) printf(" rdseed"); + if (IS_INTEL_BMI2(cpuid_flags)) printf(" bmi2"); + if (IS_INTEL_AESNI(cpuid_flags)) printf(" aesni"); + if (IS_INTEL_ADX(cpuid_flags)) printf(" adx"); + if (IS_INTEL_MOVBE(cpuid_flags)) printf(" movbe"); + if (IS_INTEL_BMI1(cpuid_flags)) printf(" bmi1"); + if (IS_INTEL_SHA(cpuid_flags)) printf(" sha"); +#endif +#ifdef __aarch64__ + printf("Aarch64 -"); + if (IS_AARCH64_AES(cpuid_flags)) printf(" aes"); + if (IS_AARCH64_PMULL(cpuid_flags)) printf(" pmull"); + if (IS_AARCH64_SHA256(cpuid_flags)) 
printf(" sha256"); + if (IS_AARCH64_SHA512(cpuid_flags)) printf(" sha512"); + if (IS_AARCH64_RDM(cpuid_flags)) printf(" rdm"); + if (IS_AARCH64_SHA3(cpuid_flags)) printf(" sha3"); + if (IS_AARCH64_SM3(cpuid_flags)) printf(" sm3"); + if (IS_AARCH64_SM4(cpuid_flags)) printf(" sm4"); +#endif + printf("\n"); +} +#endif + int benchmark_init(void) { int ret = 0; @@ -3959,6 +4001,10 @@ int benchmark_init(void) return EXIT_FAILURE; } +#if defined(HAVE_CPUID) && defined(WOLFSSL_TEST_STATIC_BUILD) + print_cpu_features(); +#endif + #ifdef HAVE_WC_INTROSPECTION printf("Math: %s\n", wc_GetMathInfo()); #endif diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 7f5e758475..f5d9f65ff0 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -106,7 +106,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits #include #endif -#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) +#if !defined(WOLFSSL_RISCV_ASM) #ifdef WOLFSSL_IMX6_CAAM_BLOB /* case of possibly not using hardware acceleration for AES but using key @@ -787,6 +787,26 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits } #endif /* HAVE_AES_DECRYPT */ +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + + #define NEED_AES_TABLES + + static int checkedCpuIdFlags = 0; + static word32 cpuid_flags = 0; + + static void Check_CPU_support_HwCrypto(Aes* aes) + { + if (checkedCpuIdFlags == 0) { + cpuid_flags = cpuid_get_flags(); + checkedCpuIdFlags = 1; + } + aes->use_aes_hw_crypto = IS_AARCH64_AES(cpuid_flags); + #ifdef HAVE_AESGCM + aes->use_pmull_hw_crypto = IS_AARCH64_PMULL(cpuid_flags); + #endif + } + #elif (defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) \ && !defined(WOLFSSL_QNX_CAAM)) || \ ((defined(WOLFSSL_AFALG) || defined(WOLFSSL_DEVCRYPTO_AES)) && \ @@ -2875,6 +2895,13 @@ static WARN_UNUSED_RESULT int wc_AesEncrypt( printf("Skipping AES-NI\n"); #endif } +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_encrypt_AARCH64(inBlock, outBlock, (byte*)aes->key, + (int)aes->rounds); + return 0; + } #endif /* WOLFSSL_AESNI */ #if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES) AES_ECB_encrypt(aes, inBlock, outBlock, WC_AES_BLOCK_SIZE); @@ -3630,6 +3657,13 @@ static WARN_UNUSED_RESULT int wc_AesDecrypt( printf("Skipping AES-NI\n"); #endif } +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_decrypt_AARCH64(inBlock, outBlock, (byte*)aes->key, + (int)aes->rounds); + return 0; + } #endif /* WOLFSSL_AESNI */ #if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES) return AES_ECB_decrypt(aes, inBlock, outBlock, WC_AES_BLOCK_SIZE); @@ -4580,6 +4614,14 @@ static void AesSetKey_C(Aes* aes, const byte* key, word32 keySz, int dir) } #endif /* WOLFSSL_AESNI */ + #if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + Check_CPU_support_HwCrypto(aes); + if (aes->use_aes_hw_crypto) { + return AES_set_key_AARCH64(userKey, keylen, aes, dir); + } + #endif + #ifdef WOLFSSL_KCAPI_AES XMEMCPY(aes->devKey, userKey, keylen); if (aes->init != 0) { @@ -5777,6 +5819,14 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) } } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_CBC_encrypt_AARCH64(in, out, sz, 
(byte*)aes->reg, + (byte*)aes->key, (int)aes->rounds); + ret = 0; + } + else #endif { ret = 0; @@ -5917,6 +5967,14 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) ret = 0; } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_CBC_decrypt_AARCH64(in, out, sz, (byte*)aes->reg, + (byte*)aes->key, (int)aes->rounds); + ret = 0; + } + else #endif { ret = 0; @@ -6255,6 +6313,14 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) aes->left -= processed; sz -= processed; + #if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_CTR_encrypt_AARCH64(aes, out, in, sz); + return 0; + } + #endif + VECTOR_REGISTERS_PUSH; #if defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT) && \ @@ -6343,7 +6409,7 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) #endif /* NEED_AES_CTR_SOFT */ #endif /* WOLFSSL_AES_COUNTER */ -#endif /* !WOLFSSL_ARMASM && ! WOLFSSL_RISCV_ASM */ +#endif /* !WOLFSSL_RISCV_ASM */ /* @@ -6390,10 +6456,7 @@ static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz) #endif -#ifdef WOLFSSL_ARMASM - /* implementation is located in wolfcrypt/src/port/arm/armv8-aes.c */ - -#elif defined(WOLFSSL_RISCV_ASM) +#if defined(WOLFSSL_RISCV_ASM) /* implemented in wolfcrypt/src/port/risc-v/riscv-64-aes.c */ #elif defined(WOLFSSL_AFALG) @@ -6603,6 +6666,13 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) return ret; #endif /* WOLFSSL_RENESAS_RSIP && WOLFSSL_RENESAS_FSPSM_CRYPTONLY*/ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (ret == 0 && aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + AES_GCM_set_key_AARCH64(aes, iv); + } + else +#endif #if !defined(FREESCALE_LTC_AES_GCM) if (ret == 0) { VECTOR_REGISTERS_PUSH; @@ -7320,6 +7390,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, */ #define GHASH_INIT_EXTRA(aes) WC_DO_NOTHING +#if !defined(__aarch64__) || !defined(WOLFSSL_ARMASM) || \ + defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) /* GHASH one block of data.. * * XOR block into tag and GMULT with H using pre-computed table. @@ -7333,6 +7405,7 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, GMULT(AES_TAG(aes), (aes)->gcm.M0); \ } \ while (0) +#endif #endif /* WOLFSSL_AESGCM_STREAM */ #elif defined(WORD64_AVAILABLE) && !defined(GCM_WORD32) @@ -7928,8 +8001,17 @@ static void GHASH_INIT(Aes* aes) { /* Reset counts of AAD and cipher text. */ aes->aOver = 0; aes->cOver = 0; - /* Extra initialization based on implementation. */ - GHASH_INIT_EXTRA(aes); +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + ; /* Don't do extra initialization. */ + } + else +#endif + { + /* Extra initialization based on implementation. */ + GHASH_INIT_EXTRA(aes); + } } /* Update the GHASH with AAD and/or cipher text. 
@@ -8590,6 +8672,14 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, } } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + AES_GCM_encrypt_AARCH64(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + ret = 0; + } + else #endif /* WOLFSSL_AESNI */ { ret = AES_GCM_encrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, @@ -9174,6 +9264,13 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, } } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + ret = AES_GCM_decrypt_AARCH64(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } + else #endif /* WOLFSSL_AESNI */ { ret = AES_GCM_decrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, @@ -10088,7 +10185,20 @@ int wc_AesGcmInit(Aes* aes, const byte* key, word32 len, const byte* iv, RESTORE_VECTOR_REGISTERS(); } else - #endif + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_GCM_init_AARCH64(aes, iv, ivSz); + + /* Reset state fields. */ + aes->over = 0; + aes->aSz = 0; + aes->cSz = 0; + /* Initialization for GHASH. */ + GHASH_INIT(aes); + } + else + #endif /* WOLFSSL_AESNI */ { ret = AesGcmInit_C(aes, iv, ivSz); } @@ -10214,6 +10324,13 @@ int wc_AesGcmEncryptUpdate(Aes* aes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_GCM_crypt_update_AARCH64(aes, out, in, sz); + GHASH_UPDATE_AARCH64(aes, authIn, authInSz, out, sz); + } + else #endif { /* Encrypt the plaintext. */ @@ -10267,6 +10384,12 @@ int wc_AesGcmEncryptFinal(Aes* aes, byte* authTag, word32 authTagSz) RESTORE_VECTOR_REGISTERS(); } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_GCM_final_AARCH64(aes, authTag, authTagSz); + } + else #endif { ret = AesGcmFinal_C(aes, authTag, authTagSz); @@ -10350,6 +10473,13 @@ int wc_AesGcmDecryptUpdate(Aes* aes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + GHASH_UPDATE_AARCH64(aes, authIn, authInSz, in, sz); + AES_GCM_crypt_update_AARCH64(aes, out, in, sz); + } + else #endif { /* Update the authentication tag with any authentication data and @@ -10401,6 +10531,17 @@ int wc_AesGcmDecryptFinal(Aes* aes, const byte* authTag, word32 authTagSz) RESTORE_VECTOR_REGISTERS(); } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + ALIGN32 byte calcTag[WC_AES_BLOCK_SIZE]; + AES_GCM_final_AARCH64(aes, calcTag, authTagSz); + /* Check calculated tag matches the one passed in. 
*/ + if (ConstantCompare(authTag, calcTag, (int)authTagSz) != 0) { + ret = AES_GCM_AUTH_E; + } + } + else #endif { ALIGN32 byte calcTag[WC_AES_BLOCK_SIZE]; @@ -10677,10 +10818,7 @@ int wc_AesCcmCheckTagSize(int sz) return 0; } -#ifdef WOLFSSL_ARMASM - /* implementation located in wolfcrypt/src/port/arm/armv8-aes.c */ - -#elif defined(WOLFSSL_RISCV_ASM) +#if defined(WOLFSSL_RISCV_ASM) /* implementation located in wolfcrypt/src/port/risc-v/riscv-64-aes.c */ #elif defined(HAVE_COLDFIRE_SEC) @@ -11686,6 +11824,12 @@ static WARN_UNUSED_RESULT int _AesEcbEncrypt( AES_ECB_encrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_encrypt_AARCH64(in, out, (byte*)aes->key, (int)aes->rounds); + } + else #endif { #ifdef NEED_AES_TABLES @@ -11738,6 +11882,12 @@ static WARN_UNUSED_RESULT int _AesEcbDecrypt( AES_ECB_decrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_decrypt_AARCH64(in, out, (byte*)aes->key, (int)aes->rounds); + } + else #endif { #ifdef NEED_AES_TABLES @@ -12838,7 +12988,6 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo #endif /* WOLFSSL_AESNI */ -#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) #ifdef HAVE_AES_ECB /* helper function for encrypting / decrypting full buffer at once */ static WARN_UNUSED_RESULT int _AesXtsHelper( @@ -13100,6 +13249,13 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_XTS_encrypt_AARCH64(xaes, out, in, sz, i); + ret = 0; + } + else #endif { ret = AesXtsEncrypt_sw(xaes, out, in, sz, i); @@ -13533,6 +13689,13 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_XTS_decrypt_AARCH64(xaes, out, in, sz, i); + ret = 0; + } + else #endif { ret = AesXtsDecrypt_sw(xaes, out, in, sz, i); @@ -13730,8 +13893,6 @@ int wc_AesXtsDecryptFinal(XtsAes* xaes, byte* out, const byte* in, word32 sz, #endif /* WOLFSSL_AESXTS_STREAM */ -#endif /* !WOLFSSL_ARMASM || WOLFSSL_ARMASM_NO_HW_CRYPTO */ - /* Same as wc_AesXtsEncryptSector but the sector gets incremented by one every * sectorSz bytes * diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index 67223860c8..2e63a092bf 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -28,7 +28,8 @@ #include -#if defined(HAVE_CPUID) || defined(HAVE_CPUID_INTEL) +#if defined(HAVE_CPUID) || defined(HAVE_CPUID_INTEL) || \ + defined(HAVE_CPUID_AARCH64) static word32 cpuid_check = 0; static word32 cpuid_flags = 0; #endif @@ -101,6 +102,208 @@ cpuid_check = 1; } } +#elif defined(HAVE_CPUID_AARCH64) + +#define CPUID_AARCH64_FEAT_AES ((word64)1 << 4) +#define CPUID_AARCH64_FEAT_PMULL ((word64)1 << 5) +#define CPUID_AARCH64_FEAT_SHA256 ((word64)1 << 12) +#define CPUID_AARCH64_FEAT_SHA256_512 ((word64)1 << 13) +#define CPUID_AARCH64_FEAT_RDM ((word64)1 << 28) +#define CPUID_AARCH64_FEAT_SHA3 ((word64)1 << 32) +#define CPUID_AARCH64_FEAT_SM3 ((word64)1 << 36) +#define CPUID_AARCH64_FEAT_SM4 
((word64)1 << 40) + +#ifdef WOLFSSL_AARCH64_PRIVILEGE_MODE + /* https://developer.arm.com/documentation/ddi0601/2024-09/AArch64-Registers + * /ID-AA64ISAR0-EL1--AArch64-Instruction-Set-Attribute-Register-0 */ + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + word64 features; + + __asm__ __volatile ( + "mrs %[feat], ID_AA64ISAR0_EL1\n" + : [feat] "=r" (features) + : + : + ); + + if (features & CPUID_AARCH64_FEAT_AES) + cpuid_flags |= CPUID_AES; + if (features & CPUID_AARCH64_FEAT_PMULL) + cpuid_flags |= CPUID_PMULL; + if (features & CPUID_AARCH64_FEAT_SHA256) + cpuid_flags |= CPUID_SHA256; + if (features & CPUID_AARCH64_FEAT_SHA256_512) + cpuid_flags |= CPUID_SHA256 | CPUID_SHA512; + if (features & CPUID_AARCH64_FEAT_RDM) + cpuid_flags |= CPUID_RDM; + if (features & CPUID_AARCH64_FEAT_SHA3) + cpuid_flags |= CPUID_SHA3; + if (features & CPUID_AARCH64_FEAT_SM3) + cpuid_flags |= CPUID_SM3; + if (features & CPUID_AARCH64_FEAT_SM4) + cpuid_flags |= CPUID_SM4; + + cpuid_check = 1; + } + } +#elif defined(__linux__) + /* https://community.arm.com/arm-community-blogs/b/operating-systems-blog/ + * posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu */ + + #include + #include + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + word64 hwcaps = getauxval(AT_HWCAP); + + if (hwcaps & HWCAP_AES) + cpuid_flags |= CPUID_AES; + if (hwcaps & HWCAP_PMULL) + cpuid_flags |= CPUID_PMULL; + if (hwcaps & HWCAP_SHA2) + cpuid_flags |= CPUID_SHA256; + if (hwcaps & HWCAP_SHA512) + cpuid_flags |= CPUID_SHA512; + if (hwcaps & HWCAP_ASIMDRDM) + cpuid_flags |= CPUID_RDM; + if (hwcaps & HWCAP_SHA3) + cpuid_flags |= CPUID_SHA3; + if (hwcaps & HWCAP_SM3) + cpuid_flags |= CPUID_SM3; + if (hwcaps & HWCAP_SM4) + cpuid_flags |= CPUID_SM4; + + cpuid_check = 1; + } + } +#elif defined(__ANDROID__) || defined(ANDROID) + /* https://community.arm.com/arm-community-blogs/b/operating-systems-blog/ + * posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu */ + + #include "cpu-features.h" + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + word64 features = android_getCpuFeatures(); + + if (features & ANDROID_CPU_ARM_FEATURE_AES) + cpuid_flags |= CPUID_AES; + if (features & ANDROID_CPU_ARM_FEATURE_PMULL) + cpuid_flags |= CPUID_PMULL; + if (features & ANDROID_CPU_ARM_FEATURE_SHA2) + cpuid_flags |= CPUID_SHA256; + + cpuid_check = 1; + } + } +#elif defined(__APPLE__) + /* https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/ + * determining_instruction_set_characteristics */ + + #include + + static word64 cpuid_get_sysctlbyname(const char* name) + { + word64 ret = 0; + size_t size = sizeof(ret); + + sysctlbyname(name, &ret, &size, NULL, 0); + + return ret; + } + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_AES") != 0) + cpuid_flags |= CPUID_AES; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_PMULL") != 0) + cpuid_flags |= CPUID_PMULL; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA256") != 0) + cpuid_flags |= CPUID_SHA256; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA512") != 0) + cpuid_flags |= CPUID_SHA512; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_RDM") != 0) + cpuid_flags |= CPUID_RDM; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA3") != 0) + cpuid_flags |= CPUID_SHA3; + #ifdef WOLFSSL_ARMASM_CRYPTO_SM3 + cpuid_flags |= CPUID_SM3; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM4 + cpuid_flags |= CPUID_SM4; + #endif + + cpuid_check = 1; + } + } +#elif defined(__FreeBSD__) || defined(__OpenBSD__) + /* 
https://man.freebsd.org/cgi/man.cgi?elf_aux_info(3) */ + + #include + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + word64 features = 0; + + elf_aux_info(AT_HWCAP, &features, sizeof(features)); + + if (features & CPUID_AARCH64_FEAT_AES) + cpuid_flags |= CPUID_AES; + if (features & CPUID_AARCH64_FEAT_PMULL) + cpuid_flags |= CPUID_PMULL; + if (features & CPUID_AARCH64_FEAT_SHA256) + cpuid_flags |= CPUID_SHA256; + if (features & CPUID_AARCH64_FEAT_SHA256_512) + cpuid_flags |= CPUID_SHA256 | CPUID_SHA512; + if (features & CPUID_AARCH64_FEAT_RDM) + cpuid_flags |= CPUID_RDM; + if (features & CPUID_AARCH64_FEAT_SHA3) + cpuid_flags |= CPUID_SHA3; + if (features & CPUID_AARCH64_FEAT_SM3) + cpuid_flags |= CPUID_SM3; + if (features & CPUID_AARCH64_FEAT_SM4) + cpuid_flags |= CPUID_SM4; + + cpuid_check = 1; + } + } +#else + void cpuid_set_flags(void) + { + if (!cpuid_check) { + + #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + cpuid_flags |= CPUID_AES; + cpuid_flags |= CPUID_PMULL; + cpuid_flags |= CPUID_SHA256; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 + cpuid_flags |= CPUID_SHA512; + #endif + #ifndef WOLFSSL_AARCH64_NO_SQRMLSH + cpuid_flags |= CPUID_RDM; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + cpuid_flags |= CPUID_SHA3; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM3 + cpuid_flags |= CPUID_SM3; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM4 + cpuid_flags |= CPUID_SM4; + #endif + cpuid_check = 1; + } + } +#endif #elif defined(HAVE_CPUID) void cpuid_set_flags(void) { diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 0eca6775e8..9ae90e8cfa 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -175,48 +175,20 @@ static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz) #endif /* HAVE_AESGCM */ -/* Similar to wolfSSL software implementation of expanding the AES key. - * Changed out the locations of where table look ups where made to - * use hardware instruction. Also altered decryption key to match. 
*/ -int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, - const byte* iv, int dir) +int AES_set_key_AARCH64(const unsigned char *userKey, const int keylen, + Aes* aes, int dir) { word32 temp; - word32 *rk; + word32* rk = aes->key; unsigned int i = 0; -#if defined(AES_MAX_KEY_SIZE) - const word32 max_key_len = (AES_MAX_KEY_SIZE / 8); -#endif - - if (!((keylen == 16) || (keylen == 24) || (keylen == 32)) || - aes == NULL || userKey == NULL) - return BAD_FUNC_ARG; - - rk = aes->key; -#if defined(AES_MAX_KEY_SIZE) - /* Check key length */ - if (keylen > max_key_len) { - return BAD_FUNC_ARG; - } -#endif - - #if defined(WOLFSSL_AES_COUNTER) || defined(WOLFSSL_AES_CFB) || \ - defined(WOLFSSL_AES_OFB) || defined(WOLFSSL_AES_XTS) - aes->left = 0; - #endif /* WOLFSSL_AES_COUNTER */ - - aes->keylen = keylen; - aes->rounds = keylen/4 + 6; XMEMCPY(rk, userKey, keylen); - switch(keylen) - { + switch (keylen) { #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128 && \ defined(WOLFSSL_AES_128) case 16: - while (1) - { + while (1) { temp = rk[3]; SBOX(temp); temp = rotrFixed(temp, 8); @@ -235,8 +207,7 @@ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, defined(WOLFSSL_AES_192) case 24: /* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */ - while (1) - { + while (1) { temp = rk[5]; SBOX(temp); temp = rotrFixed(temp, 8); @@ -256,8 +227,7 @@ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256 && \ defined(WOLFSSL_AES_256) case 32: - while (1) - { + while (1) { temp = rk[7]; SBOX(temp); temp = rotrFixed(temp, 8); @@ -283,8 +253,7 @@ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, return BAD_FUNC_ARG; } - if (dir == AES_DECRYPTION) - { + if (dir == AES_DECRYPTION) { #ifdef HAVE_AES_DECRYPT unsigned int j; rk = aes->key; @@ -308,9 +277,10 @@ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, #endif /* HAVE_AES_DECRYPT */ } - return wc_AesSetIV(aes, iv); + return 0; } +#ifndef __aarch64__ #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen, const byte* iv, int dir) @@ -332,587 +302,521 @@ int wc_AesSetIV(Aes* aes, const byte* iv) return 0; } - +#endif #ifdef __aarch64__ /* AES CCM/GCM use encrypt direct but not decrypt */ #if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ - defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) - static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock) - { - word32* keyPt = aes->key; + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_CBC) - /* - AESE exor's input with round key - shift rows of exor'ed result - sub bytes for shifted rows - */ +void AES_encrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr) +{ + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + */ - __asm__ __volatile__ ( - "LD1 {v0.16b}, [%[CtrIn]] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - - "LD1 
{v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - - "#subtract rounds done so far and see if should continue\n" - "MOV w12, %w[R] \n" - "SUB w12, w12, #10 \n" - "CBZ w12, 1f \n" - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - - "SUB w12, w12, #2 \n" - "CBZ w12, 1f \n" - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" + __asm__ __volatile__ ( + "LD1 {v0.16b}, [%[in]] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" - "#Final AddRoundKey then store result \n" - "1: \n" - "LD1 {v1.2d}, [%[Key]], #16 \n" - "EOR v0.16b, v0.16b, v1.16b \n" - "ST1 {v0.16b}, [%[CtrOut]] \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" - :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds), - "=r" (inBlock) - :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds), - [CtrIn] "3" (inBlock) - : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" - ); + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" - return 0; - } -#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */ -#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) - #ifdef HAVE_AES_DECRYPT - static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) - { - word32* keyPt = aes->key; + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" - /* - AESE exor's input with round key - shift rows of exor'ed result - sub bytes for shifted rows - */ + "#subtract rounds done so far and see if should continue\n" + "MOV w12, %w[nr] \n" + "SUB w12, w12, #10 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" - __asm__ __volatile__ ( - "LD1 {v0.16b}, [%[CtrIn]] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - - "#subtract rounds done so far and see if should continue\n" - "MOV w12, %w[R] \n" - "SUB w12, w12, #10 \n" - "CBZ w12, 1f \n" - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - - "SUB w12, w12, #2 \n" - "CBZ w12, 1f \n" - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" + "SUB w12, w12, #2 \n" + "CBZ w12, 1f \n" + 
"LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" - "#Final AddRoundKey then store result \n" - "1: \n" - "LD1 {v1.2d}, [%[Key]], #16 \n" - "EOR v0.16b, v0.16b, v1.16b \n" - "ST1 {v0.4s}, [%[CtrOut]] \n" + "#Final AddRoundKey then store result \n" + "1: \n" + "LD1 {v1.2d}, [%[key]], #16 \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "ST1 {v0.16b}, [%[out]] \n" - :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds), - "=r" (inBlock) - :[Key] "1" (aes->key), "0" (outBlock), [R] "2" (aes->rounds), - [CtrIn] "3" (inBlock) - : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" - ); + : [key] "+r" (key) + : [in] "r" (inBlock), [out] "r" (outBlock), [nr] "r" (nr) + : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" + ); +} +#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */ +#if !defined(WC_AES_BITSLICED) || defined(WOLFSSL_AES_DIRECT) || \ + defined(WOLFSSL_AES_COUNTER) +#ifdef HAVE_AES_DECRYPT +void AES_decrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr) +{ + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + */ - return 0; + __asm__ __volatile__ ( + "LD1 {v0.16b}, [%[in]] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "#subtract rounds done so far and see if should continue\n" + "MOV w12, %w[nr] \n" + "SUB w12, w12, #10 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "SUB w12, w12, #2 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "#Final AddRoundKey then store result \n" + "1: \n" + "LD1 {v1.2d}, [%[key]], #16 \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "ST1 {v0.4s}, [%[out]] \n" + + : [key] "+r" (key) + : [in] "r" (inBlock), [out] "r" (outBlock), [nr] "r" (nr) + : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" + ); } - #endif /* HAVE_AES_DECRYPT */ +#endif /* HAVE_AES_DECRYPT */ #endif /* DIRECT or COUNTER */ /* AES-CBC */ #ifdef HAVE_AES_CBC - int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) - { - word32 numBlocks = sz / AES_BLOCK_SIZE; - - if (aes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - - if (sz == 0) { - return 0; - } - -#ifdef WOLFSSL_AES_CBC_LENGTH_CHECKS - if (sz % AES_BLOCK_SIZE) { - return BAD_LENGTH_E; - } -#endif +void AES_CBC_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg, + byte* key, int rounds) +{ + word32 numBlocks = sz / AES_BLOCK_SIZE; - /* do as many block size ops as possible */ - if (numBlocks > 0) { - word32* key = aes->key; - word32* reg = aes->reg; - /* - AESE exor's input with round key - shift rows of exor'ed result + /* + AESE exor's input with round key + shift rows of 
exor'ed result sub bytes for shifted rows - note: grouping AESE & AESMC together as pairs reduces latency - */ - switch(aes->rounds) { + note: grouping AESE & AESMC together as pairs reduces latency + */ + switch (rounds) { #ifdef WOLFSSL_AES_128 - case 10: /* AES 128 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n" - "LD1 {v0.2d}, [%[reg]] \n" - - "LD1 {v12.2d}, [%[input]], #16 \n" - "1:\n" - "#CBC operations, xorbuf in with current aes->reg \n" - "EOR v0.16b, v0.16b, v12.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "SUB w11, w11, #1 \n" - "EOR v0.16b, v0.16b, v11.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2:\n" - "#store current counter value at the end \n" - "ST1 {v0.2d}, [%[regOut]] \n" - - :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) - :"0" (out), [Key] "r" (key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "1" (reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" - ); - break; + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v11.2d},[%[key]], #48 \n" + "LD1 {v0.2d}, [%[reg]] \n" + + "LD1 {v12.2d}, [%[in]], #16 \n" + "1:\n" + "#CBC operations, xorbuf in with current reg \n" + "EOR v0.16b, v0.16b, v12.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "SUB w11, w11, #1 \n" + "EOR v0.16b, v0.16b, v11.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v12.2d}, [%[in]], #16 \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" + ); + break; #endif /* WOLFSSL_AES_128 */ #ifdef WOLFSSL_AES_192 - case 12: /* AES 192 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" - "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" - "LD1 {v9.2d-v12.2d},%[Key], #64 \n" - "LD1 {v13.2d}, %[Key], #16 \n" - "LD1 {v0.2d}, %[reg] \n" - - "LD1 {v14.2d}, [%[input]], #16 \n" - "1:\n" - "#CBC operations, xorbuf in with current aes->reg \n" - "EOR v0.16b, v0.16b, v14.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE 
v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v14.2d}, [%[input]], #16\n" - "B 1b \n" - - "2:\n" - "#store current counter value at the end \n" - "ST1 {v0.2d}, %[regOut] \n" - - - :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) - :"0" (out), [Key] "m" (aes->key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "m" (aes->reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ); - break; + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v12.2d},[%[key]], #64 \n" + "LD1 {v13.2d}, [%[key]], #16 \n" + "LD1 {v0.2d}, [%[reg]] \n" + + "LD1 {v14.2d}, [%[in]], #16 \n" + "1:\n" + "#CBC operations, xorbuf in with current reg \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v14.2d}, [%[in]], #16\n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + break; #endif /* WOLFSSL_AES_192*/ #ifdef WOLFSSL_AES_256 - case 14: /* AES 256 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" - - "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" - "LD1 {v9.2d-v12.2d}, %[Key], #64 \n" - "LD1 {v13.2d-v15.2d}, %[Key], #48 \n" - "LD1 {v0.2d}, %[reg] \n" - - "LD1 {v16.2d}, [%[input]], #16 \n" - "1: \n" - "#CBC operations, xorbuf in with current aes->reg \n" - "EOR v0.16b, v0.16b, v16.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE 
v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v13.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v14.16b \n" - "EOR v0.16b, v0.16b, v15.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v16.2d}, [%[input]], #16 \n" - "B 1b \n" + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" - "2: \n" - "#store current counter value at the end \n" - "ST1 {v0.2d}, %[regOut] \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v12.2d}, [%[key]], #64 \n" + "LD1 {v13.2d-v15.2d}, [%[key]], #48 \n" + "LD1 {v0.2d}, [%[reg]] \n" + "LD1 {v16.2d}, [%[in]], #16 \n" + "1: \n" + "#CBC operations, xorbuf in with current reg \n" + "EOR v0.16b, v0.16b, v16.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v16.2d}, [%[in]], #16 \n" + "B 1b \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) - :"0" (out), [Key] "m" (aes->key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "m" (aes->reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", - "v16" - ); - break; + "2: \n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", + "v16" + ); + break; #endif /* WOLFSSL_AES_256 */ - default: - WOLFSSL_MSG("Bad AES-CBC round value"); - return BAD_FUNC_ARG; - } - } - - return 0; } +} - #ifdef HAVE_AES_DECRYPT - int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) - { - word32 numBlocks = sz / AES_BLOCK_SIZE; - - if (aes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - - if (sz == 0) { - return 0; - } - - if (sz % AES_BLOCK_SIZE) { -#ifdef WOLFSSL_AES_CBC_LENGTH_CHECKS - return BAD_LENGTH_E; -#else - return BAD_FUNC_ARG; -#endif - } - - /* do as many block size ops as possible */ - if (numBlocks > 0) { - word32* key = aes->key; - word32* reg = aes->reg; +#ifdef HAVE_AES_DECRYPT +void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, + byte* reg, byte* key, int rounds) +{ + word32 numBlocks = sz / AES_BLOCK_SIZE; - switch(aes->rounds) { + switch (rounds) { #ifdef WOLFSSL_AES_128 - case 10: /* AES 128 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "LD1 {v5.2d-v8.2d}, 
[%[Key]], #64 \n" - "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n" - "LD1 {v13.2d}, [%[reg]] \n" - - "1:\n" - "LD1 {v0.2d}, [%[input]], #16 \n" - "MOV v12.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v5.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v6.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v7.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v8.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v9.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n" - - "EOR v0.16b, v0.16b, v13.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v13.16b, v12.16b \n" - - "CBZ w11, 2f \n" - "B 1b \n" + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v11.2d},[%[key]], #48 \n" + "LD1 {v13.2d}, [%[reg]] \n" + + "1:\n" + "LD1 {v0.2d}, [%[in]], #16 \n" + "MOV v12.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n" + + "EOR v0.16b, v0.16b, v13.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v13.16b, v12.16b \n" + + "CBZ w11, 2f \n" + "B 1b \n" - "2: \n" - "#store current counter value at the end \n" - "ST1 {v13.2d}, [%[regOut]] \n" + "2: \n" + "#store current counter value at the end \n" + "ST1 {v13.2d}, [%[reg]] \n" - :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) - :"0" (out), [Key] "r" (key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "1" (reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" - ); - break; + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" + ); + break; #endif /* WOLFSSL_AES_128 */ #ifdef WOLFSSL_AES_192 - case 12: /* AES 192 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "LD1 {v9.2d-v12.2d},[%[Key]], #64 \n" - "LD1 {v13.16b}, [%[Key]], #16 \n" - "LD1 {v15.2d}, [%[reg]] \n" - - "LD1 {v0.2d}, [%[input]], #16 \n" - "1: \n" - "MOV v14.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v5.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v6.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v7.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v8.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v9.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v10.16b \n" - "AESIMC v0.16b, v0.16b \n" - 
"AESD v0.16b, v11.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n" - - "EOR v0.16b, v0.16b, v15.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v15.16b, v14.16b \n" - - "CBZ w11, 2f \n" - "LD1 {v0.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2:\n" - "#store current counter value at the end \n" - "ST1 {v15.2d}, [%[regOut]] \n" - - :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) - :"0" (out), [Key] "r" (key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "1" (reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - ); - break; + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v12.2d},[%[key]], #64 \n" + "LD1 {v13.16b}, [%[key]], #16 \n" + "LD1 {v15.2d}, [%[reg]] \n" + + "LD1 {v0.2d}, [%[in]], #16 \n" + "1: \n" + "MOV v14.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v11.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n" + + "EOR v0.16b, v0.16b, v15.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v14.16b \n" + + "CBZ w11, 2f \n" + "LD1 {v0.2d}, [%[in]], #16 \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v15.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); + break; #endif /* WOLFSSL_AES_192 */ #ifdef WOLFSSL_AES_256 - case 14: /* AES 256 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" - "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" - "LD1 {v17.2d}, [%[reg]] \n" - - "LD1 {v0.2d}, [%[input]], #16 \n" - "1: \n" - "MOV v16.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v5.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v6.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v7.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v8.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v9.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v10.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v11.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v12.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v13.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v14.16b \n" - "EOR v0.16b, v0.16b, v15.16b \n" - - "EOR v0.16b, v0.16b, v17.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v17.16b, v16.16b 
\n" - - "CBZ w11, 2f \n" - "LD1 {v0.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2:\n" - "#store current counter value at the end \n" - "ST1 {v17.2d}, [%[regOut]] \n" - - :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) - :"0" (out), [Key] "r" (key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "1" (reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", - "v16", "v17" - ); - break; + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v12.2d}, [%[key]], #64 \n" + "LD1 {v13.2d-v15.2d}, [%[key]], #48 \n" + "LD1 {v17.2d}, [%[reg]] \n" + + "LD1 {v0.2d}, [%[in]], #16 \n" + "1: \n" + "MOV v16.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v11.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v12.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v13.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + + "EOR v0.16b, v0.16b, v17.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v17.16b, v16.16b \n" + + "CBZ w11, 2f \n" + "LD1 {v0.2d}, [%[in]], #16 \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v17.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", + "v16", "v17" + ); + break; #endif /* WOLFSSL_AES_256 */ - default: - WOLFSSL_MSG("Bad AES-CBC round value"); - return BAD_FUNC_ARG; - } - } - - return 0; } - #endif +} +#endif #endif /* HAVE_AES_CBC */ @@ -1420,40 +1324,11 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in, } } -int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) +void AES_CTR_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz) { byte* tmp; word32 numBlocks; - if (aes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - switch(aes->rounds) { - #ifdef WOLFSSL_AES_128 - case 10: /* AES 128 BLOCK */ - #endif /* WOLFSSL_AES_128 */ - #ifdef WOLFSSL_AES_192 - case 12: /* AES 192 BLOCK */ - #endif /* WOLFSSL_AES_192 */ - #ifdef WOLFSSL_AES_256 - case 14: /* AES 256 BLOCK */ - #endif /* WOLFSSL_AES_256 */ - break; - default: - WOLFSSL_MSG("Bad AES-CTR round value"); - return BAD_FUNC_ARG; - } - - - tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left; - - /* consume any unused bytes left in aes->tmp */ - while ((aes->left != 0) && (sz != 0)) { - *(out++) = *(in++) ^ *(tmp++); - aes->left--; - sz--; - } - /* do as many block size ops as possible */ numBlocks = sz / AES_BLOCK_SIZE; if (numBlocks > 0) { @@ -1478,14 +1353,6 @@ int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) aes->left--; } } - return 0; -} - -int wc_AesCtrSetKey(Aes* aes, const byte* key, word32 len, 
- const byte* iv, int dir) -{ - (void)dir; - return wc_AesSetKey(aes, key, len, iv, AES_ENCRYPTION); } #endif /* WOLFSSL_AES_COUNTER */ @@ -1500,7 +1367,7 @@ int wc_AesCtrSetKey(Aes* aes, const byte* key, word32 len, /* PMULL and RBIT only with AArch64 */ /* Use ARM hardware for polynomial multiply */ -void GMULT(byte* X, byte* Y) +void GMULT_AARCH64(byte* X, byte* Y) { __asm__ volatile ( "LD1 {v0.16b}, [%[X]] \n" @@ -1532,7 +1399,7 @@ void GMULT(byte* X, byte* Y) ); } -void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, +static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz, byte* s, word32 sSz) { byte scratch[AES_BLOCK_SIZE]; @@ -1899,12 +1766,291 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, "v8", "v9", "v10", "v11", "v12", "v13", "v14" ); - XMEMCPY(s, scratch, sSz); + XMEMCPY(s, scratch, sSz); +} + +#ifdef WOLFSSL_AESGCM_STREAM + /* Access initialization counter data. */ + #define AES_INITCTR(aes) ((aes)->streamData + 0 * AES_BLOCK_SIZE) + /* Access counter data. */ + #define AES_COUNTER(aes) ((aes)->streamData + 1 * AES_BLOCK_SIZE) + /* Access tag data. */ + #define AES_TAG(aes) ((aes)->streamData + 2 * AES_BLOCK_SIZE) + /* Access last GHASH block. */ + #define AES_LASTGBLOCK(aes) ((aes)->streamData + 3 * AES_BLOCK_SIZE) + /* Access last encrypted block. */ + #define AES_LASTBLOCK(aes) ((aes)->streamData + 4 * AES_BLOCK_SIZE) + +/* GHASH one block of data. + * + * XOR block into tag and GMULT with H. + * + * @param [in, out] aes AES GCM object. + * @param [in] block Block of AAD or cipher text. + */ +#define GHASH_ONE_BLOCK_AARCH64(aes, block) \ + do { \ + xorbuf(AES_TAG(aes), block, AES_BLOCK_SIZE); \ + GMULT_AARCH64(AES_TAG(aes), aes->gcm.H); \ + } \ + while (0) + +/* Hash in the lengths of the AAD and cipher text in bits. + * + * Default implementation. + * + * @param [in, out] aes AES GCM object. + */ +#define GHASH_LEN_BLOCK_AARCH64(aes) \ + do { \ + byte scratch[AES_BLOCK_SIZE]; \ + FlattenSzInBits(&scratch[0], aes->aSz); \ + FlattenSzInBits(&scratch[8], aes->cSz); \ + GHASH_ONE_BLOCK_AARCH64(aes, scratch); \ + } \ + while (0) + +/* Update the GHASH with AAD and/or cipher text. + * + * @param [in,out] aes AES GCM object. + * @param [in] a Additional authentication data buffer. + * @param [in] aSz Size of data in AAD buffer. + * @param [in] c Cipher text buffer. + * @param [in] cSz Size of data in cipher text buffer. + */ +void GHASH_UPDATE_AARCH64(Aes* aes, const byte* a, word32 aSz, const byte* c, + word32 cSz) +{ + word32 blocks; + word32 partial; + + /* Hash in A, the Additional Authentication Data */ + if (aSz != 0 && a != NULL) { + /* Update count of AAD we have hashed. */ + aes->aSz += aSz; + /* Check if we have unprocessed data. */ + if (aes->aOver > 0) { + /* Calculate amount we can use - fill up the block. */ + byte sz = AES_BLOCK_SIZE - aes->aOver; + if (sz > aSz) { + sz = aSz; + } + /* Copy extra into last GHASH block array and update count. */ + XMEMCPY(AES_LASTGBLOCK(aes) + aes->aOver, a, sz); + aes->aOver += sz; + if (aes->aOver == AES_BLOCK_SIZE) { + /* We have filled up the block and can process. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Reset count. */ + aes->aOver = 0; + } + /* Used up some data. */ + aSz -= sz; + a += sz; + } + + /* Calculate number of blocks of AAD and the leftover. */ + blocks = aSz / AES_BLOCK_SIZE; + partial = aSz % AES_BLOCK_SIZE; + /* GHASH full blocks now. 
*/ + while (blocks--) { + GHASH_ONE_BLOCK_AARCH64(aes, a); + a += AES_BLOCK_SIZE; + } + if (partial != 0) { + /* Cache the partial block. */ + XMEMCPY(AES_LASTGBLOCK(aes), a, partial); + aes->aOver = (byte)partial; + } + } + if (aes->aOver > 0 && cSz > 0 && c != NULL) { + /* No more AAD coming and we have a partial block. */ + /* Fill the rest of the block with zeros. */ + byte sz = AES_BLOCK_SIZE - aes->aOver; + XMEMSET(AES_LASTGBLOCK(aes) + aes->aOver, 0, sz); + /* GHASH last AAD block. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Clear partial count for next time through. */ + aes->aOver = 0; + } + + /* Hash in C, the Ciphertext */ + if (cSz != 0 && c != NULL) { + /* Update count of cipher text we have hashed. */ + aes->cSz += cSz; + if (aes->cOver > 0) { + /* Calculate amount we can use - fill up the block. */ + byte sz = AES_BLOCK_SIZE - aes->cOver; + if (sz > cSz) { + sz = cSz; + } + XMEMCPY(AES_LASTGBLOCK(aes) + aes->cOver, c, sz); + /* Update count of unused encrypted counter. */ + aes->cOver += sz; + if (aes->cOver == AES_BLOCK_SIZE) { + /* We have filled up the block and can process. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Reset count. */ + aes->cOver = 0; + } + /* Used up some data. */ + cSz -= sz; + c += sz; + } + + /* Calculate number of blocks of cipher text and the leftover. */ + blocks = cSz / AES_BLOCK_SIZE; + partial = cSz % AES_BLOCK_SIZE; + /* GHASH full blocks now. */ + while (blocks--) { + GHASH_ONE_BLOCK_AARCH64(aes, c); + c += AES_BLOCK_SIZE; + } + if (partial != 0) { + /* Cache the partial block. */ + XMEMCPY(AES_LASTGBLOCK(aes), c, partial); + aes->cOver = (byte)partial; + } + } +} + +/* Finalize the GHASH calculation. + * + * Complete hashing cipher text and hash the AAD and cipher text lengths. + * + * @param [in, out] aes AES GCM object. + * @param [out] s Authentication tag. + * @param [in] sSz Size of authentication tag required. + */ +static void GHASH_FINAL_AARCH64(Aes* aes, byte* s, word32 sSz) +{ + /* AAD block incomplete when > 0 */ + byte over = aes->aOver; + + if (aes->cOver > 0) { + /* Cipher text block incomplete. */ + over = aes->cOver; + } + if (over > 0) { + /* Zeroize the unused part of the block. */ + XMEMSET(AES_LASTGBLOCK(aes) + over, 0, AES_BLOCK_SIZE - over); + /* Hash the last block of cipher text. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + } + /* Hash in the lengths of AAD and cipher text in bits */ + GHASH_LEN_BLOCK_AARCH64(aes); + /* Copy the result into s. */ + XMEMCPY(s, AES_TAG(aes), sSz); +} + +void AES_GCM_init_AARCH64(Aes* aes, const byte* iv, word32 ivSz) +{ + ALIGN32 byte counter[AES_BLOCK_SIZE]; + + if (ivSz == GCM_NONCE_MID_SZ) { + /* Counter is IV with bottom 4 bytes set to: 0x00,0x00,0x00,0x01. */ + XMEMCPY(counter, iv, ivSz); + XMEMSET(counter + GCM_NONCE_MID_SZ, 0, + AES_BLOCK_SIZE - GCM_NONCE_MID_SZ - 1); + counter[AES_BLOCK_SIZE - 1] = 1; + } + else { + /* Counter is GHASH of IV. */ + #ifdef OPENSSL_EXTRA + word32 aadTemp = aes->gcm.aadLen; + aes->gcm.aadLen = 0; + #endif + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + #ifdef OPENSSL_EXTRA + aes->gcm.aadLen = aadTemp; + #endif + } + + /* Copy in the counter for use with cipher. */ + XMEMCPY(AES_COUNTER(aes), counter, AES_BLOCK_SIZE); + /* Encrypt initial counter into a buffer for GCM. 
*/ + AES_encrypt_AARCH64(counter, AES_INITCTR(aes), (byte*)aes->key, + (int)aes->rounds); +} + +void AES_GCM_crypt_update_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz) +{ + word32 blocks; + word32 partial; + + /* Check if previous encrypted block was not used up. */ + if (aes->over > 0) { + byte pSz = AES_BLOCK_SIZE - aes->over; + if (pSz > sz) pSz = sz; + + /* Use some/all of last encrypted block. */ + xorbufout(out, AES_LASTBLOCK(aes) + aes->over, in, pSz); + aes->over = (aes->over + pSz) & (AES_BLOCK_SIZE - 1); + + /* Some data used. */ + sz -= pSz; + in += pSz; + out += pSz; + } + + /* Calculate the number of blocks needing to be encrypted and any leftover. + */ + blocks = sz / AES_BLOCK_SIZE; + partial = sz & (AES_BLOCK_SIZE - 1); + + /* Encrypt block by block. */ + while (blocks--) { + ALIGN32 byte scratch[AES_BLOCK_SIZE]; + IncrementGcmCounter(AES_COUNTER(aes)); + /* Encrypt counter into a buffer. */ + AES_encrypt_AARCH64(AES_COUNTER(aes), scratch, (byte*)aes->key, + (int)aes->rounds); + /* XOR plain text into encrypted counter into cipher text buffer. */ + xorbufout(out, scratch, in, AES_BLOCK_SIZE); + /* Data complete. */ + in += AES_BLOCK_SIZE; + out += AES_BLOCK_SIZE; + } + + if (partial != 0) { + /* Generate an extra block and use up as much as needed. */ + IncrementGcmCounter(AES_COUNTER(aes)); + /* Encrypt counter into cache. */ + AES_encrypt_AARCH64(AES_COUNTER(aes), AES_LASTBLOCK(aes), + (byte*)aes->key, (int)aes->rounds); + /* XOR plain text into encrypted counter into cipher text buffer. */ + xorbufout(out, AES_LASTBLOCK(aes), in, partial); + /* Keep amount of encrypted block used. */ + aes->over = partial; + } +} + +/* Calculates authentication tag for AES GCM. C implementation. + * + * @param [in, out] aes AES object. + * @param [out] authTag Buffer to store authentication tag in. + * @param [in] authTagSz Length of tag to create. + */ +void AES_GCM_final_AARCH64(Aes* aes, byte* authTag, word32 authTagSz) +{ + /* Calculate authentication tag. */ + GHASH_FINAL_AARCH64(aes, authTag, authTagSz); + /* XOR in as much of encrypted counter as is required. */ + xorbuf(authTag, AES_INITCTR(aes), authTagSz); +#ifdef OPENSSL_EXTRA + /* store AAD size for next call */ + aes->gcm.aadLen = aes->aSz; +#endif + /* Zeroize last block to protect sensitive data. 
*/ + ForceZero(AES_LASTBLOCK(aes), AES_BLOCK_SIZE); } +#endif /* WOLFSSL_AESGCM_STREAM */ #ifdef WOLFSSL_AES_128 /* internal function : see wc_AesGcmEncrypt */ -static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -1924,8 +2070,8 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -3543,14 +3689,11 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - - - return 0; } #endif /* WOLFSSL_AES_128 */ #ifdef WOLFSSL_AES_192 /* internal function : see wc_AesGcmEncrypt */ -static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -3570,8 +3713,8 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -5306,14 +5449,11 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - - - return 0; } #endif /* WOLFSSL_AES_192 */ #ifdef WOLFSSL_AES_256 /* internal function : see wc_AesGcmEncrypt */ -static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -5333,8 +5473,8 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -7200,9 +7340,6 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - - - return 0; } #endif /* WOLFSSL_AES_256 */ @@ -7227,41 +7364,29 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, * by Conrado P.L. 
Gouvea and Julio Lopez reduction on 256bit value using * Algorithm 5 */ -int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +void AES_GCM_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { - /* sanity checks */ - if ((aes == NULL) || (iv == NULL && ivSz > 0) || (authTag == NULL) || - ((authIn == NULL) && (authInSz > 0)) || (ivSz == 0)) { - WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); - return BAD_FUNC_ARG; - } - - if ((authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) || (authTagSz > AES_BLOCK_SIZE)) { - WOLFSSL_MSG("GcmEncrypt authTagSz error"); - return BAD_FUNC_ARG; - } - switch (aes->rounds) { #ifdef WOLFSSL_AES_128 case 10: - return Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + break; #endif #ifdef WOLFSSL_AES_192 case 12: - return Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + break; #endif #ifdef WOLFSSL_AES_256 case 14: - return Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + break; #endif - default: - WOLFSSL_MSG("AES-GCM invalid round number"); - return BAD_FUNC_ARG; } } @@ -7284,8 +7409,8 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -8935,8 +9060,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -10703,8 +10828,8 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -12587,38 +12712,30 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, * authIn: additional data buffer * authInSz: size of additional data buffer */ -int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, +int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { /* sanity checks */ - if ((aes == NULL) || (iv == NULL) || (authTag == NULL) || - (authTagSz > AES_BLOCK_SIZE) || (authTagSz == 0) || (ivSz == 0) || - ((sz != 0) && ((in == NULL) || (out == NULL)))) { - WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); - return BAD_FUNC_ARG; - } - switch (aes->rounds) { #ifdef WOLFSSL_AES_128 case 10: - return Aes128GcmDecrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + return Aes128GcmDecrypt(aes, 
out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); #endif #ifdef WOLFSSL_AES_192 case 12: - return Aes192GcmDecrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + return Aes192GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); #endif #ifdef WOLFSSL_AES_256 case 14: - return Aes256GcmDecrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + return Aes256GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); #endif - default: - WOLFSSL_MSG("AES-GCM invalid round number"); - return BAD_FUNC_ARG; } + + return BAD_FUNC_ARG; } #endif /* HAVE_AES_DECRYPT */ @@ -14179,6 +14296,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, while (blocks--) { IncrementGcmCounter(ctr); wc_AesEncrypt(aes, ctr, scratch); +#endif xorbuf(scratch, c, AES_BLOCK_SIZE); XMEMCPY(p, scratch, AES_BLOCK_SIZE); p += AES_BLOCK_SIZE; @@ -14201,10 +14319,9 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, #endif /* HAVE_AES_DECRYPT */ #endif /* HAVE_AESGCM */ -#endif /* aarch64 */ - #ifdef HAVE_AESGCM #ifdef WOLFSSL_AESGCM_STREAM +#ifndef __aarch64__ /* Access initialization counter data. */ #define AES_INITCTR(aes) ((aes)->streamData + 0 * AES_BLOCK_SIZE) /* Access counter data. */ @@ -14422,8 +14539,13 @@ static void AesGcmInit_C(Aes* aes, const byte* iv, word32 ivSz) word32 aadTemp = aes->gcm.aadLen; aes->gcm.aadLen = 0; #endif + #ifdef __aarch64__ + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + #else GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); GMULT(counter, aes->gcm.H); + #endif #ifdef OPENSSL_EXTRA aes->gcm.aadLen = aadTemp; #endif @@ -14432,7 +14554,8 @@ static void AesGcmInit_C(Aes* aes, const byte* iv, word32 ivSz) /* Copy in the counter for use with cipher. */ XMEMCPY(AES_COUNTER(aes), counter, AES_BLOCK_SIZE); /* Encrypt initial counter into a buffer for GCM. */ - wc_AesEncrypt(aes, counter, AES_INITCTR(aes)); + AES_encrypt_AARCH64(counter, AES_INITCTR(aes), (byte*)aes->key, + aes->rounds); /* Reset state fields. */ aes->over = 0; aes->aSz = 0; @@ -14480,7 +14603,8 @@ static void AesGcmCryptUpdate_C(Aes* aes, byte* out, const byte* in, word32 sz) ALIGN32 byte scratch[AES_BLOCK_SIZE]; IncrementGcmCounter(AES_COUNTER(aes)); /* Encrypt counter into a buffer. */ - wc_AesEncrypt(aes, AES_COUNTER(aes), scratch); + AES_encrypt_AARCH64(AES_COUNTER(aes), scratch, (byte*)aes->key, + aes->rounds); /* XOR plain text into encrypted counter into cipher text buffer. */ xorbufout(out, scratch, in, AES_BLOCK_SIZE); /* Data complete. */ @@ -14492,7 +14616,8 @@ static void AesGcmCryptUpdate_C(Aes* aes, byte* out, const byte* in, word32 sz) /* Generate an extra block and use up as much as needed. */ IncrementGcmCounter(AES_COUNTER(aes)); /* Encrypt counter into cache. */ - wc_AesEncrypt(aes, AES_COUNTER(aes), AES_LASTBLOCK(aes)); + AES_encrypt_AARCH64(AES_COUNTER(aes), AES_LASTBLOCK(aes), + (byte*)aes->key, (int)aes->rounds); /* XOR plain text into encrypted counter into cipher text buffer. */ xorbufout(out, AES_LASTBLOCK(aes), in, partial); /* Keep amount of encrypted block used. 
*/ @@ -14836,11 +14961,13 @@ int wc_AesGcmDecryptFinal(Aes* aes, const byte* authTag, word32 authTagSz) return ret; } #endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */ +#endif /* !__aarch64__ */ #endif /* WOLFSSL_AESGCM_STREAM */ #endif /* HAVE_AESGCM */ #ifdef HAVE_AESCCM +#ifndef __aarch64__ /* Software version of AES-CCM from wolfcrypt/src/aes.c * Gets some speed up from hardware acceleration of wc_AesEncrypt */ @@ -15110,11 +15237,30 @@ int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, return result; } #endif /* HAVE_AES_DECRYPT */ +#endif /* !__aarch64__ */ #endif /* HAVE_AESCCM */ #ifdef HAVE_AESGCM /* common GCM functions 32 and 64 bit */ +#if defined(__aarch64__) +void AES_GCM_set_key_AARCH64(Aes* aes, byte* iv) +{ + + AES_encrypt_AARCH64(iv, aes->gcm.H, (byte*)aes->key, aes->rounds); + { + word32* pt = (word32*)aes->gcm.H; + __asm__ volatile ( + "LD1 {v0.16b}, [%[h]] \n" + "RBIT v0.16b, v0.16b \n" + "ST1 {v0.16b}, [%[out]] \n" + : [out] "=r" (pt) + : [h] "0" (pt) + : "cc", "memory", "v0" + ); + } +} +#else int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) { int ret; @@ -15132,19 +15278,6 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) #endif wc_AesEncrypt(aes, iv, aes->gcm.H); - #if defined(__aarch64__) - { - word32* pt = (word32*)aes->gcm.H; - __asm__ volatile ( - "LD1 {v0.16b}, [%[h]] \n" - "RBIT v0.16b, v0.16b \n" - "ST1 {v0.16b}, [%[out]] \n" - : [out] "=r" (pt) - : [h] "0" (pt) - : "cc", "memory", "v0" - ); - } - #else { word32* pt = (word32*)aes->gcm.H; __asm__ volatile ( @@ -15157,14 +15290,15 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) : "cc", "memory", "q0" ); } - #endif } return ret; } +#endif #endif /* HAVE_AESGCM */ +#ifndef __aarch64__ /* AES-DIRECT */ #if defined(WOLFSSL_AES_DIRECT) /* Allow direct access to one block encrypt */ @@ -15188,6 +15322,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) } #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_DIRECT */ +#endif /* !__aarch64__ */ #ifdef WOLFSSL_AES_XTS @@ -15371,26 +15506,12 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) * * returns 0 on success */ -int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, - const byte* i, word32 iSz) +void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, + const byte* i) { - int ret = 0; word32 blocks = (sz / AES_BLOCK_SIZE); byte tmp[AES_BLOCK_SIZE]; - if (xaes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - - if (iSz < AES_BLOCK_SIZE) { - return BAD_FUNC_ARG; - } - - if (blocks == 0) { - WOLFSSL_MSG("Plain text input too small for encryption"); - return BAD_FUNC_ARG; - } - __asm__ __volatile__ ( "MOV x19, 0x87 \n" @@ -15691,8 +15812,6 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" ); - - return ret; } /* Same process as encryption but Aes key is AES_DECRYPTION type. 
@@ -15707,27 +15826,13 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, * * returns 0 on success */ -int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, - const byte* i, word32 iSz) +void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, + const byte* i) { - int ret = 0; word32 blocks = (sz / AES_BLOCK_SIZE); byte tmp[AES_BLOCK_SIZE]; byte stl = (sz % AES_BLOCK_SIZE); - if (xaes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - - if (iSz < AES_BLOCK_SIZE) { - return BAD_FUNC_ARG; - } - - if (blocks == 0) { - WOLFSSL_MSG("Plain text input too small for encryption"); - return BAD_FUNC_ARG; - } - /* if Stealing then break out of loop one block early to handle special * case */ blocks -= (stl > 0); @@ -16039,8 +16144,6 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" ); - - return ret; } #else @@ -16556,6 +16659,7 @@ extern void GCM_gmult_len(byte* x, /* const */ byte m[32][AES_BLOCK_SIZE], extern void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +#ifndef __aarch64__ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, const byte* iv, int dir) { @@ -17144,9 +17248,22 @@ static WC_INLINE void RIGHTSHIFTX(byte* x) } #if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) + +#if defined(__aarch64__) && !defined(BIG_ENDIAN_ORDER) +static WC_INLINE void Shift4_M0(byte *r8, byte *z8) +{ + int i; + for (i = 15; i > 0; i--) + r8[i] = (byte)(z8[i-1] << 4) | (byte)(z8[i] >> 4); + r8[0] = (byte)(z8[0] >> 4); +} +#endif + void GenerateM0(Gcm* gcm) { +#if !defined(__aarch64__) || !defined(BIG_ENDIAN_ORDER) int i; +#endif byte (*m)[AES_BLOCK_SIZE] = gcm->M0; /* 0 times -> 0x0 */ @@ -17191,6 +17308,7 @@ void GenerateM0(Gcm* gcm) XMEMCPY(m[0xf], m[0x8], AES_BLOCK_SIZE); xorbuf (m[0xf], m[0x7], AES_BLOCK_SIZE); +#ifndef __aarch64__ for (i = 0; i < 16; i++) { word32* m32 = (word32*)gcm->M0[i]; m32[0] = ByteReverseWord32(m32[0]); @@ -17198,6 +17316,11 @@ void GenerateM0(Gcm* gcm) m32[2] = ByteReverseWord32(m32[2]); m32[3] = ByteReverseWord32(m32[3]); } +#elif !defined(BIG_ENDIAN_ORDER) + for (i = 0; i < 16; i++) { + Shift4_M0(m[16+i], m[i]); + } +#endif } #endif /* GCM_TABLE */ @@ -17235,6 +17358,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) return ret; } +#ifndef __aarch64__ static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) { int i; @@ -17245,6 +17369,7 @@ static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) return; } } +#endif static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz) { @@ -17561,6 +17686,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, return 0; } #endif /* HAVE_AESGCM */ +#endif /* !__aarch64__ */ #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #endif /* !NO_AES && WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index dabe7af9c3..9d5dc25609 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -1407,7 +1407,214 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash) return ret; } -#else /* */ +#elif defined(__aarch64__) + + static const FLASH_QUALIFIER ALIGN32 word32 K[64] = { + 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, + 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, + 
0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, + 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, + 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L, + 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L, + 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, + 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, + 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L, + 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L, + 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, + 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, + 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L + }; + +/* Both versions of Ch and Maj are logically the same, but with the second set + the compilers can recognize them better for optimization */ +#ifdef WOLFSSL_SHA256_BY_SPEC + /* SHA256 math based on specification */ + #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) + #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y))) +#else + /* SHA256 math reworked for easier compiler optimization */ + #define Ch(x,y,z) ((((y) ^ (z)) & (x)) ^ (z)) + #define Maj(x,y,z) ((((x) ^ (y)) & ((y) ^ (z))) ^ (y)) +#endif + #define R(x, n) (((x) & 0xFFFFFFFFU) >> (n)) + + #define S(x, n) rotrFixed(x, n) + #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) + #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) + #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) + #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) + + #define a(i) S[(0-(i)) & 7] + #define b(i) S[(1-(i)) & 7] + #define c(i) S[(2-(i)) & 7] + #define d(i) S[(3-(i)) & 7] + #define e(i) S[(4-(i)) & 7] + #define f(i) S[(5-(i)) & 7] + #define g(i) S[(6-(i)) & 7] + #define h(i) S[(7-(i)) & 7] + + #ifndef XTRANSFORM + #define XTRANSFORM(S, D) Transform_Sha256((S),(D)) + #endif + +#ifndef SHA256_MANY_REGISTERS + #define RND(j) \ + t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+(j)] + \ + W[i+(j)]; \ + t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \ + d(j) += t0; \ + h(j) = t0 + t1 + + static void Transform_Sha256(wc_Sha256* sha256, const byte* data) + { + word32 S[8], t0, t1; + int i; + + #ifdef WOLFSSL_SMALL_STACK_CACHE + word32* W = sha256->W; + if (W == NULL) { + W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL, + DYNAMIC_TYPE_DIGEST); + if (W == NULL) + return MEMORY_E; + sha256->W = W; + } + #elif defined(WOLFSSL_SMALL_STACK) + word32* W; + W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (W == NULL) + return MEMORY_E; + #else + word32 W[WC_SHA256_BLOCK_SIZE]; + #endif + + /* Copy context->state[] to working vars */ + for (i = 0; i < 8; i++) + S[i] = sha256->digest[i]; + + for (i = 0; i < 16; i++) + W[i] = *((const word32*)&data[i*(int)sizeof(word32)]); + + for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++) + W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; + + #ifdef USE_SLOW_SHA256 + /* not unrolled - ~2k smaller and ~25% slower */ + for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) { + int j; + for (j = 0; j < 8; j++) { /* braces needed here for macros {} */ + RND(j); + } + } + #else + /* partially loop unrolled */ + for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) { + RND(0); RND(1); RND(2); RND(3); + RND(4); RND(5); RND(6); RND(7); + } + #endif /* USE_SLOW_SHA256 */ + + /* Add the working vars back into digest state[] */ + for (i = 0; i < 8; i++) { + sha256->digest[i] += S[i]; + } + + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE) + ForceZero(W, 
sizeof(word32) * WC_SHA256_BLOCK_SIZE); + XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + } +#else + /* SHA256 version that keeps all data in registers */ + #define SCHED1(j) (W[j] = *((word32*)&data[j*sizeof(word32)])) + #define SCHED(j) ( \ + W[ j & 15] += \ + Gamma1(W[(j-2) & 15])+ \ + W[(j-7) & 15] + \ + Gamma0(W[(j-15) & 15]) \ + ) + + #define RND1(j) \ + t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED1(j); \ + t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \ + d(j) += t0; \ + h(j) = t0 + t1 + #define RNDN(j) \ + t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED(j); \ + t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \ + d(j) += t0; \ + h(j) = t0 + t1 + + static void Transform_Sha256(wc_Sha256* sha256, const byte* data) + { + word32 S[8], t0, t1; + int i; + #ifdef USE_SLOW_SHA256 + int j; + #endif + word32 W[WC_SHA256_BLOCK_SIZE/sizeof(word32)]; + + /* Copy digest to working vars */ + S[0] = sha256->digest[0]; + S[1] = sha256->digest[1]; + S[2] = sha256->digest[2]; + S[3] = sha256->digest[3]; + S[4] = sha256->digest[4]; + S[5] = sha256->digest[5]; + S[6] = sha256->digest[6]; + S[7] = sha256->digest[7]; + + i = 0; + #ifdef USE_SLOW_SHA256 + for (j = 0; j < 16; j++) { + RND1(j); + } + for (i = 16; i < 64; i += 16) { + for (j = 0; j < 16; j++) { + RNDN(j); + } + } + #else + RND1( 0); RND1( 1); RND1( 2); RND1( 3); + RND1( 4); RND1( 5); RND1( 6); RND1( 7); + RND1( 8); RND1( 9); RND1(10); RND1(11); + RND1(12); RND1(13); RND1(14); RND1(15); + /* 64 operations, partially loop unrolled */ + for (i = 16; i < 64; i += 16) { + RNDN( 0); RNDN( 1); RNDN( 2); RNDN( 3); + RNDN( 4); RNDN( 5); RNDN( 6); RNDN( 7); + RNDN( 8); RNDN( 9); RNDN(10); RNDN(11); + RNDN(12); RNDN(13); RNDN(14); RNDN(15); + } + #endif + + /* Add the working vars back into digest */ + sha256->digest[0] += S[0]; + sha256->digest[1] += S[1]; + sha256->digest[2] += S[2]; + sha256->digest[3] += S[3]; + sha256->digest[4] += S[4]; + sha256->digest[5] += S[5]; + sha256->digest[6] += S[6]; + sha256->digest[7] += S[7]; + } +#endif /* SHA256_MANY_REGISTERS */ + +static void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, + word32 len) +{ + while (len > 0) { + byte tmp[WC_SHA256_BLOCK_SIZE]; + ByteReverseWords((word32*)tmp, (const word32*)data, + WC_SHA256_BLOCK_SIZE); + Transform_Sha256(sha256, tmp); + data += WC_SHA256_BLOCK_SIZE; + len -= WC_SHA256_BLOCK_SIZE; + } +} + +#else extern void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h index 61a3433ea9..ab2159abf3 100644 --- a/wolfssl/wolfcrypt/aes.h +++ b/wolfssl/wolfcrypt/aes.h @@ -61,7 +61,7 @@ typedef struct Gcm { #endif WOLFSSL_LOCAL void GenerateM0(Gcm* gcm); -#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) WOLFSSL_LOCAL void GMULT(byte* X, byte* Y); #endif WOLFSSL_LOCAL void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, @@ -304,6 +304,13 @@ struct Aes { #ifdef WOLFSSL_AESNI byte use_aesni; #endif /* WOLFSSL_AESNI */ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + byte use_aes_hw_crypto; +#ifdef HAVE_AESGCM + byte use_pmull_hw_crypto; +#endif +#endif /* __aarch64__ && WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #ifdef WOLF_CRYPTO_CB int devId; void* devCtx; @@ -832,6 +839,59 @@ WOLFSSL_API int wc_AesEaxFree(AesEax* eax); #endif /* WOLFSSL_AES_EAX */ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +/* GHASH one block 
of data. + * + * XOR block into tag and GMULT with H. + * + * @param [in, out] aes AES GCM object. + * @param [in] block Block of AAD or cipher text. + */ +#define GHASH_ONE_BLOCK(aes, block) \ + do { \ + xorbuf(AES_TAG(aes), block, AES_BLOCK_SIZE); \ + GMULT_AARCH64(AES_TAG(aes), aes->gcm.H); \ + } \ + while (0) + +WOLFSSL_LOCAL int AES_set_key_AARCH64(const unsigned char *userKey, + const int keylen, Aes* aes, int dir); +WOLFSSL_LOCAL void AES_encrypt_AARCH64(const byte* inBlock, byte* outBlock, + byte* key, int nr); +WOLFSSL_LOCAL void AES_decrypt_AARCH64(const byte* inBlock, byte* outBlock, + byte* key, int nr); +WOLFSSL_LOCAL void AES_CBC_encrypt_AARCH64(const byte* in, byte* out, word32 sz, + byte* reg, byte* key, int rounds); +WOLFSSL_LOCAL void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, + byte* reg, byte* key, int rounds); +WOLFSSL_LOCAL void AES_CTR_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz); +WOLFSSL_LOCAL void GMULT_AARCH64(byte* X, byte* Y); +#ifdef WOLFSSL_AESGCM_STREAM +WOLFSSL_LOCAL void GHASH_UPDATE_AARCH64(Aes* aes, const byte* a, word32 aSz, + const byte* c, word32 cSz); +WOLFSSL_LOCAL void AES_GCM_init_AARCH64(Aes* aes, const byte* iv, word32 ivSz); +WOLFSSL_LOCAL void AES_GCM_crypt_update_AARCH64(Aes* aes, byte* out, + const byte* in, word32 sz); +WOLFSSL_LOCAL void AES_GCM_final_AARCH64(Aes* aes, byte* authTag, + word32 authTagSz); +#endif +WOLFSSL_LOCAL void AES_GCM_set_key_AARCH64(Aes* aes, byte* iv); +WOLFSSL_LOCAL void AES_GCM_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz); +WOLFSSL_LOCAL int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, const byte* authTag, + word32 authTagSz, const byte* authIn, word32 authInSz); + +#ifdef WOLFSSL_AES_XTS +WOLFSSL_LOCAL void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, + const byte* in, word32 sz, const byte* i); +WOLFSSL_LOCAL void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, + const byte* in, word32 sz, const byte* i); +#endif /* WOLFSSL_AES_XTS */ +#endif /* __aarch64__ && WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #ifdef __cplusplus } /* extern "C" */ diff --git a/wolfssl/wolfcrypt/cpuid.h b/wolfssl/wolfcrypt/cpuid.h index c91b628b5b..b7a5714798 100644 --- a/wolfssl/wolfcrypt/cpuid.h +++ b/wolfssl/wolfcrypt/cpuid.h @@ -38,6 +38,11 @@ #define HAVE_CPUID #define HAVE_CPUID_INTEL #endif +#if (defined(WOLFSSL_AARCH64_BUILD) || (defined(__aarch64__) && \ + defined(WOLFSSL_ARMASM))) && !defined(WOLFSSL_NO_ASM) + #define HAVE_CPUID + #define HAVE_CPUID_AARCH64 +#endif #ifdef HAVE_CPUID_INTEL @@ -63,6 +68,26 @@ #define IS_INTEL_BMI1(f) ((f) & CPUID_BMI1) #define IS_INTEL_SHA(f) ((f) & CPUID_SHA) +#elif defined(HAVE_CPUID_AARCH64) + + #define CPUID_AES 0x0001 + #define CPUID_PMULL 0x0002 + #define CPUID_SHA256 0x0004 + #define CPUID_SHA512 0x0008 + #define CPUID_RDM 0x0010 + #define CPUID_SHA3 0x0020 + #define CPUID_SM3 0x0040 + #define CPUID_SM4 0x0080 + + #define IS_AARCH64_AES(f) ((f) & CPUID_AES) + #define IS_AARCH64_PMULL(f) ((f) & CPUID_PMULL) + #define IS_AARCH64_SHA256(f) ((f) & CPUID_SHA256) + #define IS_AARCH64_SHA512(f) ((f) & CPUID_SHA512) + #define IS_AARCH64_RDM(f) ((f) & CPUID_RDM) + #define IS_AARCH64_SHA3(f) ((f) & CPUID_SHA3) + #define IS_AARCH64_SM3(f) ((f) & CPUID_SM3) + #define IS_AARCH64_SM4(f) ((f) & CPUID_SM4) + #endif #ifdef HAVE_CPUID
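
The CPUID_* bits and IS_AARCH64_*() macros added above are what the AArch64 paths use to decide at run time whether the crypto extensions may be used. As a minimal sketch (not taken from this patch; the helper name is hypothetical), the new use_aes_hw_crypto/use_pmull_hw_crypto fields in struct Aes could be populated from cpuid_get_flags() along these lines:

#include <wolfssl/wolfcrypt/aes.h>
#include <wolfssl/wolfcrypt/cpuid.h>

/* Illustrative sketch only -- not part of the patch. Caches the Aarch64
 * crypto-extension availability reported by cpuid_get_flags() in the new
 * Aes context fields so later calls can branch without re-querying CPUID.
 * The helper name is hypothetical. */
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \
    !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
static void aes_cache_hw_crypto_flags(Aes* aes)
{
    word32 flags = cpuid_get_flags();

    /* AESE/AESD/AESMC/AESIMC instructions available? */
    aes->use_aes_hw_crypto = IS_AARCH64_AES(flags) ? 1 : 0;
#ifdef HAVE_AESGCM
    /* PMULL/PMULL2 is needed for the hardware GHASH (GMULT_AARCH64). */
    aes->use_pmull_hw_crypto = IS_AARCH64_PMULL(flags) ? 1 : 0;
#endif
}
#endif

Presumably AES-GCM takes the AES_GCM_*_AARCH64 route only when both flags are set, since the hardware GHASH relies on PMULL, and otherwise falls back to the software path also used when WOLFSSL_ARMASM_NO_HW_CRYPTO is defined.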